diff options
Diffstat (limited to '')
47 files changed, 33007 insertions, 0 deletions
diff --git a/fs/jfs/Kconfig b/fs/jfs/Kconfig new file mode 100644 index 000000000..851de78fd --- /dev/null +++ b/fs/jfs/Kconfig @@ -0,0 +1,47 @@ +config JFS_FS + tristate "JFS filesystem support" + select NLS + select CRC32 + help + This is a port of IBM's Journaled Filesystem . More information is + available in the file <file:Documentation/filesystems/jfs.txt>. + + If you do not intend to use the JFS filesystem, say N. + +config JFS_POSIX_ACL + bool "JFS POSIX Access Control Lists" + depends on JFS_FS + select FS_POSIX_ACL + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + If you don't know what Access Control Lists are, say N + +config JFS_SECURITY + bool "JFS Security Labels" + depends on JFS_FS + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the jfs filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + +config JFS_DEBUG + bool "JFS debugging" + depends on JFS_FS + help + If you are experiencing any problems with the JFS filesystem, say + Y here. This will result in additional debugging messages to be + written to the system log. Under normal circumstances, this + results in very little overhead. + +config JFS_STATISTICS + bool "JFS statistics" + depends on JFS_FS + help + Enabling this option will cause statistics from the JFS file system + to be made available to the user in the /proc/fs/jfs/ directory. diff --git a/fs/jfs/Makefile b/fs/jfs/Makefile new file mode 100644 index 000000000..285ec189e --- /dev/null +++ b/fs/jfs/Makefile @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the Linux JFS filesystem routines. +# + +obj-$(CONFIG_JFS_FS) += jfs.o + +jfs-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ + jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \ + jfs_unicode.o jfs_dtree.o jfs_inode.o jfs_discard.o \ + jfs_extent.o symlink.o jfs_metapage.o \ + jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o \ + resize.o xattr.o ioctl.o + +jfs-$(CONFIG_JFS_POSIX_ACL) += acl.o + +ccflags-y := -D_JFS_4K diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c new file mode 100644 index 000000000..2e71b6e7e --- /dev/null +++ b/fs/jfs/acl.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) International Business Machines Corp., 2002-2004 + * Copyright (C) Andreas Gruenbacher, 2001 + * Copyright (C) Linus Torvalds, 1991, 1992 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/posix_acl_xattr.h> +#include "jfs_incore.h" +#include "jfs_txnmgr.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" + +struct posix_acl *jfs_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *ea_name; + int size; + char *value = NULL; + + switch(type) { + case ACL_TYPE_ACCESS: + ea_name = XATTR_NAME_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + size = __jfs_getxattr(inode, ea_name, NULL, 0); + + if (size > 0) { + value = kmalloc(size, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + size = __jfs_getxattr(inode, ea_name, value, size); + } + + if (size < 0) { + if (size == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(size); + } else { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + } + kfree(value); + return acl; +} + +static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, + struct posix_acl *acl) +{ + char *ea_name; + int rc; + int size = 0; + char *value = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + ea_name = XATTR_NAME_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_KERNEL); + if (!value) + return -ENOMEM; + rc = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (rc < 0) + goto out; + } + rc = __jfs_setxattr(tid, inode, ea_name, value, size, 0); +out: + kfree(value); + + if (!rc) + set_cached_acl(inode, type, acl); + + return rc; +} + +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int rc; + tid_t tid; + int update_mode = 0; + umode_t mode = inode->i_mode; + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&JFS_IP(inode)->commit_mutex); + if (type == ACL_TYPE_ACCESS && acl) { + rc = posix_acl_update_mode(inode, &mode, &acl); + if (rc) + goto end_tx; + update_mode = 1; + } + rc = __jfs_set_acl(tid, inode, type, acl); + if (!rc) { + if (update_mode) { + inode->i_mode = mode; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + } + rc = txCommit(tid, 1, &inode, 0); + } +end_tx: + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + return rc; +} + +int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int rc = 0; + + rc = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (rc) + return rc; + + if (default_acl) { + rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl); + posix_acl_release(default_acl); + } + + if (acl) { + if (!rc) + rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl); + posix_acl_release(acl); + } + + JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) | + inode->i_mode; + + return rc; +} diff --git a/fs/jfs/file.c b/fs/jfs/file.c new file mode 100644 index 000000000..36665fd37 --- /dev/null +++ b/fs/jfs/file.c @@ -0,0 +1,165 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/posix_acl.h> +#include <linux/quotaops.h> +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_dmap.h" +#include "jfs_txnmgr.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" +#include "jfs_debug.h" + +int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = file->f_mapping->host; + int rc = 0; + + rc = file_write_and_wait_range(file, start, end); + if (rc) + return rc; + + inode_lock(inode); + if (!(inode->i_state & I_DIRTY_ALL) || + (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { + /* Make sure committed changes hit the disk */ + jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); + inode_unlock(inode); + return rc; + } + + rc |= jfs_commit_inode(inode, 1); + inode_unlock(inode); + + return rc ? -EIO : 0; +} + +static int jfs_open(struct inode *inode, struct file *file) +{ + int rc; + + if ((rc = dquot_file_open(inode, file))) + return rc; + + /* + * We attempt to allow only one "active" file open per aggregate + * group. Otherwise, appending to files in parallel can cause + * fragmentation within the files. + * + * If the file is empty, it was probably just created and going + * to be written to. If it has a size, we'll hold off until the + * file is actually grown. + */ + if (S_ISREG(inode->i_mode) && file->f_mode & FMODE_WRITE && + (inode->i_size == 0)) { + struct jfs_inode_info *ji = JFS_IP(inode); + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag == -1) { + struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); + ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); + atomic_inc(&jfs_sb->bmap->db_active[ji->active_ag]); + } + spin_unlock_irq(&ji->ag_lock); + } + + return 0; +} +static int jfs_release(struct inode *inode, struct file *file) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag != -1) { + struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; + atomic_dec(&bmap->db_active[ji->active_ag]); + ji->active_ag = -1; + } + spin_unlock_irq(&ji->ag_lock); + + return 0; +} + +int jfs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + struct inode *inode = d_inode(dentry); + int rc; + + rc = setattr_prepare(dentry, iattr); + if (rc) + return rc; + + if (is_quota_modification(inode, iattr)) { + rc = dquot_initialize(inode); + if (rc) + return rc; + } + if ((iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)) || + (iattr->ia_valid & ATTR_GID && !gid_eq(iattr->ia_gid, inode->i_gid))) { + rc = dquot_transfer(inode, iattr); + if (rc) + return rc; + } + + if ((iattr->ia_valid & ATTR_SIZE) && + iattr->ia_size != i_size_read(inode)) { + inode_dio_wait(inode); + + rc = inode_newsize_ok(inode, iattr->ia_size); + if (rc) + return rc; + + truncate_setsize(inode, iattr->ia_size); + jfs_truncate(inode); + } + + setattr_copy(inode, iattr); + mark_inode_dirty(inode); + + if (iattr->ia_valid & ATTR_MODE) + rc = posix_acl_chmod(inode, inode->i_mode); + return rc; +} + +const struct inode_operations jfs_file_inode_operations = { + .listxattr = jfs_listxattr, + .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL + .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, +#endif +}; + +const struct file_operations jfs_file_operations = { + .open = jfs_open, + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .write_iter = generic_file_write_iter, + .mmap = generic_file_mmap, + .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, + .fsync = jfs_fsync, + .release = jfs_release, + .unlocked_ioctl = jfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = jfs_compat_ioctl, +#endif +}; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c new file mode 100644 index 000000000..68779cc36 --- /dev/null +++ b/fs/jfs/inode.c @@ -0,0 +1,426 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/mpage.h> +#include <linux/buffer_head.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/uio.h> +#include <linux/writeback.h> +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_imap.h" +#include "jfs_extent.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + + +struct inode *jfs_iget(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; + int ret; + + inode = iget_locked(sb, ino); + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + ret = diRead(inode); + if (ret < 0) { + iget_failed(inode); + return ERR_PTR(ret); + } + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &jfs_file_inode_operations; + inode->i_fop = &jfs_file_operations; + inode->i_mapping->a_ops = &jfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &jfs_dir_inode_operations; + inode->i_fop = &jfs_dir_operations; + } else if (S_ISLNK(inode->i_mode)) { + if (inode->i_size >= IDATASIZE) { + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &jfs_aops; + } else { + inode->i_op = &jfs_fast_symlink_inode_operations; + inode->i_link = JFS_IP(inode)->i_inline; + /* + * The inline data should be null-terminated, but + * don't let on-disk corruption crash the kernel + */ + inode->i_link[inode->i_size] = '\0'; + } + } else { + inode->i_op = &jfs_file_inode_operations; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } + unlock_new_inode(inode); + return inode; +} + +/* + * Workhorse of both fsync & write_inode + */ +int jfs_commit_inode(struct inode *inode, int wait) +{ + int rc = 0; + tid_t tid; + static int noisy = 5; + + jfs_info("In jfs_commit_inode, inode = 0x%p", inode); + + /* + * Don't commit if inode has been committed since last being + * marked dirty, or if it has been deleted. + */ + if (inode->i_nlink == 0 || !test_cflag(COMMIT_Dirty, inode)) + return 0; + + if (isReadOnly(inode)) { + /* kernel allows writes to devices on read-only + * partitions and may think inode is dirty + */ + if (!special_file(inode->i_mode) && noisy) { + jfs_err("jfs_commit_inode(0x%p) called on read-only volume", + inode); + jfs_err("Is remount racy?"); + noisy--; + } + return 0; + } + + tid = txBegin(inode->i_sb, COMMIT_INODE); + mutex_lock(&JFS_IP(inode)->commit_mutex); + + /* + * Retest inode state after taking commit_mutex + */ + if (inode->i_nlink && test_cflag(COMMIT_Dirty, inode)) + rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0); + + txEnd(tid); + mutex_unlock(&JFS_IP(inode)->commit_mutex); + return rc; +} + +int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + int wait = wbc->sync_mode == WB_SYNC_ALL; + + if (inode->i_nlink == 0) + return 0; + /* + * If COMMIT_DIRTY is not set, the inode isn't really dirty. + * It has been committed since the last change, but was still + * on the dirty inode list. + */ + if (!test_cflag(COMMIT_Dirty, inode)) { + /* Make sure committed changes hit the disk */ + jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait); + return 0; + } + + if (jfs_commit_inode(inode, wait)) { + jfs_err("jfs_write_inode: jfs_commit_inode failed!"); + return -EIO; + } else + return 0; +} + +void jfs_evict_inode(struct inode *inode) +{ + jfs_info("In jfs_evict_inode, inode = 0x%p", inode); + + if (!inode->i_nlink && !is_bad_inode(inode)) { + dquot_initialize(inode); + + if (JFS_IP(inode)->fileset == FILESYSTEM_I) { + struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap; + truncate_inode_pages_final(&inode->i_data); + + if (test_cflag(COMMIT_Freewmap, inode)) + jfs_free_zero_link(inode); + + if (ipimap && JFS_IP(ipimap)->i_imap) + diFree(inode); + + /* + * Free the inode from the quota allocation. + */ + dquot_initialize(inode); + dquot_free_inode(inode); + } + } else { + truncate_inode_pages_final(&inode->i_data); + } + clear_inode(inode); + dquot_drop(inode); +} + +void jfs_dirty_inode(struct inode *inode, int flags) +{ + static int noisy = 5; + + if (isReadOnly(inode)) { + if (!special_file(inode->i_mode) && noisy) { + /* kernel allows writes to devices on read-only + * partitions and may try to mark inode dirty + */ + jfs_err("jfs_dirty_inode called on read-only volume"); + jfs_err("Is remount racy?"); + noisy--; + } + return; + } + + set_cflag(COMMIT_Dirty, inode); +} + +int jfs_get_block(struct inode *ip, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + s64 lblock64 = lblock; + int rc = 0; + xad_t xad; + s64 xaddr; + int xflag; + s32 xlen = bh_result->b_size >> ip->i_blkbits; + + /* + * Take appropriate lock on inode + */ + if (create) + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + else + IREAD_LOCK(ip, RDWRLOCK_NORMAL); + + if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) && + (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) && + xaddr) { + if (xflag & XAD_NOTRECORDED) { + if (!create) + /* + * Allocated but not recorded, read treats + * this as a hole + */ + goto unlock; +#ifdef _JFS_4K + XADoffset(&xad, lblock64); + XADlength(&xad, xlen); + XADaddress(&xad, xaddr); +#else /* _JFS_4K */ + /* + * As long as block size = 4K, this isn't a problem. + * We should mark the whole page not ABNR, but how + * will we know to mark the other blocks BH_New? + */ + BUG(); +#endif /* _JFS_4K */ + rc = extRecord(ip, &xad); + if (rc) + goto unlock; + set_buffer_new(bh_result); + } + + map_bh(bh_result, ip->i_sb, xaddr); + bh_result->b_size = xlen << ip->i_blkbits; + goto unlock; + } + if (!create) + goto unlock; + + /* + * Allocate a new block + */ +#ifdef _JFS_4K + if ((rc = extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad))) + goto unlock; + rc = extAlloc(ip, xlen, lblock64, &xad, false); + if (rc) + goto unlock; + + set_buffer_new(bh_result); + map_bh(bh_result, ip->i_sb, addressXAD(&xad)); + bh_result->b_size = lengthXAD(&xad) << ip->i_blkbits; + +#else /* _JFS_4K */ + /* + * We need to do whatever it takes to keep all but the last buffers + * in 4K pages - see jfs_write.c + */ + BUG(); +#endif /* _JFS_4K */ + + unlock: + /* + * Release lock on inode + */ + if (create) + IWRITE_UNLOCK(ip); + else + IREAD_UNLOCK(ip); + return rc; +} + +static int jfs_writepage(struct page *page, struct writeback_control *wbc) +{ + return block_write_full_page(page, jfs_get_block, wbc); +} + +static int jfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_writepages(mapping, wbc, jfs_get_block); +} + +static int jfs_readpage(struct file *file, struct page *page) +{ + return mpage_readpage(page, jfs_get_block); +} + +static int jfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, jfs_get_block); +} + +static void jfs_write_failed(struct address_space *mapping, loff_t to) +{ + struct inode *inode = mapping->host; + + if (to > inode->i_size) { + truncate_pagecache(inode, inode->i_size); + jfs_truncate(inode); + } +} + +static int jfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + + ret = nobh_write_begin(mapping, pos, len, flags, pagep, fsdata, + jfs_get_block); + if (unlikely(ret)) + jfs_write_failed(mapping, pos + len); + + return ret; +} + +static sector_t jfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, jfs_get_block); +} + +static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = file->f_mapping->host; + size_t count = iov_iter_count(iter); + ssize_t ret; + + ret = blockdev_direct_IO(iocb, inode, iter, jfs_get_block); + + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again. + */ + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { + loff_t isize = i_size_read(inode); + loff_t end = iocb->ki_pos + count; + + if (end > isize) + jfs_write_failed(mapping, end); + } + + return ret; +} + +const struct address_space_operations jfs_aops = { + .readpage = jfs_readpage, + .readpages = jfs_readpages, + .writepage = jfs_writepage, + .writepages = jfs_writepages, + .write_begin = jfs_write_begin, + .write_end = nobh_write_end, + .bmap = jfs_bmap, + .direct_IO = jfs_direct_IO, +}; + +/* + * Guts of jfs_truncate. Called with locks already held. Can be called + * with directory for truncating directory index table. + */ +void jfs_truncate_nolock(struct inode *ip, loff_t length) +{ + loff_t newsize; + tid_t tid; + + ASSERT(length >= 0); + + if (test_cflag(COMMIT_Nolink, ip)) { + xtTruncate(0, ip, length, COMMIT_WMAP); + return; + } + + do { + tid = txBegin(ip->i_sb, 0); + + /* + * The commit_mutex cannot be taken before txBegin. + * txBegin may block and there is a chance the inode + * could be marked dirty and need to be committed + * before txBegin unblocks + */ + mutex_lock(&JFS_IP(ip)->commit_mutex); + + newsize = xtTruncate(tid, ip, length, + COMMIT_TRUNCATE | COMMIT_PWMAP); + if (newsize < 0) { + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + break; + } + + ip->i_mtime = ip->i_ctime = current_time(ip); + mark_inode_dirty(ip); + + txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + } while (newsize > length); /* Truncate isn't always atomic */ +} + +void jfs_truncate(struct inode *ip) +{ + jfs_info("jfs_truncate: size = 0x%lx", (ulong) ip->i_size); + + nobh_truncate_page(ip->i_mapping, ip->i_size, jfs_get_block); + + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + jfs_truncate_nolock(ip, ip->i_size); + IWRITE_UNLOCK(ip); +} diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c new file mode 100644 index 000000000..ba34dae8b --- /dev/null +++ b/fs/jfs/ioctl.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/fs/jfs/ioctl.c + * + * Copyright (C) 2006 Herbert Poetzl + * adapted from Remy Card's ext2/ioctl.c + */ + +#include <linux/fs.h> +#include <linux/ctype.h> +#include <linux/capability.h> +#include <linux/mount.h> +#include <linux/time.h> +#include <linux/sched.h> +#include <linux/blkdev.h> +#include <asm/current.h> +#include <linux/uaccess.h> + +#include "jfs_filsys.h" +#include "jfs_debug.h" +#include "jfs_incore.h" +#include "jfs_dinode.h" +#include "jfs_inode.h" +#include "jfs_dmap.h" +#include "jfs_discard.h" + +static struct { + long jfs_flag; + long ext2_flag; +} jfs_map[] = { + {JFS_NOATIME_FL, FS_NOATIME_FL}, + {JFS_DIRSYNC_FL, FS_DIRSYNC_FL}, + {JFS_SYNC_FL, FS_SYNC_FL}, + {JFS_SECRM_FL, FS_SECRM_FL}, + {JFS_UNRM_FL, FS_UNRM_FL}, + {JFS_APPEND_FL, FS_APPEND_FL}, + {JFS_IMMUTABLE_FL, FS_IMMUTABLE_FL}, + {0, 0}, +}; + +static long jfs_map_ext2(unsigned long flags, int from) +{ + int index=0; + long mapped=0; + + while (jfs_map[index].jfs_flag) { + if (from) { + if (jfs_map[index].ext2_flag & flags) + mapped |= jfs_map[index].jfs_flag; + } else { + if (jfs_map[index].jfs_flag & flags) + mapped |= jfs_map[index].ext2_flag; + } + index++; + } + return mapped; +} + + +long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct jfs_inode_info *jfs_inode = JFS_IP(inode); + unsigned int flags; + + switch (cmd) { + case JFS_IOC_GETFLAGS: + flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; + flags = jfs_map_ext2(flags, 0); + return put_user(flags, (int __user *) arg); + case JFS_IOC_SETFLAGS: { + unsigned int oldflags; + int err; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (!inode_owner_or_capable(inode)) { + err = -EACCES; + goto setflags_out; + } + if (get_user(flags, (int __user *) arg)) { + err = -EFAULT; + goto setflags_out; + } + + flags = jfs_map_ext2(flags, 1); + if (!S_ISDIR(inode->i_mode)) + flags &= ~JFS_DIRSYNC_FL; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } + + /* Lock against other parallel changes of flags */ + inode_lock(inode); + + oldflags = jfs_inode->mode2; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + */ + if ((oldflags & JFS_IMMUTABLE_FL) || + ((flags ^ oldflags) & + (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + inode_unlock(inode); + err = -EPERM; + goto setflags_out; + } + } + + flags = flags & JFS_FL_USER_MODIFIABLE; + flags |= oldflags & ~JFS_FL_USER_MODIFIABLE; + jfs_inode->mode2 = flags; + + jfs_set_inode_flags(inode); + inode_unlock(inode); + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); +setflags_out: + mnt_drop_write_file(filp); + return err; + } + + case FITRIM: + { + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + s64 ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) { + jfs_warn("FITRIM not supported on device"); + return -EOPNOTSUPP; + } + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max_t(unsigned int, range.minlen, + q->limits.discard_granularity); + + ret = jfs_ioc_trim(inode, &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + + return 0; + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + /* While these ioctl numbers defined with 'long' and have different + * numbers than the 64bit ABI, + * the actual implementation only deals with ints and is compatible. + */ + switch (cmd) { + case JFS_IOC_GETFLAGS32: + cmd = JFS_IOC_GETFLAGS; + break; + case JFS_IOC_SETFLAGS32: + cmd = JFS_IOC_SETFLAGS; + break; + } + return jfs_ioctl(filp, cmd, arg); +} +#endif diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h new file mode 100644 index 000000000..489f993b7 --- /dev/null +++ b/fs/jfs/jfs_acl.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) International Business Machines Corp., 2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_ACL +#define _H_JFS_ACL + +#ifdef CONFIG_JFS_POSIX_ACL + +struct posix_acl *jfs_get_acl(struct inode *inode, int type); +int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int jfs_init_acl(tid_t, struct inode *, struct inode *); + +#else + +static inline int jfs_init_acl(tid_t tid, struct inode *inode, + struct inode *dir) +{ + return 0; +} + +#endif +#endif /* _H_JFS_ACL */ diff --git a/fs/jfs/jfs_btree.h b/fs/jfs/jfs_btree.h new file mode 100644 index 000000000..79c61805b --- /dev/null +++ b/fs/jfs/jfs_btree.h @@ -0,0 +1,172 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_BTREE +#define _H_JFS_BTREE + +/* + * jfs_btree.h: B+-tree + * + * JFS B+-tree (dtree and xtree) common definitions + */ + +/* + * basic btree page - btpage + * +struct btpage { + s64 next; right sibling bn + s64 prev; left sibling bn + + u8 flag; + u8 rsrvd[7]; type specific + s64 self; self address + + u8 entry[4064]; +}; */ + +/* btpaget_t flag */ +#define BT_TYPE 0x07 /* B+-tree index */ +#define BT_ROOT 0x01 /* root page */ +#define BT_LEAF 0x02 /* leaf page */ +#define BT_INTERNAL 0x04 /* internal page */ +#define BT_RIGHTMOST 0x10 /* rightmost page */ +#define BT_LEFTMOST 0x20 /* leftmost page */ +#define BT_SWAPPED 0x80 /* used by fsck for endian swapping */ + +/* btorder (in inode) */ +#define BT_RANDOM 0x0000 +#define BT_SEQUENTIAL 0x0001 +#define BT_LOOKUP 0x0010 +#define BT_INSERT 0x0020 +#define BT_DELETE 0x0040 + +/* + * btree page buffer cache access + */ +#define BT_IS_ROOT(MP) (((MP)->xflag & COMMIT_PAGE) == 0) + +/* get page from buffer page */ +#define BT_PAGE(IP, MP, TYPE, ROOT)\ + (BT_IS_ROOT(MP) ? (TYPE *)&JFS_IP(IP)->ROOT : (TYPE *)(MP)->data) + +/* get the page buffer and the page for specified block address */ +#define BT_GETPAGE(IP, BN, MP, TYPE, SIZE, P, RC, ROOT)\ +{\ + if ((BN) == 0)\ + {\ + MP = (struct metapage *)&JFS_IP(IP)->bxflag;\ + P = (TYPE *)&JFS_IP(IP)->ROOT;\ + RC = 0;\ + }\ + else\ + {\ + MP = read_metapage((IP), BN, SIZE, 1);\ + if (MP) {\ + RC = 0;\ + P = (MP)->data;\ + } else {\ + P = NULL;\ + jfs_err("bread failed!");\ + RC = -EIO;\ + }\ + }\ +} + +#define BT_MARK_DIRTY(MP, IP)\ +{\ + if (BT_IS_ROOT(MP))\ + mark_inode_dirty(IP);\ + else\ + mark_metapage_dirty(MP);\ +} + +/* put the page buffer */ +#define BT_PUTPAGE(MP)\ +{\ + if (! BT_IS_ROOT(MP)) \ + release_metapage(MP); \ +} + + +/* + * btree traversal stack + * + * record the path traversed during the search; + * top frame record the leaf page/entry selected. + */ +struct btframe { /* stack frame */ + s64 bn; /* 8: */ + s16 index; /* 2: */ + s16 lastindex; /* 2: unused */ + struct metapage *mp; /* 4/8: */ +}; /* (16/24) */ + +struct btstack { + struct btframe *top; + int nsplit; + struct btframe stack[MAXTREEHEIGHT]; +}; + +#define BT_CLR(btstack)\ + (btstack)->top = (btstack)->stack + +#define BT_STACK_FULL(btstack)\ + ( (btstack)->top == &((btstack)->stack[MAXTREEHEIGHT-1])) + +#define BT_PUSH(BTSTACK, BN, INDEX)\ +{\ + assert(!BT_STACK_FULL(BTSTACK));\ + (BTSTACK)->top->bn = BN;\ + (BTSTACK)->top->index = INDEX;\ + ++(BTSTACK)->top;\ +} + +#define BT_POP(btstack)\ + ( (btstack)->top == (btstack)->stack ? NULL : --(btstack)->top ) + +#define BT_STACK(btstack)\ + ( (btstack)->top == (btstack)->stack ? NULL : (btstack)->top ) + +static inline void BT_STACK_DUMP(struct btstack *btstack) +{ + int i; + printk("btstack dump:\n"); + for (i = 0; i < MAXTREEHEIGHT; i++) + printk(KERN_ERR "bn = %Lx, index = %d\n", + (long long)btstack->stack[i].bn, + btstack->stack[i].index); +} + +/* retrieve search results */ +#define BT_GETSEARCH(IP, LEAF, BN, MP, TYPE, P, INDEX, ROOT)\ +{\ + BN = (LEAF)->bn;\ + MP = (LEAF)->mp;\ + if (BN)\ + P = (TYPE *)MP->data;\ + else\ + P = (TYPE *)&JFS_IP(IP)->ROOT;\ + INDEX = (LEAF)->index;\ +} + +/* put the page buffer of search */ +#define BT_PUTSEARCH(BTSTACK)\ +{\ + if (! BT_IS_ROOT((BTSTACK)->top->mp))\ + release_metapage((BTSTACK)->top->mp);\ +} +#endif /* _H_JFS_BTREE */ diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c new file mode 100644 index 000000000..35a5b2a81 --- /dev/null +++ b/fs/jfs/jfs_debug.c @@ -0,0 +1,93 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/ctype.h> +#include <linux/module.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/uaccess.h> +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_debug.h" + +#ifdef PROC_FS_JFS /* see jfs_debug.h */ + +#ifdef CONFIG_JFS_DEBUG +static int jfs_loglevel_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", jfsloglevel); + return 0; +} + +static int jfs_loglevel_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, jfs_loglevel_proc_show, NULL); +} + +static ssize_t jfs_loglevel_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + char c; + + if (get_user(c, buffer)) + return -EFAULT; + + /* yes, I know this is an ASCIIism. --hch */ + if (c < '0' || c > '9') + return -EINVAL; + jfsloglevel = c - '0'; + return count; +} + +static const struct file_operations jfs_loglevel_proc_fops = { + .open = jfs_loglevel_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = jfs_loglevel_proc_write, +}; +#endif + +void jfs_proc_init(void) +{ + struct proc_dir_entry *base; + + base = proc_mkdir("fs/jfs", NULL); + if (!base) + return; + +#ifdef CONFIG_JFS_STATISTICS + proc_create_single("lmstats", 0, base, jfs_lmstats_proc_show); + proc_create_single("txstats", 0, base, jfs_txstats_proc_show); + proc_create_single("xtstat", 0, base, jfs_xtstat_proc_show); + proc_create_single("mpstat", 0, base, jfs_mpstat_proc_show); +#endif +#ifdef CONFIG_JFS_DEBUG + proc_create_single("TxAnchor", 0, base, jfs_txanchor_proc_show); + proc_create("loglevel", 0, base, &jfs_loglevel_proc_fops); +#endif +} + +void jfs_proc_clean(void) +{ + remove_proc_subtree("fs/jfs", NULL); +} + +#endif /* PROC_FS_JFS */ diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h new file mode 100644 index 000000000..0d9e35da8 --- /dev/null +++ b/fs/jfs/jfs_debug.h @@ -0,0 +1,122 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DEBUG +#define _H_JFS_DEBUG + +/* + * jfs_debug.h + * + * global debug message, data structure/macro definitions + * under control of CONFIG_JFS_DEBUG, CONFIG_JFS_STATISTICS; + */ + +/* + * Create /proc/fs/jfs if procfs is enabled andeither + * CONFIG_JFS_DEBUG or CONFIG_JFS_STATISTICS is defined + */ +#if defined(CONFIG_PROC_FS) && (defined(CONFIG_JFS_DEBUG) || defined(CONFIG_JFS_STATISTICS)) +#define PROC_FS_JFS +extern void jfs_proc_init(void); +extern void jfs_proc_clean(void); +#endif + +/* + * assert with traditional printf/panic + */ +#define assert(p) do { \ + if (!(p)) { \ + printk(KERN_CRIT "BUG at %s:%d assert(%s)\n", \ + __FILE__, __LINE__, #p); \ + BUG(); \ + } \ +} while (0) + +/* + * debug ON + * -------- + */ +#ifdef CONFIG_JFS_DEBUG +#define ASSERT(p) assert(p) + +/* printk verbosity */ +#define JFS_LOGLEVEL_ERR 1 +#define JFS_LOGLEVEL_WARN 2 +#define JFS_LOGLEVEL_DEBUG 3 +#define JFS_LOGLEVEL_INFO 4 + +extern int jfsloglevel; + +int jfs_txanchor_proc_show(struct seq_file *m, void *v); + +/* information message: e.g., configuration, major event */ +#define jfs_info(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_INFO) \ + printk(KERN_INFO fmt "\n", ## arg); \ +} while (0) + +/* debug message: ad hoc */ +#define jfs_debug(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_DEBUG) \ + printk(KERN_DEBUG fmt "\n", ## arg); \ +} while (0) + +/* warn message: */ +#define jfs_warn(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_WARN) \ + printk(KERN_WARNING fmt "\n", ## arg); \ +} while (0) + +/* error event message: e.g., i/o error */ +#define jfs_err(fmt, arg...) do { \ + if (jfsloglevel >= JFS_LOGLEVEL_ERR) \ + printk(KERN_ERR fmt "\n", ## arg); \ +} while (0) + +/* + * debug OFF + * --------- + */ +#else /* CONFIG_JFS_DEBUG */ +#define ASSERT(p) do {} while (0) +#define jfs_info(fmt, arg...) do {} while (0) +#define jfs_debug(fmt, arg...) do {} while (0) +#define jfs_warn(fmt, arg...) do {} while (0) +#define jfs_err(fmt, arg...) do {} while (0) +#endif /* CONFIG_JFS_DEBUG */ + +/* + * statistics + * ---------- + */ +#ifdef CONFIG_JFS_STATISTICS +int jfs_lmstats_proc_show(struct seq_file *m, void *v); +int jfs_txstats_proc_show(struct seq_file *m, void *v); +int jfs_mpstat_proc_show(struct seq_file *m, void *v); +int jfs_xtstat_proc_show(struct seq_file *m, void *v); + +#define INCREMENT(x) ((x)++) +#define DECREMENT(x) ((x)--) +#define HIGHWATERMARK(x,y) ((x) = max((x), (y))) +#else +#define INCREMENT(x) +#define DECREMENT(x) +#define HIGHWATERMARK(x,y) +#endif /* CONFIG_JFS_STATISTICS */ + +#endif /* _H_JFS_DEBUG */ diff --git a/fs/jfs/jfs_dinode.h b/fs/jfs/jfs_dinode.h new file mode 100644 index 000000000..1682a87c0 --- /dev/null +++ b/fs/jfs/jfs_dinode.h @@ -0,0 +1,183 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DINODE +#define _H_JFS_DINODE + +/* + * jfs_dinode.h: on-disk inode manager + */ + +#define INODESLOTSIZE 128 +#define L2INODESLOTSIZE 7 +#define log2INODESIZE 9 /* log2(bytes per dinode) */ + + +/* + * on-disk inode : 512 bytes + * + * note: align 64-bit fields on 8-byte boundary. + */ +struct dinode { + /* + * I. base area (128 bytes) + * ------------------------ + * + * define generic/POSIX attributes + */ + __le32 di_inostamp; /* 4: stamp to show inode belongs to fileset */ + __le32 di_fileset; /* 4: fileset number */ + __le32 di_number; /* 4: inode number, aka file serial number */ + __le32 di_gen; /* 4: inode generation number */ + + pxd_t di_ixpxd; /* 8: inode extent descriptor */ + + __le64 di_size; /* 8: size */ + __le64 di_nblocks; /* 8: number of blocks allocated */ + + __le32 di_nlink; /* 4: number of links to the object */ + + __le32 di_uid; /* 4: user id of owner */ + __le32 di_gid; /* 4: group id of owner */ + + __le32 di_mode; /* 4: attribute, format and permission */ + + struct timestruc_t di_atime; /* 8: time last data accessed */ + struct timestruc_t di_ctime; /* 8: time last status changed */ + struct timestruc_t di_mtime; /* 8: time last data modified */ + struct timestruc_t di_otime; /* 8: time created */ + + dxd_t di_acl; /* 16: acl descriptor */ + + dxd_t di_ea; /* 16: ea descriptor */ + + __le32 di_next_index; /* 4: Next available dir_table index */ + + __le32 di_acltype; /* 4: Type of ACL */ + + /* + * Extension Areas. + * + * Historically, the inode was partitioned into 4 128-byte areas, + * the last 3 being defined as unions which could have multiple + * uses. The first 96 bytes had been completely unused until + * an index table was added to the directory. It is now more + * useful to describe the last 3/4 of the inode as a single + * union. We would probably be better off redesigning the + * entire structure from scratch, but we don't want to break + * commonality with OS/2's JFS at this time. + */ + union { + struct { + /* + * This table contains the information needed to + * find a directory entry from a 32-bit index. + * If the index is small enough, the table is inline, + * otherwise, an x-tree root overlays this table + */ + struct dir_table_slot _table[12]; /* 96: inline */ + + dtroot_t _dtroot; /* 288: dtree root */ + } _dir; /* (384) */ +#define di_dirtable u._dir._table +#define di_dtroot u._dir._dtroot +#define di_parent di_dtroot.header.idotdot +#define di_DASD di_dtroot.header.DASD + + struct { + union { + u8 _data[96]; /* 96: unused */ + struct { + void *_imap; /* 4: unused */ + __le32 _gengen; /* 4: generator */ + } _imap; + } _u1; /* 96: */ +#define di_gengen u._file._u1._imap._gengen + + union { + xtpage_t _xtroot; + struct { + u8 unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + union { + __le32 _rdev; /* 4: */ + /* + * The fast symlink area + * is expected to overflow + * into _inlineea when + * needed (which will clear + * INLINEEA). + */ + u8 _fastsymlink[128]; + } _u; + u8 _inlineea[128]; + } _special; + } _u2; + } _file; +#define di_xtroot u._file._u2._xtroot +#define di_dxd u._file._u2._special._dxd +#define di_btroot di_xtroot +#define di_inlinedata u._file._u2._special._u +#define di_rdev u._file._u2._special._u._rdev +#define di_fastsymlink u._file._u2._special._u._fastsymlink +#define di_inlineea u._file._u2._special._inlineea + } u; +}; + +/* extended mode bits (on-disk inode di_mode) */ +#define IFJOURNAL 0x00010000 /* journalled file */ +#define ISPARSE 0x00020000 /* sparse file enabled */ +#define INLINEEA 0x00040000 /* inline EA area free */ +#define ISWAPFILE 0x00800000 /* file open for pager swap space */ + +/* more extended mode bits: attributes for OS/2 */ +#define IREADONLY 0x02000000 /* no write access to file */ +#define IHIDDEN 0x04000000 /* hidden file */ +#define ISYSTEM 0x08000000 /* system file */ + +#define IDIRECTORY 0x20000000 /* directory (shadow of real bit) */ +#define IARCHIVE 0x40000000 /* file archive bit */ +#define INEWNAME 0x80000000 /* non-8.3 filename format */ + +#define IRASH 0x4E000000 /* mask for changeable attributes */ +#define ATTRSHIFT 25 /* bits to shift to move attribute + specification to mode position */ + +/* extended attributes for Linux */ + +#define JFS_NOATIME_FL 0x00080000 /* do not update atime */ + +#define JFS_DIRSYNC_FL 0x00100000 /* dirsync behaviour */ +#define JFS_SYNC_FL 0x00200000 /* Synchronous updates */ +#define JFS_SECRM_FL 0x00400000 /* Secure deletion */ +#define JFS_UNRM_FL 0x00800000 /* allow for undelete */ + +#define JFS_APPEND_FL 0x01000000 /* writes to file may only append */ +#define JFS_IMMUTABLE_FL 0x02000000 /* Immutable file */ + +#define JFS_FL_USER_VISIBLE 0x03F80000 +#define JFS_FL_USER_MODIFIABLE 0x03F80000 +#define JFS_FL_INHERIT 0x03C80000 + +/* These are identical to EXT[23]_IOC_GETFLAGS/SETFLAGS */ +#define JFS_IOC_GETFLAGS _IOR('f', 1, long) +#define JFS_IOC_SETFLAGS _IOW('f', 2, long) + +#define JFS_IOC_GETFLAGS32 _IOR('f', 1, int) +#define JFS_IOC_SETFLAGS32 _IOW('f', 2, int) + +#endif /*_H_JFS_DINODE */ diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c new file mode 100644 index 000000000..f76ff0a46 --- /dev/null +++ b/fs/jfs/jfs_discard.c @@ -0,0 +1,119 @@ +/* + * Copyright (C) Tino Reichardt, 2012 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/blkdev.h> + +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_discard.h" +#include "jfs_dmap.h" +#include "jfs_debug.h" + + +/* + * NAME: jfs_issue_discard() + * + * FUNCTION: TRIM the specified block range on device, if supported + * + * PARAMETERS: + * ip - pointer to in-core inode + * blkno - starting block number to be trimmed (0..N) + * nblocks - number of blocks to be trimmed + * + * RETURN VALUES: + * none + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks) +{ + struct super_block *sb = ip->i_sb; + int r = 0; + + r = sb_issue_discard(sb, blkno, nblocks, GFP_NOFS, 0); + if (unlikely(r != 0)) { + jfs_err("JFS: sb_issue_discard(%p, %llu, %llu, GFP_NOFS, 0) = %d => failed!", + sb, (unsigned long long)blkno, + (unsigned long long)nblocks, r); + } + + jfs_info("JFS: sb_issue_discard(%p, %llu, %llu, GFP_NOFS, 0) = %d", + sb, (unsigned long long)blkno, + (unsigned long long)nblocks, r); + + return; +} + +/* + * NAME: jfs_ioc_trim() + * + * FUNCTION: attempt to discard (TRIM) all free blocks from the + * filesystem. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * range - the range, given by user space + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct super_block *sb = ipbmap->i_sb; + int agno, agno_end; + u64 start, end, minlen; + u64 trimmed = 0; + + /** + * convert byte values to block size of filesystem: + * start: First Byte to trim + * len: number of Bytes to trim from start + * minlen: minimum extent length in Bytes + */ + start = range->start >> sb->s_blocksize_bits; + end = start + (range->len >> sb->s_blocksize_bits) - 1; + minlen = range->minlen >> sb->s_blocksize_bits; + if (minlen == 0) + minlen = 1; + + if (minlen > bmp->db_agsize || + start >= bmp->db_mapsize || + range->len < sb->s_blocksize) + return -EINVAL; + + if (end >= bmp->db_mapsize) + end = bmp->db_mapsize - 1; + + /** + * we trim all ag's within the range + */ + agno = BLKTOAG(start, JFS_SBI(ip->i_sb)); + agno_end = BLKTOAG(end, JFS_SBI(ip->i_sb)); + while (agno <= agno_end) { + trimmed += dbDiscardAG(ip, agno, minlen); + agno++; + } + range->len = trimmed << sb->s_blocksize_bits; + + return 0; +} diff --git a/fs/jfs/jfs_discard.h b/fs/jfs/jfs_discard.h new file mode 100644 index 000000000..40d1ee608 --- /dev/null +++ b/fs/jfs/jfs_discard.h @@ -0,0 +1,26 @@ +/* + * Copyright (C) Tino Reichardt, 2012 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DISCARD +#define _H_JFS_DISCARD + +struct fstrim_range; + +extern void jfs_issue_discard(struct inode *ip, u64 blkno, u64 nblocks); +extern int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range); + +#endif /* _H_JFS_DISCARD */ diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c new file mode 100644 index 000000000..1014f2a24 --- /dev/null +++ b/fs/jfs/jfs_dmap.c @@ -0,0 +1,4100 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Tino Reichardt, 2012 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_lock.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" +#include "jfs_discard.h" + +/* + * SERIALIZATION of the Block Allocation Map. + * + * the working state of the block allocation map is accessed in + * two directions: + * + * 1) allocation and free requests that start at the dmap + * level and move up through the dmap control pages (i.e. + * the vast majority of requests). + * + * 2) allocation requests that start at dmap control page + * level and work down towards the dmaps. + * + * the serialization scheme used here is as follows. + * + * requests which start at the bottom are serialized against each + * other through buffers and each requests holds onto its buffers + * as it works it way up from a single dmap to the required level + * of dmap control page. + * requests that start at the top are serialized against each other + * and request that start from the bottom by the multiple read/single + * write inode lock of the bmap inode. requests starting at the top + * take this lock in write mode while request starting at the bottom + * take the lock in read mode. a single top-down request may proceed + * exclusively while multiple bottoms-up requests may proceed + * simultaneously (under the protection of busy buffers). + * + * in addition to information found in dmaps and dmap control pages, + * the working state of the block allocation map also includes read/ + * write information maintained in the bmap descriptor (i.e. total + * free block count, allocation group level free block counts). + * a single exclusive lock (BMAP_LOCK) is used to guard this information + * in the face of multiple-bottoms up requests. + * (lock ordering: IREAD_LOCK, BMAP_LOCK); + * + * accesses to the persistent state of the block allocation map (limited + * to the persistent bitmaps in dmaps) is guarded by (busy) buffers. + */ + +#define BMAP_LOCK_INIT(bmp) mutex_init(&bmp->db_bmaplock) +#define BMAP_LOCK(bmp) mutex_lock(&bmp->db_bmaplock) +#define BMAP_UNLOCK(bmp) mutex_unlock(&bmp->db_bmaplock) + +/* + * forward references + */ +static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); +static int dbBackSplit(dmtree_t * tp, int leafno); +static int dbJoin(dmtree_t * tp, int leafno, int newval); +static void dbAdjTree(dmtree_t * tp, int leafno, int newval); +static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, + int level); +static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results); +static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks, + int l2nb, s64 * results); +static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks, + int l2nb, + s64 * results); +static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, + s64 * results); +static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, + s64 * results); +static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); +static int dbFindBits(u32 word, int l2nb); +static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno); +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); +static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbMaxBud(u8 * cp); +static int blkstol2(s64 nb); + +static int cntlz(u32 value); +static int cnttz(u32 word); + +static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks); +static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks); +static int dbInitDmapTree(struct dmap * dp); +static int dbInitTree(struct dmaptree * dtp); +static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i); +static int dbGetL2AGSize(s64 nblocks); + +/* + * buddy table + * + * table used for determining buddy sizes within characters of + * dmap bitmap words. the characters themselves serve as indexes + * into the table, with the table elements yielding the maximum + * binary buddy of free bits within the character. + */ +static const s8 budtab[256] = { + 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1 +}; + +/* + * NAME: dbMount() + * + * FUNCTION: initializate the block allocation map. + * + * memory is allocated for the in-core bmap descriptor and + * the in-core descriptor is initialized from disk. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient memory + * -EIO - i/o error + * -EINVAL - wrong bmap data + */ +int dbMount(struct inode *ipbmap) +{ + struct bmap *bmp; + struct dbmap_disk *dbmp_le; + struct metapage *mp; + int i; + + /* + * allocate/initialize the in-memory bmap descriptor + */ + /* allocate memory for the in-memory bmap descriptor */ + bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL); + if (bmp == NULL) + return -ENOMEM; + + /* read the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + kfree(bmp); + return -EIO; + } + + /* copy the on-disk bmap descriptor to its in-memory version. */ + dbmp_le = (struct dbmap_disk *) mp->data; + bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); + bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); + bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); + bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); + if (!bmp->db_numag) { + release_metapage(mp); + kfree(bmp); + return -EINVAL; + } + + bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); + bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); + bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); + bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); + bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); + bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); + bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); + bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); + for (i = 0; i < MAXAG; i++) + bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); + bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); + bmp->db_maxfreebud = dbmp_le->dn_maxfreebud; + + /* release the buffer. */ + release_metapage(mp); + + /* bind the bmap inode and the bmap descriptor to each other. */ + bmp->db_ipbmap = ipbmap; + JFS_SBI(ipbmap->i_sb)->bmap = bmp; + + memset(bmp->db_active, 0, sizeof(bmp->db_active)); + + /* + * allocate/initialize the bmap lock + */ + BMAP_LOCK_INIT(bmp); + + return (0); +} + + +/* + * NAME: dbUnmount() + * + * FUNCTION: terminate the block allocation map in preparation for + * file system unmount. + * + * the in-core bmap descriptor is written to disk and + * the memory for this descriptor is freed. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbUnmount(struct inode *ipbmap, int mounterror) +{ + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + if (!(mounterror || isReadOnly(ipbmap))) + dbSync(ipbmap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipbmap->i_mapping, 0); + + /* free the memory for the in-memory bmap. */ + kfree(bmp); + + return (0); +} + +/* + * dbSync() + */ +int dbSync(struct inode *ipbmap) +{ + struct dbmap_disk *dbmp_le; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + struct metapage *mp; + int i; + + /* + * write bmap global control page + */ + /* get the buffer for the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jfs_err("dbSync: read_metapage failed!"); + return -EIO; + } + /* copy the in-memory version of the bmap to the on-disk version */ + dbmp_le = (struct dbmap_disk *) mp->data; + dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize); + dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree); + dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage); + dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag); + dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel); + dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); + dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); + dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); + dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight); + dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); + dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); + dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); + for (i = 0; i < MAXAG; i++) + dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]); + dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize); + dbmp_le->dn_maxfreebud = bmp->db_maxfreebud; + + /* write the buffer */ + write_metapage(mp); + + /* + * write out dirty pages of bmap + */ + filemap_write_and_wait(ipbmap->i_mapping); + + diWriteSpecial(ipbmap, 0); + + return (0); +} + +/* + * NAME: dbFree() + * + * FUNCTION: free the specified block range from the working block + * allocation map. + * + * the blocks will be free from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbFree(struct inode *ip, s64 blkno, s64 nblocks) +{ + struct metapage *mp; + struct dmap *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct super_block *sb = ipbmap->i_sb; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* block to be freed better be within the mapsize. */ + if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) { + IREAD_UNLOCK(ipbmap); + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ip->i_sb, "block to be freed is outside the map\n"); + return -EIO; + } + + /** + * TRIM the blocks, when mounted with discard option + */ + if (JFS_SBI(sb)->flag & JFS_DISCARD) + if (JFS_SBI(sb)->minblks_trim <= nblocks) + jfs_issue_discard(ipbmap, blkno, nblocks); + + /* + * free the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* determine the number of blocks to be freed from + * this dmap. + */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + /* free the blocks. */ + if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { + jfs_error(ip->i_sb, "error in block map\n"); + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + } + + /* write the last buffer. */ + if (mp) + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +/* + * NAME: dbUpdatePMap() + * + * FUNCTION: update the allocation state (free or allocate) of the + * specified block range in the persistent block allocation map. + * + * the blocks will be updated in the persistent map one + * dmap at a time. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * free - 'true' if block range is to be freed from the persistent + * map; 'false' if it is to be allocated. + * blkno - starting block number of the range. + * nblocks - number of contiguous blocks in the range. + * tblk - transaction block; + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int +dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, struct tblock * tblk) +{ + int nblks, dbitno, wbitno, rbits; + int word, nbits, nwords; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + s64 lblkno, rem, lastlblkno; + u32 mask; + struct dmap *dp; + struct metapage *mp; + struct jfs_log *log; + int lsn, difft, diffp; + unsigned long flags; + + /* the blocks better be within the mapsize. */ + if (blkno + nblocks > bmp->db_mapsize) { + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ipbmap->i_sb, "blocks are outside the map\n"); + return -EIO; + } + + /* compute delta of transaction lsn from log syncpt */ + lsn = tblk->lsn; + log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; + logdiff(difft, lsn, log); + + /* + * update the block state a dmap at a time. + */ + mp = NULL; + lastlblkno = 0; + for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) { + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + if (lblkno != lastlblkno) { + if (mp) { + write_metapage(mp); + } + + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, + 0); + if (mp == NULL) + return -EIO; + metapage_wait_for_io(mp); + } + dp = (struct dmap *) mp->data; + + /* determine the bit number and word within the dmap of + * the starting block. also determine how many blocks + * are to be updated within this dmap. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + nblks = min(rem, (s64)BPERDMAP - dbitno); + + /* update the bits of the dmap words. the first and last + * words may only have a subset of their bits updated. if + * this is the case, we'll work against that word (i.e. + * partial first and/or last) only in a single pass. a + * single pass will also be used to update all words that + * are to have all their bits updated. + */ + for (rbits = nblks; rbits > 0; + rbits -= nbits, dbitno += nbits) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nbits = min(rbits, DBWORD - wbitno); + + /* check if only part of the word is to be updated. */ + if (nbits < DBWORD) { + /* update (free or allocate) the bits + * in this word. + */ + mask = + (ONES << (DBWORD - nbits) >> wbitno); + if (free) + dp->pmap[word] &= + cpu_to_le32(~mask); + else + dp->pmap[word] |= + cpu_to_le32(mask); + + word += 1; + } else { + /* one or more words are to have all + * their bits updated. determine how + * many words and how many bits. + */ + nwords = rbits >> L2DBWORD; + nbits = nwords << L2DBWORD; + + /* update (free or allocate) the bits + * in these words. + */ + if (free) + memset(&dp->pmap[word], 0, + nwords * 4); + else + memset(&dp->pmap[word], (int) ONES, + nwords * 4); + + word += nwords; + } + } + + /* + * update dmap lsn + */ + if (lblkno == lastlblkno) + continue; + + lastlblkno = lblkno; + + LOGSYNC_LOCK(log, flags); + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + + /* move bp after tblock in logsync list */ + list_move(&mp->synclist, &tblk->synclist); + } + + /* inherit younger/larger clsn */ + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else { + mp->log = log; + mp->lsn = lsn; + + /* insert bp after tblock in logsync list */ + log->count++; + list_add(&mp->synclist, &tblk->synclist); + + mp->clsn = tblk->clsn; + } + LOGSYNC_UNLOCK(log, flags); + } + + /* write the last buffer. */ + if (mp) { + write_metapage(mp); + } + + return (0); +} + + +/* + * NAME: dbNextAG() + * + * FUNCTION: find the preferred allocation group for new allocations. + * + * Within the allocation groups, we maintain a preferred + * allocation group which consists of a group with at least + * average free space. It is the preferred group that we target + * new inode allocation towards. The tie-in between inode + * allocation and block allocation occurs as we allocate the + * first (data) block of an inode and specify the inode (block) + * as the allocation hint for this block. + * + * We try to avoid having more than one open file growing in + * an allocation group, as this will lead to fragmentation. + * This differs from the old OS/2 method of trying to keep + * empty ags around for large allocations. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * the preferred allocation group number. + */ +int dbNextAG(struct inode *ipbmap) +{ + s64 avgfree; + int agpref; + s64 hwm = 0; + int i; + int next_best = -1; + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + BMAP_LOCK(bmp); + + /* determine the average number of free blocks within the ags. */ + avgfree = (u32)bmp->db_nfree / bmp->db_numag; + + /* + * if the current preferred ag does not have an active allocator + * and has at least average freespace, return it + */ + agpref = bmp->db_agpref; + if ((atomic_read(&bmp->db_active[agpref]) == 0) && + (bmp->db_agfree[agpref] >= avgfree)) + goto unlock; + + /* From the last preferred ag, find the next one with at least + * average free space. + */ + for (i = 0 ; i < bmp->db_numag; i++, agpref++) { + if (agpref == bmp->db_numag) + agpref = 0; + + if (atomic_read(&bmp->db_active[agpref])) + /* open file is currently growing in this ag */ + continue; + if (bmp->db_agfree[agpref] >= avgfree) { + /* Return this one */ + bmp->db_agpref = agpref; + goto unlock; + } else if (bmp->db_agfree[agpref] > hwm) { + /* Less than avg. freespace, but best so far */ + hwm = bmp->db_agfree[agpref]; + next_best = agpref; + } + } + + /* + * If no inactive ag was found with average freespace, use the + * next best + */ + if (next_best != -1) + bmp->db_agpref = next_best; + /* else leave db_agpref unchanged */ +unlock: + BMAP_UNLOCK(bmp); + + /* return the preferred group. + */ + return (bmp->db_agpref); +} + +/* + * NAME: dbAlloc() + * + * FUNCTION: attempt to allocate a specified number of contiguous free + * blocks from the working allocation block map. + * + * the block allocation policy uses hints and a multi-step + * approach. + * + * for allocation requests smaller than the number of blocks + * per dmap, we first try to allocate the new blocks + * immediately following the hint. if these blocks are not + * available, we try to allocate blocks near the hint. if + * no blocks near the hint are available, we next try to + * allocate within the same dmap as contains the hint. + * + * if no blocks are available in the dmap or the allocation + * request is larger than the dmap size, we try to allocate + * within the same allocation group as contains the hint. if + * this does not succeed, we finally try to allocate anywhere + * within the aggregate. + * + * we also try to allocate anywhere within the aggregate for + * for allocation requests larger than the allocation group + * size or requests that specify no hint value. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * hint - allocation hint. + * nblocks - number of contiguous blocks in the range. + * results - on successful return, set to the starting block number + * of the newly allocated contiguous range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) +{ + int rc, agno; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp; + struct metapage *mp; + s64 lblkno, blkno; + struct dmap *dp; + int l2nb; + s64 mapSize; + int writers; + + /* assert that nblocks is valid */ + assert(nblocks > 0); + + /* get the log2 number of blocks to be allocated. + * if the number of blocks is not a log2 multiple, + * it will be rounded up to the next log2 multiple. + */ + l2nb = BLKSTOL2(nblocks); + + bmp = JFS_SBI(ip->i_sb)->bmap; + + mapSize = bmp->db_mapsize; + + /* the hint should be within the map */ + if (hint >= mapSize) { + jfs_error(ip->i_sb, "the hint is outside the map\n"); + return -EIO; + } + + /* if the number of blocks to be allocated is greater than the + * allocation group size, try to allocate anywhere. + */ + if (l2nb > bmp->db_agl2size) { + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + rc = dbAllocAny(bmp, nblocks, l2nb, results); + + goto write_unlock; + } + + /* + * If no hint, let dbNextAG recommend an allocation group + */ + if (hint == 0) + goto pref_ag; + + /* we would like to allocate close to the hint. adjust the + * hint to the block following the hint since the allocators + * will start looking for free space starting at this point. + */ + blkno = hint + 1; + + if (blkno >= bmp->db_mapsize) + goto pref_ag; + + agno = blkno >> bmp->db_agl2size; + + /* check if blkno crosses over into a new allocation group. + * if so, check if we should allow allocations within this + * allocation group. + */ + if ((blkno & (bmp->db_agsize - 1)) == 0) + /* check if the AG is currently being written to. + * if so, call dbNextAG() to find a non-busy + * AG with sufficient free space. + */ + if (atomic_read(&bmp->db_active[agno])) + goto pref_ag; + + /* check if the allocation request size can be satisfied from a + * single dmap. if so, try to allocate from the dmap containing + * the hint using a tiered strategy. + */ + if (nblocks <= BPERDMAP) { + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* get the buffer for the dmap containing the hint. + */ + rc = -EIO; + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + goto read_unlock; + + dp = (struct dmap *) mp->data; + + /* first, try to satisfy the allocation request with the + * blocks beginning at the hint. + */ + if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks)) + != -ENOSPC) { + if (rc == 0) { + *results = blkno; + mark_metapage_dirty(mp); + } + + release_metapage(mp); + goto read_unlock; + } + + writers = atomic_read(&bmp->db_active[agno]); + if ((writers > 1) || + ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) { + /* + * Someone else is writing in this allocation + * group. To avoid fragmenting, try another ag + */ + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + goto pref_ag; + } + + /* next, try to satisfy the allocation request with blocks + * near the hint. + */ + if ((rc = + dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + goto read_unlock; + } + + /* try to satisfy the allocation request with blocks within + * the same dmap as the hint. + */ + if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results)) + != -ENOSPC) { + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + goto read_unlock; + } + + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + } + + /* try to satisfy the allocation request with blocks within + * the same allocation group as the hint. + */ + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC) + goto write_unlock; + + IWRITE_UNLOCK(ipbmap); + + + pref_ag: + /* + * Let dbNextAG recommend a preferred allocation group + */ + agno = dbNextAG(ipbmap); + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* Try to allocate within this allocation group. if that fails, try to + * allocate anywhere in the map. + */ + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC) + rc = dbAllocAny(bmp, nblocks, l2nb, results); + + write_unlock: + IWRITE_UNLOCK(ipbmap); + + return (rc); + + read_unlock: + IREAD_UNLOCK(ipbmap); + + return (rc); +} + +#ifdef _NOTYET +/* + * NAME: dbAllocExact() + * + * FUNCTION: try to allocate the requested extent; + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - extent address; + * nblocks - extent length; + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) +{ + int rc; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + struct dmap *dp; + s64 lblkno; + struct metapage *mp; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* + * validate extent request: + * + * note: defragfs policy: + * max 64 blocks will be moved. + * allocation request size must be satisfied from a single dmap. + */ + if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + return -EINVAL; + } + + if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) { + /* the free space is no longer available */ + IREAD_UNLOCK(ipbmap); + return -ENOSPC; + } + + /* read in the dmap covering the extent */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* try to allocate the requested extent */ + rc = dbAllocNext(bmp, dp, blkno, nblocks); + + IREAD_UNLOCK(ipbmap); + + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + + return (rc); +} +#endif /* _NOTYET */ + +/* + * NAME: dbReAlloc() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. if these + * blocks are not available, this routine will attempt to + * allocate a new set of contiguous blocks large enough + * to cover the existing allocation plus the additional + * number of blocks required. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. + * results - on successful return, set to the starting block number + * of the existing allocation if the existing allocation + * was extended in place or to a newly allocated contiguous + * range if the existing allocation could not be extended + * in place. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +int +dbReAlloc(struct inode *ip, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results) +{ + int rc; + + /* try to extend the allocation in place. + */ + if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) { + *results = blkno; + return (0); + } else { + if (rc != -ENOSPC) + return (rc); + } + + /* could not extend the allocation in place, so allocate a + * new set of blocks for the entire request (i.e. try to get + * a range of contiguous blocks large enough to cover the + * existing allocation plus the additional blocks.) + */ + return (dbAlloc + (ip, blkno + nblocks - 1, addnblocks + nblocks, results)); +} + + +/* + * NAME: dbExtend() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + */ +static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 lblkno, lastblkno, extblkno; + uint rel_block; + struct metapage *mp; + struct dmap *dp; + int rc; + struct inode *ipbmap = sbi->ipbmap; + struct bmap *bmp; + + /* + * We don't want a non-aligned extent to cross a page boundary + */ + if (((rel_block = blkno & (sbi->nbperpage - 1))) && + (rel_block + nblocks + addnblocks > sbi->nbperpage)) + return -ENOSPC; + + /* get the last block of the current allocation */ + lastblkno = blkno + nblocks - 1; + + /* determine the block number of the block following + * the existing allocation. + */ + extblkno = lastblkno + 1; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* better be within the file system */ + bmp = sbi->bmap; + if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + jfs_error(ip->i_sb, "the block is outside the filesystem\n"); + return -EIO; + } + + /* we'll attempt to extend the current allocation in place by + * allocating the additional blocks as the blocks immediately + * following the current allocation. we only try to extend the + * current allocation in place if the number of additional blocks + * can fit into a dmap, the last block of the current allocation + * is not the last block of the file system, and the start of the + * inplace extension is not on an allocation group boundary. + */ + if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize || + (extblkno & (bmp->db_agsize - 1)) == 0) { + IREAD_UNLOCK(ipbmap); + return -ENOSPC; + } + + /* get the buffer for the dmap containing the first block + * of the extension. + */ + lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + + dp = (struct dmap *) mp->data; + + /* try to allocate the blocks immediately following the + * current allocation. + */ + rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks); + + IREAD_UNLOCK(ipbmap); + + /* were we successful ? */ + if (rc == 0) + write_metapage(mp); + else + /* we were not successful */ + release_metapage(mp); + + return (rc); +} + + +/* + * NAME: dbAllocNext() + * + * FUNCTION: attempt to allocate the blocks of the specified block + * range within a dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - starting block number of the range. + * nblocks - number of contiguous free blocks of the range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw; + int l2size; + s8 *leaf; + u32 mask; + + if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n"); + return -EIO; + } + + /* pick up a pointer to the leaves of the dmap tree. + */ + leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* check if the specified block range is contained within + * this dmap. + */ + if (dbitno + nblocks > BPERDMAP) + return -ENOSPC; + + /* check if the starting leaf indicates that anything + * is free. + */ + if (leaf[word] == NOFREE) + return -ENOSPC; + + /* check the dmaps words corresponding to block range to see + * if the block range is free. not all bits of the first and + * last words may be contained within the block range. if this + * is the case, we'll work against those words (i.e. partial first + * and/or last) on an individual basis (a single pass) and examine + * the actual bits to determine if they are free. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the leaves of the dmap + * tree will be examined to determine if the blocks are free. a + * single leaf may describe the free space of multiple dmap + * words, so we may visit only a subset of the actual leaves + * corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of the word is to be examined. + */ + if (nb < DBWORD) { + /* check if the bits are free. + */ + mask = (ONES << (DBWORD - nb) >> wbitno); + if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask) + return -ENOSPC; + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and how many bits. + */ + nwords = rembits >> L2DBWORD; + nb = nwords << L2DBWORD; + + /* now examine the appropriate leaves to determine + * if the blocks are free. + */ + while (nwords > 0) { + /* does the leaf describe any free space ? + */ + if (leaf[word] < BUDMIN) + return -ENOSPC; + + /* determine the l2 number of bits provided + * by this leaf. + */ + l2size = + min_t(int, leaf[word], NLSTOL2BSZ(nwords)); + + /* determine how many words were handled. + */ + nw = BUDSIZE(l2size, BUDMIN); + + nwords -= nw; + word += nw; + } + } + } + + /* allocate the blocks. + */ + return (dbAllocDmap(bmp, dp, blkno, nblocks)); +} + + +/* + * NAME: dbAllocNear() + * + * FUNCTION: attempt to allocate a number of contiguous free blocks near + * a specified block (hint) within a dmap. + * + * starting with the dmap leaf that covers the hint, we'll + * check the next four contiguous leaves for sufficient free + * space. if sufficient free space is found, we'll allocate + * the desired free space. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - block number to allocate near. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAllocNear(struct bmap * bmp, + struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results) +{ + int word, lword, rc; + s8 *leaf; + + if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmap page\n"); + return -EIO; + } + + leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); + + /* determine the word within the dmap that holds the hint + * (i.e. blkno). also, determine the last word in the dmap + * that we'll include in our examination. + */ + word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; + lword = min(word + 4, LPERDMAP); + + /* examine the leaves for sufficient free space. + */ + for (; word < lword; word++) { + /* does the leaf describe sufficient free space ? + */ + if (leaf[word] < l2nb) + continue; + + /* determine the block number within the file system + * of the first block described by this dmap word. + */ + blkno = le64_to_cpu(dp->start) + (word << L2DBWORD); + + /* if not all bits of the dmap word are free, get the + * starting bit number within the dmap word of the required + * string of free bits and adjust the block number with the + * value. + */ + if (leaf[word] < BUDMIN) + blkno += + dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb); + + /* allocate the blocks. + */ + if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) + *results = blkno; + + return (rc); + } + + return -ENOSPC; +} + + +/* + * NAME: dbAllocAG() + * + * FUNCTION: attempt to allocate the specified number of contiguous + * free blocks within the specified allocation group. + * + * unless the allocation group size is equal to the number + * of blocks per dmap, the dmap control pages will be used to + * find the required free space, if available. we start the + * search at the highest dmap control page level which + * distinctly describes the allocation group's free space + * (i.e. the highest level at which the allocation group's + * free space is not mixed in with that of any other group). + * in addition, we start the search within this level at a + * height of the dmapctl dmtree at which the nodes distinctly + * describe the allocation group's free space. at this height, + * the allocation group's free space may be represented by 1 + * or two sub-trees, depending on the allocation group size. + * we search the top nodes of these subtrees left to right for + * sufficient free space. if sufficient free space is found, + * the subtree is searched to find the leftmost leaf that + * has free space. once we have made it to the leaf, we + * move the search to the next lower level dmap control page + * corresponding to this leaf. we continue down the dmap control + * pages until we find the dmap that contains or starts the + * sufficient free space and we allocate at this dmap. + * + * if the allocation group size is equal to the dmap size, + * we'll start at the dmap corresponding to the allocation + * group and attempt the allocation at this level. + * + * the dmap control page search is also not performed if the + * allocation group is completely free and we go to the first + * dmap of the allocation group to do the allocation. this is + * done because the allocation group may be part (not the first + * part) of a larger binary buddy system, causing the dmap + * control pages to indicate no free space (NOFREE) within + * the allocation group. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * agno - allocation group number. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * note: IWRITE_LOCK(ipmap) held on entry/exit; + */ +static int +dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) +{ + struct metapage *mp; + struct dmapctl *dcp; + int rc, ti, i, k, m, n, agperlev; + s64 blkno, lblkno; + int budmin; + + /* allocation request should not be for more than the + * allocation group size. + */ + if (l2nb > bmp->db_agl2size) { + jfs_error(bmp->db_ipbmap->i_sb, + "allocation request is larger than the allocation group size\n"); + return -EIO; + } + + /* determine the starting block number of the allocation + * group. + */ + blkno = (s64) agno << bmp->db_agl2size; + + /* check if the allocation group size is the minimum allocation + * group size or if the allocation group is completely free. if + * the allocation group size is the minimum size of BPERDMAP (i.e. + * 1 dmap), there is no need to search the dmap control page (below) + * that fully describes the allocation group since the allocation + * group is already fully described by a dmap. in this case, we + * just call dbAllocCtl() to search the dmap tree and allocate the + * required space if available. + * + * if the allocation group is completely free, dbAllocCtl() is + * also called to allocate the required space. this is done for + * two reasons. first, it makes no sense searching the dmap control + * pages for free space when we know that free space exists. second, + * the dmap control pages may indicate that the allocation group + * has no free space if the allocation group is part (not the first + * part) of a larger binary buddy system. + */ + if (bmp->db_agsize == BPERDMAP + || bmp->db_agfree[agno] == bmp->db_agsize) { + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if ((rc == -ENOSPC) && + (bmp->db_agfree[agno] == bmp->db_agsize)) { + printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl failed in free AG\n"); + } + return (rc); + } + + /* the buffer for the dmap control page that fully describes the + * allocation group. + */ + lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + budmin = dcp->budmin; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n"); + release_metapage(mp); + return -EIO; + } + + /* search the subtree(s) of the dmap control page that describes + * the allocation group, looking for sufficient free space. to begin, + * determine how many allocation groups are represented in a dmap + * control page at the control page level (i.e. L0, L1, L2) that + * fully describes an allocation group. next, determine the starting + * tree index of this allocation group within the control page. + */ + agperlev = + (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth; + ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); + + /* dmap control page trees fan-out by 4 and a single allocation + * group may be described by 1 or 2 subtrees within the ag level + * dmap control page, depending upon the ag size. examine the ag's + * subtrees for sufficient free space, starting with the leftmost + * subtree. + */ + for (i = 0; i < bmp->db_agwidth; i++, ti++) { + /* is there sufficient free space ? + */ + if (l2nb > dcp->stree[ti]) + continue; + + /* sufficient free space found in a subtree. now search down + * the subtree to find the leftmost leaf that describes this + * free space. + */ + for (k = bmp->db_agheight; k > 0; k--) { + for (n = 0, m = (ti << 2) + 1; n < 4; n++) { + if (l2nb <= dcp->stree[m + n]) { + ti = m + n; + break; + } + } + if (n == 4) { + jfs_error(bmp->db_ipbmap->i_sb, + "failed descending stree\n"); + release_metapage(mp); + return -EIO; + } + } + + /* determine the block number within the file system + * that corresponds to this leaf. + */ + if (bmp->db_aglevel == 2) + blkno = 0; + else if (bmp->db_aglevel == 1) + blkno &= ~(MAXL1SIZE - 1); + else /* bmp->db_aglevel == 0 */ + blkno &= ~(MAXL0SIZE - 1); + + blkno += + ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin; + + /* release the buffer in preparation for going down + * the next level of dmap control pages. + */ + release_metapage(mp); + + /* check if we need to continue to search down the lower + * level dmap control pages. we need to if the number of + * blocks required is less than maximum number of blocks + * described at the next lower level. + */ + if (l2nb < budmin) { + + /* search the lower level dmap control pages to get + * the starting block number of the dmap that + * contains or starts off the free space. + */ + if ((rc = + dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1, + &blkno))) { + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "control page inconsistent\n"); + return -EIO; + } + return (rc); + } + } + + /* allocate the blocks. + */ + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "unable to allocate blocks\n"); + rc = -EIO; + } + return (rc); + } + + /* no space in the allocation group. release the buffer and + * return -ENOSPC. + */ + release_metapage(mp); + + return -ENOSPC; +} + + +/* + * NAME: dbAllocAny() + * + * FUNCTION: attempt to allocate the specified number of contiguous + * free blocks anywhere in the file system. + * + * dbAllocAny() attempts to find the sufficient free space by + * searching down the dmap control pages, starting with the + * highest level (i.e. L0, L1, L2) control page. if free space + * large enough to satisfy the desired free space is found, the + * desired free space is allocated. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results) +{ + int rc; + s64 blkno = 0; + + /* starting with the top level dmap control page, search + * down the dmap control levels for sufficient free space. + * if free space is found, dbFindCtl() returns the starting + * block number of the dmap that contains or starts off the + * range of free space. + */ + if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno))) + return (rc); + + /* allocate the blocks. + */ + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, "unable to allocate blocks\n"); + return -EIO; + } + return (rc); +} + + +/* + * NAME: dbDiscardAG() + * + * FUNCTION: attempt to discard (TRIM) all free blocks of specific AG + * + * algorithm: + * 1) allocate blocks, as large as possible and save them + * while holding IWRITE_LOCK on ipbmap + * 2) trim all these saved block/length values + * 3) mark the blocks free again + * + * benefit: + * - we work only on one ag at some time, minimizing how long we + * need to lock ipbmap + * - reading / writing the fs is possible most time, even on + * trimming + * + * downside: + * - we write two times to the dmapctl and dmap pages + * - but for me, this seems the best way, better ideas? + * /TR 2012 + * + * PARAMETERS: + * ip - pointer to in-core inode + * agno - ag to trim + * minlen - minimum value of contiguous blocks + * + * RETURN VALUES: + * s64 - actual number of blocks trimmed + */ +s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + s64 nblocks, blkno; + u64 trimmed = 0; + int rc, l2nb; + struct super_block *sb = ipbmap->i_sb; + + struct range2trim { + u64 blkno; + u64 nblocks; + } *totrim, *tt; + + /* max blkno / nblocks pairs to trim */ + int count = 0, range_cnt; + u64 max_ranges; + + /* prevent others from writing new stuff here, while trimming */ + IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP); + + nblocks = bmp->db_agfree[agno]; + max_ranges = nblocks; + do_div(max_ranges, minlen); + range_cnt = min_t(u64, max_ranges + 1, 32 * 1024); + totrim = kmalloc_array(range_cnt, sizeof(struct range2trim), GFP_NOFS); + if (totrim == NULL) { + jfs_error(bmp->db_ipbmap->i_sb, "no memory for trim array\n"); + IWRITE_UNLOCK(ipbmap); + return 0; + } + + tt = totrim; + while (nblocks >= minlen) { + l2nb = BLKSTOL2(nblocks); + + /* 0 = okay, -EIO = fatal, -ENOSPC -> try smaller block */ + rc = dbAllocAG(bmp, agno, nblocks, l2nb, &blkno); + if (rc == 0) { + tt->blkno = blkno; + tt->nblocks = nblocks; + tt++; count++; + + /* the whole ag is free, trim now */ + if (bmp->db_agfree[agno] == 0) + break; + + /* give a hint for the next while */ + nblocks = bmp->db_agfree[agno]; + continue; + } else if (rc == -ENOSPC) { + /* search for next smaller log2 block */ + l2nb = BLKSTOL2(nblocks) - 1; + nblocks = 1LL << l2nb; + } else { + /* Trim any already allocated blocks */ + jfs_error(bmp->db_ipbmap->i_sb, "-EIO\n"); + break; + } + + /* check, if our trim array is full */ + if (unlikely(count >= range_cnt - 1)) + break; + } + IWRITE_UNLOCK(ipbmap); + + tt->nblocks = 0; /* mark the current end */ + for (tt = totrim; tt->nblocks != 0; tt++) { + /* when mounted with online discard, dbFree() will + * call jfs_issue_discard() itself */ + if (!(JFS_SBI(sb)->flag & JFS_DISCARD)) + jfs_issue_discard(ip, tt->blkno, tt->nblocks); + dbFree(ip, tt->blkno, tt->nblocks); + trimmed += tt->nblocks; + } + kfree(totrim); + + return trimmed; +} + +/* + * NAME: dbFindCtl() + * + * FUNCTION: starting at a specified dmap control page level and block + * number, search down the dmap control levels for a range of + * contiguous free blocks large enough to satisfy an allocation + * request for the specified number of free blocks. + * + * if sufficient contiguous free blocks are found, this routine + * returns the starting block number within a dmap page that + * contains or starts a range of contiqious free blocks that + * is sufficient in size. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * level - starting dmap control page level. + * l2nb - log2 number of contiguous free blocks desired. + * *blkno - on entry, starting block number for conducting the search. + * on successful return, the first block within a dmap page + * that contains or starts a range of contiguous free blocks. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno) +{ + int rc, leafidx, lev; + s64 b, lblkno; + struct dmapctl *dcp; + int budmin; + struct metapage *mp; + + /* starting at the specified dmap control page level and block + * number, search down the dmap control levels for the starting + * block number of a dmap page that contains or starts off + * sufficient free blocks. + */ + for (lev = level, b = *blkno; lev >= 0; lev--) { + /* get the buffer of the dmap control page for the block + * number and level (i.e. L0, L1, L2). + */ + lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + budmin = dcp->budmin; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, + "Corrupt dmapctl page\n"); + release_metapage(mp); + return -EIO; + } + + /* search the tree within the dmap control page for + * sufficient free space. if sufficient free space is found, + * dbFindLeaf() returns the index of the leaf at which + * free space was found. + */ + rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx); + + /* release the buffer. + */ + release_metapage(mp); + + /* space found ? + */ + if (rc) { + if (lev != level) { + jfs_error(bmp->db_ipbmap->i_sb, + "dmap inconsistent\n"); + return -EIO; + } + return -ENOSPC; + } + + /* adjust the block number to reflect the location within + * the dmap control page (i.e. the leaf) at which free + * space was found. + */ + b += (((s64) leafidx) << budmin); + + /* we stop the search at this dmap control page level if + * the number of blocks required is greater than or equal + * to the maximum number of blocks described at the next + * (lower) level. + */ + if (l2nb >= budmin) + break; + } + + *blkno = b; + return (0); +} + + +/* + * NAME: dbAllocCtl() + * + * FUNCTION: attempt to allocate a specified number of contiguous + * blocks starting within a specific dmap. + * + * this routine is called by higher level routines that search + * the dmap control pages above the actual dmaps for contiguous + * free space. the result of successful searches by these + * routines are the starting block numbers within dmaps, with + * the dmaps themselves containing the desired contiguous free + * space or starting a contiguous free space of desired size + * that is made up of the blocks of one or more dmaps. these + * calls should not fail due to insufficent resources. + * + * this routine is called in some cases where it is not known + * whether it will fail due to insufficient resources. more + * specifically, this occurs when allocating from an allocation + * group whose size is equal to the number of blocks per dmap. + * in this case, the dmap control pages are not examined prior + * to calling this routine (to save pathlength) and the call + * might fail. + * + * for a request size that fits within a dmap, this routine relies + * upon the dmap's dmtree to find the requested contiguous free + * space. for request sizes that are larger than a dmap, the + * requested free space will start at the first block of the + * first dmap (i.e. blkno). + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks to allocate. + * l2nb - log2 number of contiguous free blocks to allocate. + * blkno - starting block number of the dmap to start the allocation + * from. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) +{ + int rc, nb; + s64 b, lblkno, n; + struct metapage *mp; + struct dmap *dp; + + /* check if the allocation request is confined to a single dmap. + */ + if (l2nb <= L2BPERDMAP) { + /* get the buffer for the dmap. + */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dp = (struct dmap *) mp->data; + + /* try to allocate the blocks. + */ + rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results); + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + + return (rc); + } + + /* allocation request involving multiple dmaps. it must start on + * a dmap boundary. + */ + assert((blkno & (BPERDMAP - 1)) == 0); + + /* allocate the blocks dmap by dmap. + */ + for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) { + /* get the buffer for the dmap. + */ + lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + rc = -EIO; + goto backout; + } + dp = (struct dmap *) mp->data; + + /* the dmap better be all free. + */ + if (dp->tree.stree[ROOT] != L2BPERDMAP) { + release_metapage(mp); + jfs_error(bmp->db_ipbmap->i_sb, + "the dmap is not all free\n"); + rc = -EIO; + goto backout; + } + + /* determine how many blocks to allocate from this dmap. + */ + nb = min_t(s64, n, BPERDMAP); + + /* allocate the blocks from the dmap. + */ + if ((rc = dbAllocDmap(bmp, dp, b, nb))) { + release_metapage(mp); + goto backout; + } + + /* write the buffer. + */ + write_metapage(mp); + } + + /* set the results (starting block number) and return. + */ + *results = blkno; + return (0); + + /* something failed in handling an allocation request involving + * multiple dmaps. we'll try to clean up by backing out any + * allocation that has already happened for this request. if + * we fail in backing out the allocation, we'll mark the file + * system to indicate that blocks have been leaked. + */ + backout: + + /* try to backout the allocations dmap by dmap. + */ + for (n = nblocks - n, b = blkno; n > 0; + n -= BPERDMAP, b += BPERDMAP) { + /* get the buffer for this dmap. + */ + lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + /* could not back out. mark the file system + * to indicate that we have leaked blocks. + */ + jfs_error(bmp->db_ipbmap->i_sb, + "I/O Error: Block Leakage\n"); + continue; + } + dp = (struct dmap *) mp->data; + + /* free the blocks is this dmap. + */ + if (dbFreeDmap(bmp, dp, b, BPERDMAP)) { + /* could not back out. mark the file system + * to indicate that we have leaked blocks. + */ + release_metapage(mp); + jfs_error(bmp->db_ipbmap->i_sb, "Block Leakage\n"); + continue; + } + + /* write the buffer. + */ + write_metapage(mp); + } + + return (rc); +} + + +/* + * NAME: dbAllocDmapLev() + * + * FUNCTION: attempt to allocate a specified number of contiguous blocks + * from a specified dmap. + * + * this routine checks if the contiguous blocks are available. + * if so, nblocks of blocks are allocated; otherwise, ENOSPC is + * returned. + * + * PARAMETERS: + * mp - pointer to bmap descriptor + * dp - pointer to dmap to attempt to allocate blocks from. + * l2nb - log2 number of contiguous block desired. + * nblocks - actual number of contiguous block desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient disk resources + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or + * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; + */ +static int +dbAllocDmapLev(struct bmap * bmp, + struct dmap * dp, int nblocks, int l2nb, s64 * results) +{ + s64 blkno; + int leafidx, rc; + + /* can't be more than a dmaps worth of blocks */ + assert(l2nb <= L2BPERDMAP); + + /* search the tree within the dmap page for sufficient + * free space. if sufficient free space is found, dbFindLeaf() + * returns the index of the leaf at which free space was found. + */ + if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx)) + return -ENOSPC; + + /* determine the block number within the file system corresponding + * to the leaf at which free space was found. + */ + blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD); + + /* if not all bits of the dmap word are free, get the starting + * bit number within the dmap word of the required string of free + * bits and adjust the block number with this value. + */ + if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN) + blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb); + + /* allocate the blocks */ + if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) + *results = blkno; + + return (rc); +} + + +/* + * NAME: dbAllocDmap() + * + * FUNCTION: adjust the disk allocation map to reflect the allocation + * of a specified block range within a dmap. + * + * this routine allocates the specified blocks from the dmap + * through a call to dbAllocBits(). if the allocation of the + * block range causes the maximum string of free blocks within + * the dmap to change (i.e. the value of the root of the dmap's + * dmtree), this routine will cause this change to be reflected + * up through the appropriate levels of the dmap control pages + * by a call to dbAdjCtl() for the L0 dmap control page that + * covers this dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate the block range from. + * blkno - starting block number of the block to be allocated. + * nblocks - number of blocks to be allocated. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + s8 oldroot; + int rc; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = dp->tree.stree[ROOT]; + + /* allocate the specified (blocks) bits */ + dbAllocBits(bmp, dp, blkno, nblocks); + + /* if the root has not changed, done. */ + if (dp->tree.stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the bit allocation (thus making everything consistent). + */ + if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0))) + dbFreeBits(bmp, dp, blkno, nblocks); + + return (rc); +} + + +/* + * NAME: dbFreeDmap() + * + * FUNCTION: adjust the disk allocation map to reflect the allocation + * of a specified block range within a dmap. + * + * this routine frees the specified blocks from the dmap through + * a call to dbFreeBits(). if the deallocation of the block range + * causes the maximum string of free blocks within the dmap to + * change (i.e. the value of the root of the dmap's dmtree), this + * routine will cause this change to be reflected up through the + * appropriate levels of the dmap control pages by a call to + * dbAdjCtl() for the L0 dmap control page that covers this dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free the block range from. + * blkno - starting block number of the block to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + s8 oldroot; + int rc = 0, word; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = dp->tree.stree[ROOT]; + + /* free the specified (blocks) bits */ + rc = dbFreeBits(bmp, dp, blkno, nblocks); + + /* if error or the root has not changed, done. */ + if (rc || (dp->tree.stree[ROOT] == oldroot)) + return (rc); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the deallocation. + */ + if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) { + word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; + + /* as part of backing out the deallocation, we will have + * to back split the dmap tree if the deallocation caused + * the freed blocks to become part of a larger binary buddy + * system. + */ + if (dp->tree.stree[word] == NOFREE) + dbBackSplit((dmtree_t *) & dp->tree, word); + + dbAllocBits(bmp, dp, blkno, nblocks); + } + + return (rc); +} + + +/* + * NAME: dbAllocBits() + * + * FUNCTION: allocate a specified block range from a dmap. + * + * this routine updates the dmap to reflect the working + * state allocation of the specified block range. it directly + * updates the bits of the working map and causes the adjustment + * of the binary buddy system described by the dmap's dmtree + * leaves to reflect the bits allocated. it also causes the + * dmap's dmtree, as a whole, to reflect the allocated range. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate bits from. + * blkno - starting block number of the bits to be allocated. + * nblocks - number of bits to be allocated. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; + dmtree_t *tp = (dmtree_t *) & dp->tree; + int size; + s8 *leaf; + + /* pick up a pointer to the leaves of the dmap tree */ + leaf = dp->tree.stree + LEAFIND; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap */ + assert(dbitno + nblocks <= BPERDMAP); + + /* allocate the bits of the dmap's words corresponding to the block + * range. not all bits of the first and last words may be contained + * within the block range. if this is the case, we'll work against + * those words (i.e. partial first and/or last) on an individual basis + * (a single pass), allocating the bits of interest by hand and + * updating the leaf corresponding to the dmap word. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the bits of all fully contained + * dmap words will be marked as free in a single shot and the leaves + * will be updated. a single leaf may describe the free space of + * multiple dmap words, so we may update only a subset of the actual + * leaves corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be allocated. + */ + if (nb < DBWORD) { + /* allocate (set to 1) the appropriate bits within + * this dmap word. + */ + dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) + >> wbitno); + + /* update the leaf for this dmap word. in addition + * to setting the leaf value to the binary buddy max + * of the updated dmap word, dbSplit() will split + * the binary system of the leaves if need be. + */ + dbSplit(tp, word, BUDMIN, + dbMaxBud((u8 *) & dp->wmap[word])); + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and allocate (set to 1) the bits of these + * words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], (int) ONES, nwords * 4); + + /* determine how many bits. + */ + nb = nwords << L2DBWORD; + + /* now update the appropriate leaves to reflect + * the allocated words. + */ + for (; nwords > 0; nwords -= nw) { + if (leaf[word] < BUDMIN) { + jfs_error(bmp->db_ipbmap->i_sb, + "leaf page corrupt\n"); + break; + } + + /* determine what the leaf value should be + * updated to as the minimum of the l2 number + * of bits being allocated and the l2 number + * of bits currently described by this leaf. + */ + size = min_t(int, leaf[word], + NLSTOL2BSZ(nwords)); + + /* update the leaf to reflect the allocation. + * in addition to setting the leaf value to + * NOFREE, dbSplit() will split the binary + * system of the leaves to reflect the current + * allocation (size). + */ + dbSplit(tp, word, size, NOFREE); + + /* get the number of dmap words handled */ + nw = BUDSIZE(size, BUDMIN); + word += nw; + } + } + } + + /* update the free count for this dmap */ + le32_add_cpu(&dp->nfree, -nblocks); + + BMAP_LOCK(bmp); + + /* if this allocation group is completely free, + * update the maximum allocation group number if this allocation + * group is the new max. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) + bmp->db_maxag = agno; + + /* update the free count for the allocation group and map */ + bmp->db_agfree[agno] -= nblocks; + bmp->db_nfree -= nblocks; + + BMAP_UNLOCK(bmp); +} + + +/* + * NAME: dbFreeBits() + * + * FUNCTION: free a specified block range from a dmap. + * + * this routine updates the dmap to reflect the working + * state allocation of the specified block range. it directly + * updates the bits of the working map and causes the adjustment + * of the binary buddy system described by the dmap's dmtree + * leaves to reflect the bits freed. it also causes the dmap's + * dmtree, as a whole, to reflect the deallocated range. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free bits from. + * blkno - starting block number of the bits to be freed. + * nblocks - number of bits to be freed. + * + * RETURN VALUES: 0 for success + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; + dmtree_t *tp = (dmtree_t *) & dp->tree; + int rc = 0; + int size; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap. + */ + assert(dbitno + nblocks <= BPERDMAP); + + /* free the bits of the dmaps words corresponding to the block range. + * not all bits of the first and last words may be contained within + * the block range. if this is the case, we'll work against those + * words (i.e. partial first and/or last) on an individual basis + * (a single pass), freeing the bits of interest by hand and updating + * the leaf corresponding to the dmap word. a single pass will be used + * for all dmap words fully contained within the specified range. + * within this pass, the bits of all fully contained dmap words will + * be marked as free in a single shot and the leaves will be updated. a + * single leaf may describe the free space of multiple dmap words, + * so we may update only a subset of the actual leaves corresponding + * to the dmap words of the block range. + * + * dbJoin() is used to update leaf values and will join the binary + * buddy system of the leaves if the new leaf values indicate this + * should be done. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be freed. + */ + if (nb < DBWORD) { + /* free (zero) the appropriate bits within this + * dmap word. + */ + dp->wmap[word] &= + cpu_to_le32(~(ONES << (DBWORD - nb) + >> wbitno)); + + /* update the leaf for this dmap word. + */ + rc = dbJoin(tp, word, + dbMaxBud((u8 *) & dp->wmap[word])); + if (rc) + return rc; + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and free (zero) the bits of these words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], 0, nwords * 4); + + /* determine how many bits. + */ + nb = nwords << L2DBWORD; + + /* now update the appropriate leaves to reflect + * the freed words. + */ + for (; nwords > 0; nwords -= nw) { + /* determine what the leaf value should be + * updated to as the minimum of the l2 number + * of bits being freed and the l2 (max) number + * of bits that can be described by this leaf. + */ + size = + min(LITOL2BSZ + (word, L2LPERDMAP, BUDMIN), + NLSTOL2BSZ(nwords)); + + /* update the leaf. + */ + rc = dbJoin(tp, word, size); + if (rc) + return rc; + + /* get the number of dmap words handled. + */ + nw = BUDSIZE(size, BUDMIN); + word += nw; + } + } + } + + /* update the free count for this dmap. + */ + le32_add_cpu(&dp->nfree, nblocks); + + BMAP_LOCK(bmp); + + /* update the free count for the allocation group and + * map. + */ + agno = blkno >> bmp->db_agl2size; + bmp->db_nfree += nblocks; + bmp->db_agfree[agno] += nblocks; + + /* check if this allocation group is not completely free and + * if it is currently the maximum (rightmost) allocation group. + * if so, establish the new maximum allocation group number by + * searching left for the first allocation group with allocation. + */ + if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) || + (agno == bmp->db_numag - 1 && + bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) { + while (bmp->db_maxag > 0) { + bmp->db_maxag -= 1; + if (bmp->db_agfree[bmp->db_maxag] != + bmp->db_agsize) + break; + } + + /* re-establish the allocation group preference if the + * current preference is right of the maximum allocation + * group. + */ + if (bmp->db_agpref > bmp->db_maxag) + bmp->db_agpref = bmp->db_maxag; + } + + BMAP_UNLOCK(bmp); + + return 0; +} + + +/* + * NAME: dbAdjCtl() + * + * FUNCTION: adjust a dmap control page at a specified level to reflect + * the change in a lower level dmap or dmap control page's + * maximum string of free blocks (i.e. a change in the root + * of the lower level object's dmtree) due to the allocation + * or deallocation of a range of blocks with a single dmap. + * + * on entry, this routine is provided with the new value of + * the lower level dmap or dmap control page root and the + * starting block number of the block range whose allocation + * or deallocation resulted in the root change. this range + * is respresented by a single leaf of the current dmapctl + * and the leaf will be updated with this value, possibly + * causing a binary buddy system within the leaves to be + * split or joined. the update may also cause the dmapctl's + * dmtree to be updated. + * + * if the adjustment of the dmap control page, itself, causes its + * root to change, this change will be bubbled up to the next dmap + * control level by a recursive call to this routine, specifying + * the new root value and the next dmap control page level to + * be adjusted. + * PARAMETERS: + * bmp - pointer to bmap descriptor + * blkno - the first block of a block range within a dmap. it is + * the allocation or deallocation of this block range that + * requires the dmap control page to be adjusted. + * newval - the new value of the lower level dmap or dmap control + * page root. + * alloc - 'true' if adjustment is due to an allocation. + * level - current level of dmap control page (i.e. L0, L1, L2) to + * be adjusted. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level) +{ + struct metapage *mp; + s8 oldroot; + int oldval; + s64 lblkno; + struct dmapctl *dcp; + int rc, leafno, ti; + + /* get the buffer for the dmap control page for the specified + * block number and control page level. + */ + lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return -EIO; + dcp = (struct dmapctl *) mp->data; + + if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) { + jfs_error(bmp->db_ipbmap->i_sb, "Corrupt dmapctl page\n"); + release_metapage(mp); + return -EIO; + } + + /* determine the leaf number corresponding to the block and + * the index within the dmap control tree. + */ + leafno = BLKTOCTLLEAF(blkno, dcp->budmin); + ti = leafno + le32_to_cpu(dcp->leafidx); + + /* save the current leaf value and the current root level (i.e. + * maximum l2 free string described by this dmapctl). + */ + oldval = dcp->stree[ti]; + oldroot = dcp->stree[ROOT]; + + /* check if this is a control page update for an allocation. + * if so, update the leaf to reflect the new leaf value using + * dbSplit(); otherwise (deallocation), use dbJoin() to update + * the leaf with the new value. in addition to updating the + * leaf, dbSplit() will also split the binary buddy system of + * the leaves, if required, and bubble new values within the + * dmapctl tree, if required. similarly, dbJoin() will join + * the binary buddy system of leaves and bubble new values up + * the dmapctl tree as required by the new leaf value. + */ + if (alloc) { + /* check if we are in the middle of a binary buddy + * system. this happens when we are performing the + * first allocation out of an allocation group that + * is part (not the first part) of a larger binary + * buddy system. if we are in the middle, back split + * the system prior to calling dbSplit() which assumes + * that it is at the front of a binary buddy system. + */ + if (oldval == NOFREE) { + rc = dbBackSplit((dmtree_t *) dcp, leafno); + if (rc) + return rc; + oldval = dcp->stree[ti]; + } + dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); + } else { + rc = dbJoin((dmtree_t *) dcp, leafno, newval); + if (rc) + return rc; + } + + /* check if the root of the current dmap control page changed due + * to the update and if the current dmap control page is not at + * the current top level (i.e. L0, L1, L2) of the map. if so (i.e. + * root changed and this is not the top level), call this routine + * again (recursion) for the next higher level of the mapping to + * reflect the change in root for the current dmap control page. + */ + if (dcp->stree[ROOT] != oldroot) { + /* are we below the top level of the map. if so, + * bubble the root up to the next higher level. + */ + if (level < bmp->db_maxlevel) { + /* bubble up the new root of this dmap control page to + * the next level. + */ + if ((rc = + dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc, + level + 1))) { + /* something went wrong in bubbling up the new + * root value, so backout the changes to the + * current dmap control page. + */ + if (alloc) { + dbJoin((dmtree_t *) dcp, leafno, + oldval); + } else { + /* the dbJoin() above might have + * caused a larger binary buddy system + * to form and we may now be in the + * middle of it. if this is the case, + * back split the buddies. + */ + if (dcp->stree[ti] == NOFREE) + dbBackSplit((dmtree_t *) + dcp, leafno); + dbSplit((dmtree_t *) dcp, leafno, + dcp->budmin, oldval); + } + + /* release the buffer and return the error. + */ + release_metapage(mp); + return (rc); + } + } else { + /* we're at the top level of the map. update + * the bmap control page to reflect the size + * of the maximum free buddy system. + */ + assert(level == bmp->db_maxlevel); + if (bmp->db_maxfreebud != oldroot) { + jfs_error(bmp->db_ipbmap->i_sb, + "the maximum free buddy is not the old root\n"); + } + bmp->db_maxfreebud = dcp->stree[ROOT]; + } + } + + /* write the buffer. + */ + write_metapage(mp); + + return (0); +} + + +/* + * NAME: dbSplit() + * + * FUNCTION: update the leaf of a dmtree with a new value, splitting + * the leaf from the binary buddy system of the dmtree's + * leaves, as required. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * splitsz - the size the binary buddy system starting at the leaf + * must be split to, specified as the log2 number of blocks. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) +{ + int budsz; + int cursz; + s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* check if the leaf needs to be split. + */ + if (leaf[leafno] > tp->dmt_budmin) { + /* the split occurs by cutting the buddy system in half + * at the specified leaf until we reach the specified + * size. pick up the starting split size (current size + * - 1 in l2) and the corresponding buddy size. + */ + cursz = leaf[leafno] - 1; + budsz = BUDSIZE(cursz, tp->dmt_budmin); + + /* split until we reach the specified size. + */ + while (cursz >= splitsz) { + /* update the buddy's leaf with its new value. + */ + dbAdjTree(tp, leafno ^ budsz, cursz); + + /* on to the next size and buddy. + */ + cursz -= 1; + budsz >>= 1; + } + } + + /* adjust the dmap tree to reflect the specified leaf's new + * value. + */ + dbAdjTree(tp, leafno, newval); +} + + +/* + * NAME: dbBackSplit() + * + * FUNCTION: back split the binary buddy system of dmtree leaves + * that hold a specified leaf until the specified leaf + * starts its own binary buddy system. + * + * the allocators typically perform allocations at the start + * of binary buddy systems and dbSplit() is used to accomplish + * any required splits. in some cases, however, allocation + * may occur in the middle of a binary system and requires a + * back split, with the split proceeding out from the middle of + * the system (less efficient) rather than the start of the + * system (more efficient). the cases in which a back split + * is required are rare and are limited to the first allocation + * within an allocation group which is a part (not first part) + * of a larger binary buddy system and a few exception cases + * in which a previous join operation must be backed out. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbBackSplit(dmtree_t * tp, int leafno) +{ + int budsz, bud, w, bsz, size; + int cursz; + s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* leaf should be part (not first part) of a binary + * buddy system. + */ + assert(leaf[leafno] == NOFREE); + + /* the back split is accomplished by iteratively finding the leaf + * that starts the buddy system that contains the specified leaf and + * splitting that system in two. this iteration continues until + * the specified leaf becomes the start of a buddy system. + * + * determine maximum possible l2 size for the specified leaf. + */ + size = + LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs), + tp->dmt_budmin); + + /* determine the number of leaves covered by this size. this + * is the buddy size that we will start with as we search for + * the buddy system that contains the specified leaf. + */ + budsz = BUDSIZE(size, tp->dmt_budmin); + + /* back split. + */ + while (leaf[leafno] == NOFREE) { + /* find the leftmost buddy leaf. + */ + for (w = leafno, bsz = budsz;; bsz <<= 1, + w = (w < bud) ? w : bud) { + if (bsz >= le32_to_cpu(tp->dmt_nleafs)) { + jfs_err("JFS: block map error in dbBackSplit"); + return -EIO; + } + + /* determine the buddy. + */ + bud = w ^ bsz; + + /* check if this buddy is the start of the system. + */ + if (leaf[bud] != NOFREE) { + /* split the leaf at the start of the + * system in two. + */ + cursz = leaf[bud] - 1; + dbSplit(tp, bud, cursz, cursz); + break; + } + } + } + + if (leaf[leafno] != size) { + jfs_err("JFS: wrong leaf value in dbBackSplit"); + return -EIO; + } + return 0; +} + + +/* + * NAME: dbJoin() + * + * FUNCTION: update the leaf of a dmtree with a new value, joining + * the leaf with other leaves of the dmtree into a multi-leaf + * binary buddy system, as required. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + */ +static int dbJoin(dmtree_t * tp, int leafno, int newval) +{ + int budsz, buddy; + s8 *leaf; + + /* can the new leaf value require a join with other leaves ? + */ + if (newval >= tp->dmt_budmin) { + /* pickup a pointer to the leaves of the tree. + */ + leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* try to join the specified leaf into a large binary + * buddy system. the join proceeds by attempting to join + * the specified leafno with its buddy (leaf) at new value. + * if the join occurs, we attempt to join the left leaf + * of the joined buddies with its buddy at new value + 1. + * we continue to join until we find a buddy that cannot be + * joined (does not have a value equal to the size of the + * last join) or until all leaves have been joined into a + * single system. + * + * get the buddy size (number of words covered) of + * the new value. + */ + budsz = BUDSIZE(newval, tp->dmt_budmin); + + /* try to join. + */ + while (budsz < le32_to_cpu(tp->dmt_nleafs)) { + /* get the buddy leaf. + */ + buddy = leafno ^ budsz; + + /* if the leaf's new value is greater than its + * buddy's value, we join no more. + */ + if (newval > leaf[buddy]) + break; + + /* It shouldn't be less */ + if (newval < leaf[buddy]) + return -EIO; + + /* check which (leafno or buddy) is the left buddy. + * the left buddy gets to claim the blocks resulting + * from the join while the right gets to claim none. + * the left buddy is also eligible to participate in + * a join at the next higher level while the right + * is not. + * + */ + if (leafno < buddy) { + /* leafno is the left buddy. + */ + dbAdjTree(tp, buddy, NOFREE); + } else { + /* buddy is the left buddy and becomes + * leafno. + */ + dbAdjTree(tp, leafno, NOFREE); + leafno = buddy; + } + + /* on to try the next join. + */ + newval += 1; + budsz <<= 1; + } + } + + /* update the leaf value. + */ + dbAdjTree(tp, leafno, newval); + + return 0; +} + + +/* + * NAME: dbAdjTree() + * + * FUNCTION: update a leaf of a dmtree with a new value, adjusting + * the dmtree, as required, to reflect the new leaf value. + * the combination of any buddies must already be done before + * this is called. + * + * PARAMETERS: + * tp - pointer to the tree to be adjusted. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + */ +static void dbAdjTree(dmtree_t * tp, int leafno, int newval) +{ + int lp, pp, k; + int max; + + /* pick up the index of the leaf for this leafno. + */ + lp = leafno + le32_to_cpu(tp->dmt_leafidx); + + /* is the current value the same as the old value ? if so, + * there is nothing to do. + */ + if (tp->dmt_stree[lp] == newval) + return; + + /* set the new value. + */ + tp->dmt_stree[lp] = newval; + + /* bubble the new value up the tree as required. + */ + for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) { + /* get the index of the first leaf of the 4 leaf + * group containing the specified leaf (leafno). + */ + lp = ((lp - 1) & ~0x03) + 1; + + /* get the index of the parent of this 4 leaf group. + */ + pp = (lp - 1) >> 2; + + /* determine the maximum of the 4 leaves. + */ + max = TREEMAX(&tp->dmt_stree[lp]); + + /* if the maximum of the 4 is the same as the + * parent's value, we're done. + */ + if (tp->dmt_stree[pp] == max) + break; + + /* parent gets new value. + */ + tp->dmt_stree[pp] = max; + + /* parent becomes leaf for next go-round. + */ + lp = pp; + } +} + + +/* + * NAME: dbFindLeaf() + * + * FUNCTION: search a dmtree_t for sufficient free blocks, returning + * the index of a leaf describing the free blocks if + * sufficient free blocks are found. + * + * the search starts at the top of the dmtree_t tree and + * proceeds down the tree to the leftmost leaf with sufficient + * free space. + * + * PARAMETERS: + * tp - pointer to the tree to be searched. + * l2nb - log2 number of free blocks to search for. + * leafidx - return pointer to be set to the index of the leaf + * describing at least l2nb free blocks if sufficient + * free blocks are found. + * + * RETURN VALUES: + * 0 - success + * -ENOSPC - insufficient free blocks. + */ +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) +{ + int ti, n = 0, k, x = 0; + + /* first check the root of the tree to see if there is + * sufficient free space. + */ + if (l2nb > tp->dmt_stree[ROOT]) + return -ENOSPC; + + /* sufficient free space available. now search down the tree + * starting at the next level for the leftmost leaf that + * describes sufficient free space. + */ + for (k = le32_to_cpu(tp->dmt_height), ti = 1; + k > 0; k--, ti = ((ti + n) << 2) + 1) { + /* search the four nodes at this level, starting from + * the left. + */ + for (x = ti, n = 0; n < 4; n++) { + /* sufficient free space found. move to the next + * level (or quit if this is the last level). + */ + if (l2nb <= tp->dmt_stree[x + n]) + break; + } + + /* better have found something since the higher + * levels of the tree said it was here. + */ + assert(n < 4); + } + + /* set the return to the leftmost leaf describing sufficient + * free space. + */ + *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx); + + return (0); +} + + +/* + * NAME: dbFindBits() + * + * FUNCTION: find a specified number of binary buddy free bits within a + * dmap bitmap word value. + * + * this routine searches the bitmap value for (1 << l2nb) free + * bits at (1 << l2nb) alignments within the value. + * + * PARAMETERS: + * word - dmap bitmap word value. + * l2nb - number of free bits specified as a log2 number. + * + * RETURN VALUES: + * starting bit number of free bits. + */ +static int dbFindBits(u32 word, int l2nb) +{ + int bitno, nb; + u32 mask; + + /* get the number of bits. + */ + nb = 1 << l2nb; + assert(nb <= DBWORD); + + /* complement the word so we can use a mask (i.e. 0s represent + * free bits) and compute the mask. + */ + word = ~word; + mask = ONES << (DBWORD - nb); + + /* scan the word for nb free bits at nb alignments. + */ + for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) { + if ((mask & word) == mask) + break; + } + + ASSERT(bitno < 32); + + /* return the bit number. + */ + return (bitno); +} + + +/* + * NAME: dbMaxBud(u8 *cp) + * + * FUNCTION: determine the largest binary buddy string of free + * bits within 32-bits of the map. + * + * PARAMETERS: + * cp - pointer to the 32-bit value. + * + * RETURN VALUES: + * largest binary buddy of free bits within a dmap word. + */ +static int dbMaxBud(u8 * cp) +{ + signed char tmp1, tmp2; + + /* check if the wmap word is all free. if so, the + * free buddy size is BUDMIN. + */ + if (*((uint *) cp) == 0) + return (BUDMIN); + + /* check if the wmap word is half free. if so, the + * free buddy size is BUDMIN-1. + */ + if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0) + return (BUDMIN - 1); + + /* not all free or half free. determine the free buddy + * size thru table lookup using quarters of the wmap word. + */ + tmp1 = max(budtab[cp[2]], budtab[cp[3]]); + tmp2 = max(budtab[cp[0]], budtab[cp[1]]); + return (max(tmp1, tmp2)); +} + + +/* + * NAME: cnttz(uint word) + * + * FUNCTION: determine the number of trailing zeros within a 32-bit + * value. + * + * PARAMETERS: + * value - 32-bit value to be examined. + * + * RETURN VALUES: + * count of trailing zeros + */ +static int cnttz(u32 word) +{ + int n; + + for (n = 0; n < 32; n++, word >>= 1) { + if (word & 0x01) + break; + } + + return (n); +} + + +/* + * NAME: cntlz(u32 value) + * + * FUNCTION: determine the number of leading zeros within a 32-bit + * value. + * + * PARAMETERS: + * value - 32-bit value to be examined. + * + * RETURN VALUES: + * count of leading zeros + */ +static int cntlz(u32 value) +{ + int n; + + for (n = 0; n < 32; n++, value <<= 1) { + if (value & HIGHORDER) + break; + } + return (n); +} + + +/* + * NAME: blkstol2(s64 nb) + * + * FUNCTION: convert a block count to its log2 value. if the block + * count is not a l2 multiple, it is rounded up to the next + * larger l2 multiple. + * + * PARAMETERS: + * nb - number of blocks + * + * RETURN VALUES: + * log2 number of blocks + */ +static int blkstol2(s64 nb) +{ + int l2nb; + s64 mask; /* meant to be signed */ + + mask = (s64) 1 << (64 - 1); + + /* count the leading bits. + */ + for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) { + /* leading bit found. + */ + if (nb & mask) { + /* determine the l2 value. + */ + l2nb = (64 - 1) - l2nb; + + /* check if we need to round up. + */ + if (~mask & nb) + l2nb++; + + return (l2nb); + } + } + assert(0); + return 0; /* fix compiler warning */ +} + + +/* + * NAME: dbAllocBottomUp() + * + * FUNCTION: alloc the specified block range from the working block + * allocation map. + * + * the blocks will be alloc from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error + */ +int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) +{ + struct metapage *mp; + struct dmap *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap; + + IREAD_LOCK(ipbmap, RDWRLOCK_DMAP); + + /* block to be allocated better be within the mapsize. */ + ASSERT(nblocks <= bmp->db_mapsize - blkno); + + /* + * allocate the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return -EIO; + } + dp = (struct dmap *) mp->data; + + /* determine the number of blocks to be allocated from + * this dmap. + */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + /* allocate the blocks. */ + if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) { + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + } + + /* write the last buffer. */ + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno, + int nblocks) +{ + int rc; + int dbitno, word, rembits, nb, nwords, wbitno, agno; + s8 oldroot; + struct dmaptree *tp = (struct dmaptree *) & dp->tree; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = tp->stree[ROOT]; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap */ + assert(dbitno + nblocks <= BPERDMAP); + + /* allocate the bits of the dmap's words corresponding to the block + * range. not all bits of the first and last words may be contained + * within the block range. if this is the case, we'll work against + * those words (i.e. partial first and/or last) on an individual basis + * (a single pass), allocating the bits of interest by hand and + * updating the leaf corresponding to the dmap word. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the bits of all fully contained + * dmap words will be marked as free in a single shot and the leaves + * will be updated. a single leaf may describe the free space of + * multiple dmap words, so we may update only a subset of the actual + * leaves corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be allocated. + */ + if (nb < DBWORD) { + /* allocate (set to 1) the appropriate bits within + * this dmap word. + */ + dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) + >> wbitno); + + word++; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and allocate (set to 1) the bits of these + * words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], (int) ONES, nwords * 4); + + /* determine how many bits */ + nb = nwords << L2DBWORD; + word += nwords; + } + } + + /* update the free count for this dmap */ + le32_add_cpu(&dp->nfree, -nblocks); + + /* reconstruct summary tree */ + dbInitDmapTree(dp); + + BMAP_LOCK(bmp); + + /* if this allocation group is completely free, + * update the highest active allocation group number + * if this allocation group is the new max. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) + bmp->db_maxag = agno; + + /* update the free count for the allocation group and map */ + bmp->db_agfree[agno] -= nblocks; + bmp->db_nfree -= nblocks; + + BMAP_UNLOCK(bmp); + + /* if the root has not changed, done. */ + if (tp->stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the bit allocation (thus making everything consistent). + */ + if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0))) + dbFreeBits(bmp, dp, blkno, nblocks); + + return (rc); +} + + +/* + * NAME: dbExtendFS() + * + * FUNCTION: extend bmap from blkno for nblocks; + * dbExtendFS() updates bmap ready for dbAllocBottomUp(); + * + * L2 + * | + * L1---------------------------------L1 + * | | + * L0---------L0---------L0 L0---------L0---------L0 + * | | | | | | + * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; + * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm + * + * <---old---><----------------------------extend-----------------------> + */ +int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb); + int nbperpage = sbi->nbperpage; + int i, i0 = true, j, j0 = true, k, n; + s64 newsize; + s64 p; + struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL; + struct dmapctl *l2dcp, *l1dcp, *l0dcp; + struct dmap *dp; + s8 *l0leaf, *l1leaf, *l2leaf; + struct bmap *bmp = sbi->bmap; + int agno, l2agsize, oldl2agsize; + s64 ag_rem; + + newsize = blkno + nblocks; + + jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld", + (long long) blkno, (long long) nblocks, (long long) newsize); + + /* + * initialize bmap control page. + * + * all the data in bmap control page should exclude + * the mkfs hidden dmap page. + */ + + /* update mapsize */ + bmp->db_mapsize = newsize; + bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize); + + /* compute new AG size */ + l2agsize = dbGetL2AGSize(newsize); + oldl2agsize = bmp->db_agl2size; + + bmp->db_agl2size = l2agsize; + bmp->db_agsize = 1 << l2agsize; + + /* compute new number of AG */ + agno = bmp->db_numag; + bmp->db_numag = newsize >> l2agsize; + bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; + + /* + * reconfigure db_agfree[] + * from old AG configuration to new AG configuration; + * + * coalesce contiguous k (newAGSize/oldAGSize) AGs; + * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; + * note: new AG size = old AG size * (2**x). + */ + if (l2agsize == oldl2agsize) + goto extend; + k = 1 << (l2agsize - oldl2agsize); + ag_rem = bmp->db_agfree[0]; /* save agfree[0] */ + for (i = 0, n = 0; i < agno; n++) { + bmp->db_agfree[n] = 0; /* init collection point */ + + /* coalesce contiguous k AGs; */ + for (j = 0; j < k && i < agno; j++, i++) { + /* merge AGi to AGn */ + bmp->db_agfree[n] += bmp->db_agfree[i]; + } + } + bmp->db_agfree[0] += ag_rem; /* restore agfree[0] */ + + for (; n < MAXAG; n++) + bmp->db_agfree[n] = 0; + + /* + * update highest active ag number + */ + + bmp->db_maxag = bmp->db_maxag / k; + + /* + * extend bmap + * + * update bit maps and corresponding level control pages; + * global control page db_nfree, db_agfree[agno], db_maxfreebud; + */ + extend: + /* get L2 page */ + p = BMAPBLKNO + nbperpage; /* L2 page */ + l2mp = read_metapage(ipbmap, p, PSIZE, 0); + if (!l2mp) { + jfs_error(ipbmap->i_sb, "L2 page could not be read\n"); + return -EIO; + } + l2dcp = (struct dmapctl *) l2mp->data; + + /* compute start L1 */ + k = blkno >> L2MAXL1SIZE; + l2leaf = l2dcp->stree + CTLLEAFIND + k; + p = BLKTOL1(blkno, sbi->l2nbperpage); /* L1 page */ + + /* + * extend each L1 in L2 + */ + for (; k < LPERCTL; k++, p += nbperpage) { + /* get L1 page */ + if (j0) { + /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */ + l1mp = read_metapage(ipbmap, p, PSIZE, 0); + if (l1mp == NULL) + goto errout; + l1dcp = (struct dmapctl *) l1mp->data; + + /* compute start L0 */ + j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE; + l1leaf = l1dcp->stree + CTLLEAFIND + j; + p = BLKTOL0(blkno, sbi->l2nbperpage); + j0 = false; + } else { + /* assign/init L1 page */ + l1mp = get_metapage(ipbmap, p, PSIZE, 0); + if (l1mp == NULL) + goto errout; + + l1dcp = (struct dmapctl *) l1mp->data; + + /* compute start L0 */ + j = 0; + l1leaf = l1dcp->stree + CTLLEAFIND; + p += nbperpage; /* 1st L0 of L1.k */ + } + + /* + * extend each L0 in L1 + */ + for (; j < LPERCTL; j++) { + /* get L0 page */ + if (i0) { + /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */ + + l0mp = read_metapage(ipbmap, p, PSIZE, 0); + if (l0mp == NULL) + goto errout; + l0dcp = (struct dmapctl *) l0mp->data; + + /* compute start dmap */ + i = (blkno & (MAXL0SIZE - 1)) >> + L2BPERDMAP; + l0leaf = l0dcp->stree + CTLLEAFIND + i; + p = BLKTODMAP(blkno, + sbi->l2nbperpage); + i0 = false; + } else { + /* assign/init L0 page */ + l0mp = get_metapage(ipbmap, p, PSIZE, 0); + if (l0mp == NULL) + goto errout; + + l0dcp = (struct dmapctl *) l0mp->data; + + /* compute start dmap */ + i = 0; + l0leaf = l0dcp->stree + CTLLEAFIND; + p += nbperpage; /* 1st dmap of L0.j */ + } + + /* + * extend each dmap in L0 + */ + for (; i < LPERCTL; i++) { + /* + * reconstruct the dmap page, and + * initialize corresponding parent L0 leaf + */ + if ((n = blkno & (BPERDMAP - 1))) { + /* read in dmap page: */ + mp = read_metapage(ipbmap, p, + PSIZE, 0); + if (mp == NULL) + goto errout; + n = min(nblocks, (s64)BPERDMAP - n); + } else { + /* assign/init dmap page */ + mp = read_metapage(ipbmap, p, + PSIZE, 0); + if (mp == NULL) + goto errout; + + n = min_t(s64, nblocks, BPERDMAP); + } + + dp = (struct dmap *) mp->data; + *l0leaf = dbInitDmap(dp, blkno, n); + + bmp->db_nfree += n; + agno = le64_to_cpu(dp->start) >> l2agsize; + bmp->db_agfree[agno] += n; + + write_metapage(mp); + + l0leaf++; + p += nbperpage; + + blkno += n; + nblocks -= n; + if (nblocks == 0) + break; + } /* for each dmap in a L0 */ + + /* + * build current L0 page from its leaves, and + * initialize corresponding parent L1 leaf + */ + *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i); + write_metapage(l0mp); + l0mp = NULL; + + if (nblocks) + l1leaf++; /* continue for next L0 */ + else { + /* more than 1 L0 ? */ + if (j > 0) + break; /* build L1 page */ + else { + /* summarize in global bmap page */ + bmp->db_maxfreebud = *l1leaf; + release_metapage(l1mp); + release_metapage(l2mp); + goto finalize; + } + } + } /* for each L0 in a L1 */ + + /* + * build current L1 page from its leaves, and + * initialize corresponding parent L2 leaf + */ + *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j); + write_metapage(l1mp); + l1mp = NULL; + + if (nblocks) + l2leaf++; /* continue for next L1 */ + else { + /* more than 1 L1 ? */ + if (k > 0) + break; /* build L2 page */ + else { + /* summarize in global bmap page */ + bmp->db_maxfreebud = *l2leaf; + release_metapage(l2mp); + goto finalize; + } + } + } /* for each L1 in a L2 */ + + jfs_error(ipbmap->i_sb, "function has not returned as expected\n"); +errout: + if (l0mp) + release_metapage(l0mp); + if (l1mp) + release_metapage(l1mp); + release_metapage(l2mp); + return -EIO; + + /* + * finalize bmap control page + */ +finalize: + + return 0; +} + + +/* + * dbFinalizeBmap() + */ +void dbFinalizeBmap(struct inode *ipbmap) +{ + struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + int actags, inactags, l2nl; + s64 ag_rem, actfree, inactfree, avgfree; + int i, n; + + /* + * finalize bmap control page + */ +//finalize: + /* + * compute db_agpref: preferred ag to allocate from + * (the leftmost ag with average free space in it); + */ +//agpref: + /* get the number of active ags and inacitve ags */ + actags = bmp->db_maxag + 1; + inactags = bmp->db_numag - actags; + ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */ + + /* determine how many blocks are in the inactive allocation + * groups. in doing this, we must account for the fact that + * the rightmost group might be a partial group (i.e. file + * system size is not a multiple of the group size). + */ + inactfree = (inactags && ag_rem) ? + ((inactags - 1) << bmp->db_agl2size) + ag_rem + : inactags << bmp->db_agl2size; + + /* determine how many free blocks are in the active + * allocation groups plus the average number of free blocks + * within the active ags. + */ + actfree = bmp->db_nfree - inactfree; + avgfree = (u32) actfree / (u32) actags; + + /* if the preferred allocation group has not average free space. + * re-establish the preferred group as the leftmost + * group with average free space. + */ + if (bmp->db_agfree[bmp->db_agpref] < avgfree) { + for (bmp->db_agpref = 0; bmp->db_agpref < actags; + bmp->db_agpref++) { + if (bmp->db_agfree[bmp->db_agpref] >= avgfree) + break; + } + if (bmp->db_agpref >= bmp->db_numag) { + jfs_error(ipbmap->i_sb, + "cannot find ag with average freespace\n"); + } + } + + /* + * compute db_aglevel, db_agheight, db_width, db_agstart: + * an ag is covered in aglevel dmapctl summary tree, + * at agheight level height (from leaf) with agwidth number of nodes + * each, which starts at agstart index node of the smmary tree node + * array; + */ + bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); + l2nl = + bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); + bmp->db_agheight = l2nl >> 1; + bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1)); + for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0; + i--) { + bmp->db_agstart += n; + n <<= 2; + } + +} + + +/* + * NAME: dbInitDmap()/ujfs_idmap_page() + * + * FUNCTION: initialize working/persistent bitmap of the dmap page + * for the specified number of blocks: + * + * at entry, the bitmaps had been initialized as free (ZEROS); + * The number of blocks will only account for the actually + * existing blocks. Blocks which don't actually exist in + * the aggregate will be marked as allocated (ONES); + * + * PARAMETERS: + * dp - pointer to page of map + * nblocks - number of blocks this page + * + * RETURNS: NONE + */ +static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) +{ + int blkno, w, b, r, nw, nb, i; + + /* starting block number within the dmap */ + blkno = Blkno & (BPERDMAP - 1); + + if (blkno == 0) { + dp->nblocks = dp->nfree = cpu_to_le32(nblocks); + dp->start = cpu_to_le64(Blkno); + + if (nblocks == BPERDMAP) { + memset(&dp->wmap[0], 0, LPERDMAP * 4); + memset(&dp->pmap[0], 0, LPERDMAP * 4); + goto initTree; + } + } else { + le32_add_cpu(&dp->nblocks, nblocks); + le32_add_cpu(&dp->nfree, nblocks); + } + + /* word number containing start block number */ + w = blkno >> L2DBWORD; + + /* + * free the bits corresponding to the block range (ZEROS): + * note: not all bits of the first and last words may be contained + * within the block range. + */ + for (r = nblocks; r > 0; r -= nb, blkno += nb) { + /* number of bits preceding range to be freed in the word */ + b = blkno & (DBWORD - 1); + /* number of bits to free in the word */ + nb = min(r, DBWORD - b); + + /* is partial word to be freed ? */ + if (nb < DBWORD) { + /* free (set to 0) from the bitmap word */ + dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) + >> b)); + dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) + >> b)); + + /* skip the word freed */ + w++; + } else { + /* free (set to 0) contiguous bitmap words */ + nw = r >> L2DBWORD; + memset(&dp->wmap[w], 0, nw * 4); + memset(&dp->pmap[w], 0, nw * 4); + + /* skip the words freed */ + nb = nw << L2DBWORD; + w += nw; + } + } + + /* + * mark bits following the range to be freed (non-existing + * blocks) as allocated (ONES) + */ + + if (blkno == BPERDMAP) + goto initTree; + + /* the first word beyond the end of existing blocks */ + w = blkno >> L2DBWORD; + + /* does nblocks fall on a 32-bit boundary ? */ + b = blkno & (DBWORD - 1); + if (b) { + /* mark a partial word allocated */ + dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b); + w++; + } + + /* set the rest of the words in the page to allocated (ONES) */ + for (i = w; i < LPERDMAP; i++) + dp->pmap[i] = dp->wmap[i] = cpu_to_le32(ONES); + + /* + * init tree + */ + initTree: + return (dbInitDmapTree(dp)); +} + + +/* + * NAME: dbInitDmapTree()/ujfs_complete_dmap() + * + * FUNCTION: initialize summary tree of the specified dmap: + * + * at entry, bitmap of the dmap has been initialized; + * + * PARAMETERS: + * dp - dmap to complete + * blkno - starting block number for this dmap + * treemax - will be filled in with max free for this dmap + * + * RETURNS: max free string at the root of the tree + */ +static int dbInitDmapTree(struct dmap * dp) +{ + struct dmaptree *tp; + s8 *cp; + int i; + + /* init fixed info of tree */ + tp = &dp->tree; + tp->nleafs = cpu_to_le32(LPERDMAP); + tp->l2nleafs = cpu_to_le32(L2LPERDMAP); + tp->leafidx = cpu_to_le32(LEAFIND); + tp->height = cpu_to_le32(4); + tp->budmin = BUDMIN; + + /* init each leaf from corresponding wmap word: + * note: leaf is set to NOFREE(-1) if all blocks of corresponding + * bitmap word are allocated. + */ + cp = tp->stree + le32_to_cpu(tp->leafidx); + for (i = 0; i < LPERDMAP; i++) + *cp++ = dbMaxBud((u8 *) & dp->wmap[i]); + + /* build the dmap's binary buddy summary tree */ + return (dbInitTree(tp)); +} + + +/* + * NAME: dbInitTree()/ujfs_adjtree() + * + * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl. + * + * at entry, the leaves of the tree has been initialized + * from corresponding bitmap word or root of summary tree + * of the child control page; + * configure binary buddy system at the leaf level, then + * bubble up the values of the leaf nodes up the tree. + * + * PARAMETERS: + * cp - Pointer to the root of the tree + * l2leaves- Number of leaf nodes as a power of 2 + * l2min - Number of blocks that can be covered by a leaf + * as a power of 2 + * + * RETURNS: max free string at the root of the tree + */ +static int dbInitTree(struct dmaptree * dtp) +{ + int l2max, l2free, bsize, nextb, i; + int child, parent, nparent; + s8 *tp, *cp, *cp1; + + tp = dtp->stree; + + /* Determine the maximum free string possible for the leaves */ + l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin; + + /* + * configure the leaf levevl into binary buddy system + * + * Try to combine buddies starting with a buddy size of 1 + * (i.e. two leaves). At a buddy size of 1 two buddy leaves + * can be combined if both buddies have a maximum free of l2min; + * the combination will result in the left-most buddy leaf having + * a maximum free of l2min+1. + * After processing all buddies for a given size, process buddies + * at the next higher buddy size (i.e. current size * 2) and + * the next maximum free (current free + 1). + * This continues until the maximum possible buddy combination + * yields maximum free. + */ + for (l2free = dtp->budmin, bsize = 1; l2free < l2max; + l2free++, bsize = nextb) { + /* get next buddy size == current buddy pair size */ + nextb = bsize << 1; + + /* scan each adjacent buddy pair at current buddy size */ + for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx); + i < le32_to_cpu(dtp->nleafs); + i += nextb, cp += nextb) { + /* coalesce if both adjacent buddies are max free */ + if (*cp == l2free && *(cp + bsize) == l2free) { + *cp = l2free + 1; /* left take right */ + *(cp + bsize) = -1; /* right give left */ + } + } + } + + /* + * bubble summary information of leaves up the tree. + * + * Starting at the leaf node level, the four nodes described by + * the higher level parent node are compared for a maximum free and + * this maximum becomes the value of the parent node. + * when all lower level nodes are processed in this fashion then + * move up to the next level (parent becomes a lower level node) and + * continue the process for that level. + */ + for (child = le32_to_cpu(dtp->leafidx), + nparent = le32_to_cpu(dtp->nleafs) >> 2; + nparent > 0; nparent >>= 2, child = parent) { + /* get index of 1st node of parent level */ + parent = (child - 1) >> 2; + + /* set the value of the parent node as the maximum + * of the four nodes of the current level. + */ + for (i = 0, cp = tp + child, cp1 = tp + parent; + i < nparent; i++, cp += 4, cp1++) + *cp1 = TREEMAX(cp); + } + + return (*tp); +} + + +/* + * dbInitDmapCtl() + * + * function: initialize dmapctl page + */ +static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i) +{ /* start leaf index not covered by range */ + s8 *cp; + + dcp->nleafs = cpu_to_le32(LPERCTL); + dcp->l2nleafs = cpu_to_le32(L2LPERCTL); + dcp->leafidx = cpu_to_le32(CTLLEAFIND); + dcp->height = cpu_to_le32(5); + dcp->budmin = L2BPERDMAP + L2LPERCTL * level; + + /* + * initialize the leaves of current level that were not covered + * by the specified input block range (i.e. the leaves have no + * low level dmapctl or dmap). + */ + cp = &dcp->stree[CTLLEAFIND + i]; + for (; i < LPERCTL; i++) + *cp++ = NOFREE; + + /* build the dmap's binary buddy summary tree */ + return (dbInitTree((struct dmaptree *) dcp)); +} + + +/* + * NAME: dbGetL2AGSize()/ujfs_getagl2size() + * + * FUNCTION: Determine log2(allocation group size) from aggregate size + * + * PARAMETERS: + * nblocks - Number of blocks in aggregate + * + * RETURNS: log2(allocation group size) in aggregate blocks + */ +static int dbGetL2AGSize(s64 nblocks) +{ + s64 sz; + s64 m; + int l2sz; + + if (nblocks < BPERDMAP * MAXAG) + return (L2BPERDMAP); + + /* round up aggregate size to power of 2 */ + m = ((u64) 1 << (64 - 1)); + for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) { + if (m & nblocks) + break; + } + + sz = (s64) 1 << l2sz; + if (sz < nblocks) + l2sz += 1; + + /* agsize = roundupSize/max_number_of_ag */ + return (l2sz - L2MAXAG); +} + + +/* + * NAME: dbMapFileSizeToMapSize() + * + * FUNCTION: compute number of blocks the block allocation map file + * can cover from the map file size; + * + * RETURNS: Number of blocks which can be covered by this block map file; + */ + +/* + * maximum number of map pages at each level including control pages + */ +#define MAXL0PAGES (1 + LPERCTL) +#define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES) +#define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES) + +/* + * convert number of map pages to the zero origin top dmapctl level + */ +#define BMAPPGTOLEV(npages) \ + (((npages) <= 3 + MAXL0PAGES) ? 0 : \ + ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) + +s64 dbMapFileSizeToMapSize(struct inode * ipbmap) +{ + struct super_block *sb = ipbmap->i_sb; + s64 nblocks; + s64 npages, ndmaps; + int level, i; + int complete, factor; + + nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize; + npages = nblocks >> JFS_SBI(sb)->l2nbperpage; + level = BMAPPGTOLEV(npages); + + /* At each level, accumulate the number of dmap pages covered by + * the number of full child levels below it; + * repeat for the last incomplete child level. + */ + ndmaps = 0; + npages--; /* skip the first global control page */ + /* skip higher level control pages above top level covered by map */ + npages -= (2 - level); + npages--; /* skip top level's control page */ + for (i = level; i >= 0; i--) { + factor = + (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); + complete = (u32) npages / factor; + ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL : + ((i == 1) ? LPERCTL : 1)); + + /* pages in last/incomplete child */ + npages = (u32) npages % factor; + /* skip incomplete child's level control page */ + npages--; + } + + /* convert the number of dmaps into the number of blocks + * which can be covered by the dmaps; + */ + nblocks = ndmaps << L2BPERDMAP; + + return (nblocks); +} diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h new file mode 100644 index 000000000..f502a15c6 --- /dev/null +++ b/fs/jfs/jfs_dmap.h @@ -0,0 +1,316 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DMAP +#define _H_JFS_DMAP + +#include "jfs_txnmgr.h" + +#define BMAPVERSION 1 /* version number */ +#define TREESIZE (256+64+16+4+1) /* size of a dmap tree */ +#define LEAFIND (64+16+4+1) /* index of 1st leaf of a dmap tree */ +#define LPERDMAP 256 /* num leaves per dmap tree */ +#define L2LPERDMAP 8 /* l2 number of leaves per dmap tree */ +#define DBWORD 32 /* # of blks covered by a map word */ +#define L2DBWORD 5 /* l2 # of blks covered by a mword */ +#define BUDMIN L2DBWORD /* max free string in a map word */ +#define BPERDMAP (LPERDMAP * DBWORD) /* num of blks per dmap */ +#define L2BPERDMAP 13 /* l2 num of blks per dmap */ +#define CTLTREESIZE (1024+256+64+16+4+1) /* size of a dmapctl tree */ +#define CTLLEAFIND (256+64+16+4+1) /* idx of 1st leaf of a dmapctl tree */ +#define LPERCTL 1024 /* num of leaves per dmapctl tree */ +#define L2LPERCTL 10 /* l2 num of leaves per dmapctl tree */ +#define ROOT 0 /* index of the root of a tree */ +#define NOFREE ((s8) -1) /* no blocks free */ +#define MAXAG 128 /* max number of allocation groups */ +#define L2MAXAG 7 /* l2 max num of AG */ +#define L2MINAGSZ 25 /* l2 of minimum AG size in bytes */ +#define BMAPBLKNO 0 /* lblkno of bmap within the map */ + +/* + * maximum l2 number of disk blocks at the various dmapctl levels. + */ +#define L2MAXL0SIZE (L2BPERDMAP + 1 * L2LPERCTL) +#define L2MAXL1SIZE (L2BPERDMAP + 2 * L2LPERCTL) +#define L2MAXL2SIZE (L2BPERDMAP + 3 * L2LPERCTL) + +/* + * maximum number of disk blocks at the various dmapctl levels. + */ +#define MAXL0SIZE ((s64)1 << L2MAXL0SIZE) +#define MAXL1SIZE ((s64)1 << L2MAXL1SIZE) +#define MAXL2SIZE ((s64)1 << L2MAXL2SIZE) + +#define MAXMAPSIZE MAXL2SIZE /* maximum aggregate map size */ + +/* + * determine the maximum free string for four (lower level) nodes + * of the tree. + */ +static inline signed char TREEMAX(signed char *cp) +{ + signed char tmp1, tmp2; + + tmp1 = max(*(cp+2), *(cp+3)); + tmp2 = max(*(cp), *(cp+1)); + + return max(tmp1, tmp2); +} + +/* + * convert disk block number to the logical block number of the dmap + * describing the disk block. s is the log2(number of logical blocks per page) + * + * The calculation figures out how many logical pages are in front of the dmap. + * - the number of dmaps preceding it + * - the number of L0 pages preceding its L0 page + * - the number of L1 pages preceding its L1 page + * - 3 is added to account for the L2, L1, and L0 page for this dmap + * - 1 is added to account for the control page of the map. + */ +#define BLKTODMAP(b,s) \ + ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the LEVEL 0 + * dmapctl describing the disk block. s is the log2(number of logical blocks + * per page) + * + * The calculation figures out how many logical pages are in front of the L0. + * - the number of dmap pages preceding it + * - the number of L0 pages preceding it + * - the number of L1 pages preceding its L1 page + * - 2 is added to account for the L2, and L1 page for this L0 + * - 1 is added to account for the control page of the map. + */ +#define BLKTOL0(b,s) \ + (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the LEVEL 1 + * dmapctl describing the disk block. s is the log2(number of logical blocks + * per page) + * + * The calculation figures out how many logical pages are in front of the L1. + * - the number of dmap pages preceding it + * - the number of L0 pages preceding it + * - the number of L1 pages preceding it + * - 1 is added to account for the L2 page + * - 1 is added to account for the control page of the map. + */ +#define BLKTOL1(b,s) \ + (((((b) >> 33) << 20) + (((b) >> 33) << 10) + ((b) >> 33) + 1 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the dmapctl + * at the specified level which describes the disk block. + */ +#define BLKTOCTL(b,s,l) \ + (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) + +/* + * convert aggregate map size to the zero origin dmapctl level of the + * top dmapctl. + */ +#define BMAPSZTOLEV(size) \ + (((size) <= MAXL0SIZE) ? 0 : ((size) <= MAXL1SIZE) ? 1 : 2) + +/* convert disk block number to allocation group number. + */ +#define BLKTOAG(b,sbi) ((b) >> ((sbi)->bmap->db_agl2size)) + +/* convert allocation group number to starting disk block + * number. + */ +#define AGTOBLK(a,ip) \ + ((s64)(a) << (JFS_SBI((ip)->i_sb)->bmap->db_agl2size)) + +/* + * dmap summary tree + * + * dmaptree must be consistent with dmapctl. + */ +struct dmaptree { + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of first tree leaf */ + __le32 height; /* 4: height of the tree */ + s8 budmin; /* 1: min l2 tree leaf value to combine */ + s8 stree[TREESIZE]; /* TREESIZE: tree */ + u8 pad[2]; /* 2: pad to word boundary */ +}; /* - 360 - */ + +/* + * dmap page per 8K blocks bitmap + */ +struct dmap { + __le32 nblocks; /* 4: num blks covered by this dmap */ + __le32 nfree; /* 4: num of free blks in this dmap */ + __le64 start; /* 8: starting blkno for this dmap */ + struct dmaptree tree; /* 360: dmap tree */ + u8 pad[1672]; /* 1672: pad to 2048 bytes */ + __le32 wmap[LPERDMAP]; /* 1024: bits of the working map */ + __le32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ +}; /* - 4096 - */ + +/* + * disk map control page per level. + * + * dmapctl must be consistent with dmaptree. + */ +struct dmapctl { + __le32 nleafs; /* 4: number of tree leafs */ + __le32 l2nleafs; /* 4: l2 number of tree leafs */ + __le32 leafidx; /* 4: index of the first tree leaf */ + __le32 height; /* 4: height of tree */ + s8 budmin; /* 1: minimum l2 tree leaf value */ + s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ + u8 pad[2714]; /* 2714: pad to 4096 */ +}; /* - 4096 - */ + +/* + * common definition for dmaptree within dmap and dmapctl + */ +typedef union dmtree { + struct dmaptree t1; + struct dmapctl t2; +} dmtree_t; + +/* macros for accessing fields within dmtree */ +#define dmt_nleafs t1.nleafs +#define dmt_l2nleafs t1.l2nleafs +#define dmt_leafidx t1.leafidx +#define dmt_height t1.height +#define dmt_budmin t1.budmin +#define dmt_stree t2.stree + +/* + * on-disk aggregate disk allocation map descriptor. + */ +struct dbmap_disk { + __le64 dn_mapsize; /* 8: number of blocks in aggregate */ + __le64 dn_nfree; /* 8: num free blks in aggregate map */ + __le32 dn_l2nbperpage; /* 4: number of blks per page */ + __le32 dn_numag; /* 4: total number of ags */ + __le32 dn_maxlevel; /* 4: number of active ags */ + __le32 dn_maxag; /* 4: max active alloc group number */ + __le32 dn_agpref; /* 4: preferred alloc group (hint) */ + __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ + __le32 dn_agheight; /* 4: height in dmapctl of the AG */ + __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ + __le32 dn_agstart; /* 4: start tree index at AG height */ + __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ + __le64 dn_agfree[MAXAG];/* 8*MAXAG: per AG free count */ + __le64 dn_agsize; /* 8: num of blks per alloc group */ + s8 dn_maxfreebud; /* 1: max free buddy system */ + u8 pad[3007]; /* 3007: pad to 4096 */ +}; /* - 4096 - */ + +struct dbmap { + s64 dn_mapsize; /* number of blocks in aggregate */ + s64 dn_nfree; /* num free blks in aggregate map */ + int dn_l2nbperpage; /* number of blks per page */ + int dn_numag; /* total number of ags */ + int dn_maxlevel; /* number of active ags */ + int dn_maxag; /* max active alloc group number */ + int dn_agpref; /* preferred alloc group (hint) */ + int dn_aglevel; /* dmapctl level holding the AG */ + int dn_agheight; /* height in dmapctl of the AG */ + int dn_agwidth; /* width in dmapctl of the AG */ + int dn_agstart; /* start tree index at AG height */ + int dn_agl2size; /* l2 num of blks per alloc group */ + s64 dn_agfree[MAXAG]; /* per AG free count */ + s64 dn_agsize; /* num of blks per alloc group */ + signed char dn_maxfreebud; /* max free buddy system */ +}; /* - 4096 - */ +/* + * in-memory aggregate disk allocation map descriptor. + */ +struct bmap { + struct dbmap db_bmap; /* on-disk aggregate map descriptor */ + struct inode *db_ipbmap; /* ptr to aggregate map incore inode */ + struct mutex db_bmaplock; /* aggregate map lock */ + atomic_t db_active[MAXAG]; /* count of active, open files in AG */ + u32 *db_DBmap; +}; + +/* macros for accessing fields within in-memory aggregate map descriptor */ +#define db_mapsize db_bmap.dn_mapsize +#define db_nfree db_bmap.dn_nfree +#define db_agfree db_bmap.dn_agfree +#define db_agsize db_bmap.dn_agsize +#define db_agl2size db_bmap.dn_agl2size +#define db_agwidth db_bmap.dn_agwidth +#define db_agheight db_bmap.dn_agheight +#define db_agstart db_bmap.dn_agstart +#define db_numag db_bmap.dn_numag +#define db_maxlevel db_bmap.dn_maxlevel +#define db_aglevel db_bmap.dn_aglevel +#define db_agpref db_bmap.dn_agpref +#define db_maxag db_bmap.dn_maxag +#define db_maxfreebud db_bmap.dn_maxfreebud +#define db_l2nbperpage db_bmap.dn_l2nbperpage + +/* + * macros for various conversions needed by the allocators. + * blkstol2(), cntlz(), and cnttz() are operating system dependent functions. + */ +/* convert number of blocks to log2 number of blocks, rounding up to + * the next log2 value if blocks is not a l2 multiple. + */ +#define BLKSTOL2(d) (blkstol2(d)) + +/* convert number of leafs to log2 leaf value */ +#define NLSTOL2BSZ(n) (31 - cntlz((n)) + BUDMIN) + +/* convert leaf index to log2 leaf value */ +#define LITOL2BSZ(n,m,b) ((((n) == 0) ? (m) : cnttz((n))) + (b)) + +/* convert a block number to a dmap control leaf index */ +#define BLKTOCTLLEAF(b,m) \ + (((b) & (((s64)1 << ((m) + L2LPERCTL)) - 1)) >> (m)) + +/* convert log2 leaf value to buddy size */ +#define BUDSIZE(s,m) (1 << ((s) - (m))) + +/* + * external references. + */ +extern int dbMount(struct inode *ipbmap); + +extern int dbUnmount(struct inode *ipbmap, int mounterror); + +extern int dbFree(struct inode *ipbmap, s64 blkno, s64 nblocks); + +extern int dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, struct tblock * tblk); + +extern int dbNextAG(struct inode *ipbmap); + +extern int dbAlloc(struct inode *ipbmap, s64 hint, s64 nblocks, s64 * results); + +extern int dbReAlloc(struct inode *ipbmap, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results); + +extern int dbSync(struct inode *ipbmap); +extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks); +extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks); +extern void dbFinalizeBmap(struct inode *ipbmap); +extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap); +extern s64 dbDiscardAG(struct inode *ip, int agno, s64 minlen); + +#endif /* _H_JFS_DMAP */ diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c new file mode 100644 index 000000000..52bae3f5c --- /dev/null +++ b/fs/jfs/jfs_dtree.c @@ -0,0 +1,4575 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_dtree.c: directory B+-tree manager + * + * B+-tree with variable length key directory: + * + * each directory page is structured as an array of 32-byte + * directory entry slots initialized as a freelist + * to avoid search/compaction of free space at insertion. + * when an entry is inserted, a number of slots are allocated + * from the freelist as required to store variable length data + * of the entry; when the entry is deleted, slots of the entry + * are returned to freelist. + * + * leaf entry stores full name as key and file serial number + * (aka inode number) as data. + * internal/router entry stores sufffix compressed name + * as key and simple extent descriptor as data. + * + * each directory page maintains a sorted entry index table + * which stores the start slot index of sorted entries + * to allow binary search on the table. + * + * directory starts as a root/leaf page in on-disk inode + * inline data area. + * when it becomes full, it starts a leaf of a external extent + * of length of 1 block. each time the first leaf becomes full, + * it is extended rather than split (its size is doubled), + * until its length becoms 4 KBytes, from then the extent is split + * with new 4 Kbyte extent when it becomes full + * to reduce external fragmentation of small directories. + * + * blah, blah, blah, for linear scan of directory in pieces by + * readdir(). + * + * + * case-insensitive directory file system + * + * names are stored in case-sensitive way in leaf entry. + * but stored, searched and compared in case-insensitive (uppercase) order + * (i.e., both search key and entry key are folded for search/compare): + * (note that case-sensitive order is BROKEN in storage, e.g., + * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad + * + * entries which folds to the same key makes up a equivalent class + * whose members are stored as contiguous cluster (may cross page boundary) + * but whose order is arbitrary and acts as duplicate, e.g., + * abc, Abc, aBc, abC) + * + * once match is found at leaf, requires scan forward/backward + * either for, in case-insensitive search, duplicate + * or for, in case-sensitive search, for exact match + * + * router entry must be created/stored in case-insensitive way + * in internal entry: + * (right most key of left page and left most key of right page + * are folded, and its suffix compression is propagated as router + * key in parent) + * (e.g., if split occurs <abc> and <aBd>, <ABD> trather than <aB> + * should be made the router key for the split) + * + * case-insensitive search: + * + * fold search key; + * + * case-insensitive search of B-tree: + * for internal entry, router key is already folded; + * for leaf entry, fold the entry key before comparison. + * + * if (leaf entry case-insensitive match found) + * if (next entry satisfies case-insensitive match) + * return EDUPLICATE; + * if (prev entry satisfies case-insensitive match) + * return EDUPLICATE; + * return match; + * else + * return no match; + * + * serialization: + * target directory inode lock is being held on entry/exit + * of all main directory service routines. + * + * log based recovery: + */ + +#include <linux/fs.h> +#include <linux/quotaops.h> +#include <linux/slab.h> +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* dtree split parameter */ +struct dtsplit { + struct metapage *mp; + s16 index; + s16 nslot; + struct component_name *key; + ddata_t *data; + struct pxdlist *pxdlist; +}; + +#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) + +/* get page buffer for specified block address */ +#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ +do { \ + BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot); \ + if (!(RC)) { \ + if (((P)->header.nextindex > \ + (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ + ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \ + BT_PUTPAGE(MP); \ + jfs_error((IP)->i_sb, \ + "DT_GETPAGE: dtree page corrupt\n"); \ + MP = NULL; \ + RC = -EIO; \ + } \ + } \ +} while (0) + +/* for consistency */ +#define DT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) + +/* + * forward references + */ +static int dtSplitUp(tid_t tid, struct inode *ip, + struct dtsplit * split, struct btstack * btstack); + +static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, + struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); + +static int dtExtendPage(tid_t tid, struct inode *ip, + struct dtsplit * split, struct btstack * btstack); + +static int dtSplitRoot(tid_t tid, struct inode *ip, + struct dtsplit * split, struct metapage ** rmpp); + +static int dtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, + dtpage_t * fp, struct btstack * btstack); + +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); + +static int dtReadFirst(struct inode *ip, struct btstack * btstack); + +static int dtReadNext(struct inode *ip, + loff_t * offset, struct btstack * btstack); + +static int dtCompare(struct component_name * key, dtpage_t * p, int si); + +static int ciCompare(struct component_name * key, dtpage_t * p, int si, + int flag); + +static void dtGetKey(dtpage_t * p, int i, struct component_name * key, + int flag); + +static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, struct component_name * key, int flag); + +static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, + ddata_t * data, struct dt_lock **); + +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, + int do_index); + +static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock); + +static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock); + +static void dtLinelockFreelist(dtpage_t * p, int m, struct dt_lock ** dtlock); + +#define ciToUpper(c) UniStrupr((c)->name) + +/* + * read_index_page() + * + * Reads a page of a directory's index table. + * Having metadata mapped into the directory inode's address space + * presents a multitude of problems. We avoid this by mapping to + * the absolute address space outside of the *_metapage routines + */ +static struct metapage *read_index_page(struct inode *inode, s64 blkno) +{ + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xaddr == 0)) + return NULL; + + return read_metapage(inode, xaddr, PSIZE, 1); +} + +/* + * get_index_page() + * + * Same as get_index_page(), but get's a new page without reading + */ +static struct metapage *get_index_page(struct inode *inode, s64 blkno) +{ + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + rc = xtLookup(inode, blkno, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xaddr == 0)) + return NULL; + + return get_metapage(inode, xaddr, PSIZE, 1); +} + +/* + * find_index() + * + * Returns dtree page containing directory table entry for specified + * index and pointer to its entry. + * + * mp must be released by caller. + */ +static struct dir_table_slot *find_index(struct inode *ip, u32 index, + struct metapage ** mp, s64 *lblock) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + s64 blkno; + s64 offset; + int page_offset; + struct dir_table_slot *slot; + static int maxWarnings = 10; + + if (index < 2) { + if (maxWarnings) { + jfs_warn("find_entry called with index = %d", index); + maxWarnings--; + } + return NULL; + } + + if (index >= jfs_ip->next_index) { + jfs_warn("find_entry called with index >= next_index"); + return NULL; + } + + if (jfs_dirtable_inline(ip)) { + /* + * Inline directory table + */ + *mp = NULL; + slot = &jfs_ip->i_dirtable[index - 2]; + } else { + offset = (index - 2) * sizeof(struct dir_table_slot); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << + JFS_SBI(ip->i_sb)->l2nbperpage; + + if (*mp && (*lblock != blkno)) { + release_metapage(*mp); + *mp = NULL; + } + if (!(*mp)) { + *lblock = blkno; + *mp = read_index_page(ip, blkno); + } + if (!(*mp)) { + jfs_err("free_index: error reading directory table"); + return NULL; + } + + slot = + (struct dir_table_slot *) ((char *) (*mp)->data + + page_offset); + } + return slot; +} + +static inline void lock_index(tid_t tid, struct inode *ip, struct metapage * mp, + u32 index) +{ + struct tlock *tlck; + struct linelock *llck; + struct lv *lv; + + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (struct linelock *) tlck->lock; + + if (llck->index >= llck->maxcnt) + llck = txLinelock(llck); + lv = &llck->lv[llck->index]; + + /* + * Linelock slot size is twice the size of directory table + * slot size. 512 entries per page. + */ + lv->offset = ((index - 2) & 511) >> 1; + lv->length = 1; + llck->index++; +} + +/* + * add_index() + * + * Adds an entry to the directory index table. This is used to provide + * each directory entry with a persistent index in which to resume + * directory traversals + */ +static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) +{ + struct super_block *sb = ip->i_sb; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + u64 blkno; + struct dir_table_slot *dirtab_slot; + u32 index; + struct linelock *llck; + struct lv *lv; + struct metapage *mp; + s64 offset; + uint page_offset; + struct tlock *tlck; + s64 xaddr; + + ASSERT(DO_INDEX(ip)); + + if (jfs_ip->next_index < 2) { + jfs_warn("add_index: next_index = %d. Resetting!", + jfs_ip->next_index); + jfs_ip->next_index = 2; + } + + index = jfs_ip->next_index++; + + if (index <= MAX_INLINE_DIRTABLE_ENTRY) { + /* + * i_size reflects size of index table, or 8 bytes per entry. + */ + ip->i_size = (loff_t) (index - 1) << 3; + + /* + * dir table fits inline within inode + */ + dirtab_slot = &jfs_ip->i_dirtable[index-2]; + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + set_cflag(COMMIT_Dirtable, ip); + + return index; + } + if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { + struct dir_table_slot temp_table[12]; + + /* + * It's time to move the inline table to an external + * page and begin to build the xtree + */ + if (dquot_alloc_block(ip, sbi->nbperpage)) + goto clean_up; + if (dbAlloc(ip, 0, sbi->nbperpage, &xaddr)) { + dquot_free_block(ip, sbi->nbperpage); + goto clean_up; + } + + /* + * Save the table, we're going to overwrite it with the + * xtree root + */ + memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); + + /* + * Initialize empty x-tree + */ + xtInitRoot(tid, ip); + + /* + * Add the first block to the xtree + */ + if (xtInsert(tid, ip, 0, 0, sbi->nbperpage, &xaddr, 0)) { + /* This really shouldn't fail */ + jfs_warn("add_index: xtInsert failed!"); + memcpy(&jfs_ip->i_dirtable, temp_table, + sizeof (temp_table)); + dbFree(ip, xaddr, sbi->nbperpage); + dquot_free_block(ip, sbi->nbperpage); + goto clean_up; + } + ip->i_size = PSIZE; + + mp = get_index_page(ip, 0); + if (!mp) { + jfs_err("add_index: get_metapage failed!"); + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + memcpy(&jfs_ip->i_dirtable, temp_table, + sizeof (temp_table)); + goto clean_up; + } + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (struct linelock *) & tlck->lock; + ASSERT(llck->index == 0); + lv = &llck->lv[0]; + + lv->offset = 0; + lv->length = 6; /* tlckDATA slot size is 16 bytes */ + llck->index++; + + memcpy(mp->data, temp_table, sizeof(temp_table)); + + mark_metapage_dirty(mp); + release_metapage(mp); + + /* + * Logging is now directed by xtree tlocks + */ + clear_cflag(COMMIT_Dirtable, ip); + } + + offset = (index - 2) * sizeof(struct dir_table_slot); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; + if (page_offset == 0) { + /* + * This will be the beginning of a new page + */ + xaddr = 0; + if (xtInsert(tid, ip, 0, blkno, sbi->nbperpage, &xaddr, 0)) { + jfs_warn("add_index: xtInsert failed!"); + goto clean_up; + } + ip->i_size += PSIZE; + + if ((mp = get_index_page(ip, blkno))) + memset(mp->data, 0, PSIZE); /* Just looks better */ + else + xtTruncate(tid, ip, offset, COMMIT_PWMAP); + } else + mp = read_index_page(ip, blkno); + + if (!mp) { + jfs_err("add_index: get/read_metapage failed!"); + goto clean_up; + } + + lock_index(tid, ip, mp, index); + + dirtab_slot = + (struct dir_table_slot *) ((char *) mp->data + page_offset); + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + mark_metapage_dirty(mp); + release_metapage(mp); + + return index; + + clean_up: + + jfs_ip->next_index--; + + return 0; +} + +/* + * free_index() + * + * Marks an entry to the directory index table as free. + */ +static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) +{ + struct dir_table_slot *dirtab_slot; + s64 lblock; + struct metapage *mp = NULL; + + dirtab_slot = find_index(ip, index, &mp, &lblock); + + if (!dirtab_slot) + return; + + dirtab_slot->flag = DIR_INDEX_FREE; + dirtab_slot->slot = dirtab_slot->addr1 = 0; + dirtab_slot->addr2 = cpu_to_le32(next); + + if (mp) { + lock_index(tid, ip, mp, index); + mark_metapage_dirty(mp); + release_metapage(mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * modify_index() + * + * Changes an entry in the directory index table + */ +static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, + int slot, struct metapage ** mp, s64 *lblock) +{ + struct dir_table_slot *dirtab_slot; + + dirtab_slot = find_index(ip, index, mp, lblock); + + if (!dirtab_slot) + return; + + DTSaddress(dirtab_slot, bn); + dirtab_slot->slot = slot; + + if (*mp) { + lock_index(tid, ip, *mp, index); + mark_metapage_dirty(*mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * read_index() + * + * reads a directory table slot + */ +static int read_index(struct inode *ip, u32 index, + struct dir_table_slot * dirtab_slot) +{ + s64 lblock; + struct metapage *mp = NULL; + struct dir_table_slot *slot; + + slot = find_index(ip, index, &mp, &lblock); + if (!slot) { + return -EIO; + } + + memcpy(dirtab_slot, slot, sizeof(struct dir_table_slot)); + + if (mp) + release_metapage(mp); + + return 0; +} + +/* + * dtSearch() + * + * function: + * Search for the entry with specified key + * + * parameter: + * + * return: 0 - search result on stack, leaf page pinned; + * errno - I/O error + */ +int dtSearch(struct inode *ip, struct component_name * key, ino_t * data, + struct btstack * btstack, int flag) +{ + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; + struct metapage *mp; + dtpage_t *p; + s8 *stbl; + int base, index, lim; + struct btframe *btsp; + pxd_t *pxd; + int psize = 288; /* initial in-line directory */ + ino_t inumber; + struct component_name ciKey; + struct super_block *sb = ip->i_sb; + + ciKey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), + GFP_NOFS); + if (!ciKey.name) { + rc = -ENOMEM; + goto dtSearch_Exit2; + } + + + /* uppercase search key for c-i directory */ + UniStrcpy(ciKey.name, key->name); + ciKey.namlen = key->namlen; + + /* only uppercase if case-insensitive support is on */ + if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { + ciToUpper(&ciKey); + } + BT_CLR(btstack); /* reset stack */ + + /* init level count for max pages to split */ + btstack->nsplit = 1; + + /* + * search down tree from root: + * + * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + goto dtSearch_Exit1; + + /* get sorted entry table of the page */ + stbl = DT_GETSTBL(p); + + /* + * binary search with search key K on the current page. + */ + for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { + index = base + (lim >> 1); + + if (p->header.flag & BT_LEAF) { + /* uppercase leaf name to compare */ + cmp = + ciCompare(&ciKey, p, stbl[index], + JFS_SBI(sb)->mntflag); + } else { + /* router key is in uppercase */ + + cmp = dtCompare(&ciKey, p, stbl[index]); + + + } + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + inumber = le32_to_cpu( + ((struct ldtentry *) & p->slot[stbl[index]])->inumber); + + /* + * search for JFS_LOOKUP + */ + if (flag == JFS_LOOKUP) { + *data = inumber; + rc = 0; + goto out; + } + + /* + * search for JFS_CREATE + */ + if (flag == JFS_CREATE) { + *data = inumber; + rc = -EEXIST; + goto out; + } + + /* + * search for JFS_REMOVE or JFS_RENAME + */ + if ((flag == JFS_REMOVE || + flag == JFS_RENAME) && + *data != inumber) { + rc = -ESTALE; + goto out; + } + + /* + * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME + */ + /* save search result */ + *data = inumber; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* search hit - internal page: + * descend/search its child page + */ + goto getChild; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or (maxindex + 1) index. + */ + /* + * search miss - leaf page + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + /* + * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME + */ + if (flag == JFS_LOOKUP || flag == JFS_REMOVE || + flag == JFS_RENAME) { + rc = -ENOENT; + goto out; + } + + /* + * search for JFS_CREATE|JFS_FINDDIR: + * + * save search result + */ + *data = 0; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* + * search miss - internal page + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + getChild: + /* update max. number of pages to split */ + if (BT_STACK_FULL(btstack)) { + /* Something's corrupted, mark filesystem dirty so + * chkdsk will fix it. + */ + jfs_error(sb, "stack overrun!\n"); + BT_STACK_DUMP(btstack); + rc = -EIO; + goto out; + } + btstack->nsplit++; + + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + pxd = (pxd_t *) & p->slot[stbl[index]]; + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + out: + DT_PUTPAGE(mp); + + dtSearch_Exit1: + + kfree(ciKey.name); + + dtSearch_Exit2: + + return rc; +} + + +/* + * dtInsert() + * + * function: insert an entry to directory tree + * + * parameter: + * + * return: 0 - success; + * errno - failure; + */ +int dtInsert(tid_t tid, struct inode *ip, + struct component_name * name, ino_t * fsn, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; /* meta-page buffer */ + dtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index; + struct dtsplit split; /* split information */ + ddata_t data; + struct dt_lock *dtlck; + int n; + struct tlock *tlck; + struct lv *lv; + + /* + * retrieve search result + * + * dtSearch() returns (leaf page pinned, index at which to insert). + * n.b. dtSearch() may return index of (maxindex + 1) of + * the full page. + */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* + * insert entry for new key + */ + if (DO_INDEX(ip)) { + if (JFS_IP(ip)->next_index == DIREND) { + DT_PUTPAGE(mp); + return -EMLINK; + } + n = NDTLEAF(name->namlen); + data.leaf.tid = tid; + data.leaf.ip = ip; + } else { + n = NDTLEAF_LEGACY(name->namlen); + data.leaf.ip = NULL; /* signifies legacy directory format */ + } + data.leaf.ino = *fsn; + + /* + * leaf page does not have enough room for new entry: + * + * extend/split the leaf page; + * + * dtSplitUp() will insert the entry and unpin the leaf page. + */ + if (n > p->header.freecnt) { + split.mp = mp; + split.index = index; + split.nslot = n; + split.key = name; + split.data = &data; + rc = dtSplitUp(tid, ip, &split, btstack); + return rc; + } + + /* + * leaf page does have enough room for new entry: + * + * insert the new data entry into the leaf page; + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + dtInsertEntry(p, index, name, &data, &dtlck); + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + n = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + n; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} + + +/* + * dtSplitUp() + * + * function: propagate insertion bottom up; + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * leaf page unpinned; + */ +static int dtSplitUp(tid_t tid, + struct inode *ip, struct dtsplit * split, struct btstack * btstack) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int rc = 0; + struct metapage *smp; + dtpage_t *sp; /* split page */ + struct metapage *rmp; + dtpage_t *rp; /* new right page split from sp */ + pxd_t rpxd; /* new right page extent descriptor */ + struct metapage *lmp; + dtpage_t *lp; /* left child page */ + int skip; /* index of entry of insertion */ + struct btframe *parent; /* parent page entry on traverse stack */ + s64 xaddr, nxaddr; + int xlen, xsize; + struct pxdlist pxdlist; + pxd_t *pxd; + struct component_name key = { 0, NULL }; + ddata_t *data = split->data; + int n; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int quota_allocation = 0; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + key.name = kmalloc_array(JFS_NAME_MAX + 2, sizeof(wchar_t), GFP_NOFS); + if (!key.name) { + DT_PUTPAGE(smp); + rc = -ENOMEM; + goto dtSplitUp_Exit; + } + + /* + * split leaf page + * + * The split routines insert the new entry, and + * acquire txLock as appropriate. + */ + /* + * split root leaf page: + */ + if (sp->header.flag & BT_ROOT) { + /* + * allocate a single extent child page + */ + xlen = 1; + n = sbi->bsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ + if (n <= split->nslot) + xlen++; + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) { + DT_PUTPAGE(smp); + goto freeKeyName; + } + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + split->pxdlist = &pxdlist; + rc = dtSplitRoot(tid, ip, split, &rmp); + + if (rc) + dbFree(ip, xaddr, xlen); + else + DT_PUTPAGE(rmp); + + DT_PUTPAGE(smp); + + if (!DO_INDEX(ip)) + ip->i_size = xlen << sbi->l2bsize; + + goto freeKeyName; + } + + /* + * extend first leaf page + * + * extend the 1st extent if less than buffer page size + * (dtExtendPage() reurns leaf page unpinned) + */ + pxd = &sp->header.self; + xlen = lengthPXD(pxd); + xsize = xlen << sbi->l2bsize; + if (xsize < PSIZE) { + xaddr = addressPXD(pxd); + n = xsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + if ((n + sp->header.freecnt) <= split->nslot) + n = xlen + (xlen << 1); + else + n = xlen; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, n); + if (rc) + goto extendOut; + quota_allocation += n; + + if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, + (s64) n, &nxaddr))) + goto extendOut; + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, nxaddr); + PXDlength(pxd, xlen + n); + split->pxdlist = &pxdlist; + if ((rc = dtExtendPage(tid, ip, split, btstack))) { + nxaddr = addressPXD(pxd); + if (xaddr != nxaddr) { + /* free relocated extent */ + xlen = lengthPXD(pxd); + dbFree(ip, nxaddr, (s64) xlen); + } else { + /* free extended delta */ + xlen = lengthPXD(pxd) - n; + xaddr = addressPXD(pxd) + xlen; + dbFree(ip, xaddr, (s64) n); + } + } else if (!DO_INDEX(ip)) + ip->i_size = lengthPXD(pxd) << sbi->l2bsize; + + + extendOut: + DT_PUTPAGE(smp); + goto freeKeyName; + } + + /* + * split leaf page <sp> into <sp> and a new right page <rp>. + * + * return <rp> pinned and its extent descriptor <rpxd> + */ + /* + * allocate new directory page extent and + * new index page(s) to cover page split(s) + * + * allocation hint: ? + */ + n = btstack->nsplit; + pxdlist.maxnpxd = pxdlist.npxd = 0; + xlen = sbi->nbperpage; + for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + pxdlist.maxnpxd++; + continue; + } + + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + split->pxdlist = &pxdlist; + if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + if (!DO_INDEX(ip)) + ip->i_size += PSIZE; + + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. + * + * There are a maximum of 4 pages pinned at any time: + * two children, left parent and right parent (when the parent splits). + * keep the child pages pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame <parent> */ + + /* keep current child pages (<lp>, <rp>) pinned */ + lmp = smp; + lp = sp; + + /* + * insert router entry in parent for new right child page <rp> + */ + /* get the parent page <sp> */ + DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + goto splitOut; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * compute the key for the router entry + * + * key suffix compression: + * for internal pages that have leaf pages as children, + * retain only what's needed to distinguish between + * the new entry and the entry on the page to its left. + * If the keys compare equal, retain the entire key. + * + * note that compression is performed only at computing + * router key at the lowest internal level. + * further compression of the key between pairs of higher + * level internal pages loses too much information and + * the search may fail. + * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} + * results in two adjacent parent entries (a)(xx). + * if split occurs between these two entries, and + * if compression is applied, the router key of parent entry + * of right page (x) will divert search for x into right + * subtree and miss x in the left subtree.) + * + * the entire key must be retained for the next-to-leftmost + * internal key at any level of the tree, or search may fail + * (e.g., ?) + */ + switch (rp->header.flag & BT_TYPE) { + case BT_LEAF: + /* + * compute the length of prefix for suffix compression + * between last entry of left page and first entry + * of right page + */ + if ((sp->header.flag & BT_ROOT && skip > 1) || + sp->header.prev != 0 || skip > 1) { + /* compute uppercase router prefix key */ + rc = ciGetLeafPrefixKey(lp, + lp->header.nextindex-1, + rp, 0, &key, + sbi->mntflag); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + DT_PUTPAGE(smp); + goto splitOut; + } + } else { + /* next to leftmost entry of + lowest internal level */ + + /* compute uppercase router key */ + dtGetKey(rp, 0, &key, sbi->mntflag); + key.name[key.namlen] = 0; + + if ((sbi->mntflag & JFS_OS2) == JFS_OS2) + ciToUpper(&key); + } + + n = NDTINTERNAL(key.namlen); + break; + + case BT_INTERNAL: + dtGetKey(rp, 0, &key, sbi->mntflag); + n = NDTINTERNAL(key.namlen); + break; + + default: + jfs_err("dtSplitUp(): UFO!"); + break; + } + + /* unpin left child page */ + DT_PUTPAGE(lmp); + + /* + * compute the data for the router entry + */ + data->xd = rpxd; /* child page xd */ + + /* + * parent page is full - split the parent page + */ + if (n > sp->header.freecnt) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->nslot = n; + split->key = &key; + /* split->data = data; */ + + /* unpin right child page */ + DT_PUTPAGE(rmp); + + /* The split routines insert the new entry, + * acquire txLock as appropriate. + * return <rp> pinned and its block number <rbn>. + */ + rc = (sp->header.flag & BT_ROOT) ? + dtSplitRoot(tid, ip, split, &rmp) : + dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); + if (rc) { + DT_PUTPAGE(smp); + goto splitOut; + } + + /* smp and rmp are pinned */ + } + /* + * parent page is not full - insert router entry in parent page + */ + else { + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the parent page + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root parent page */ + if (!(sp->header.flag & BT_ROOT)) { + lv++; + n = skip >> L2DTSLOTSIZE; + lv->offset = sp->header.stblindex + n; + lv->length = + ((sp->header.nextindex - + 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + dtInsertEntry(sp, skip, &key, data, &dtlck); + + /* exit propagate up */ + break; + } + } + + /* unpin current split and its right page */ + DT_PUTPAGE(smp); + DT_PUTPAGE(rmp); + + /* + * free remaining extents allocated for split + */ + splitOut: + n = pxdlist.npxd; + pxd = &pxdlist.pxd[n]; + for (; n < pxdlist.maxnpxd; n++, pxd++) + dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); + + freeKeyName: + kfree(key.name); + + /* Rollback quota allocation */ + if (rc && quota_allocation) + dquot_free_block(ip, quota_allocation); + + dtSplitUp_Exit: + + return rc; +} + + +/* + * dtSplitPage() + * + * function: Split a non-root page of a btree. + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return split and new page pinned; + */ +static int dtSplitPage(tid_t tid, struct inode *ip, struct dtsplit * split, + struct metapage ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) +{ + int rc = 0; + struct metapage *smp; + dtpage_t *sp; + struct metapage *rmp; + dtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + struct metapage *mp; + dtpage_t *p; + s64 nextbn; + struct pxdlist *pxdlist; + pxd_t *pxd; + int skip, nextindex, half, left, nxt, off, si; + struct ldtentry *ldtentry; + struct idtentry *idtentry; + u8 *stbl; + struct dtslot *f; + int fsi, stblsize; + int n; + struct dt_lock *sdtlck, *rdtlck; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *slv, *rlv, *lv; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + /* + * allocate the new right page for the split + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return -EIO; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + jfs_info("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); + + BT_MARK_DIRTY(rmp, ip); + /* + * acquire a transaction lock on the new right page + */ + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); + rdtlck = (struct dt_lock *) & tlck->lock; + + rp = (dtpage_t *) rmp->data; + *rpp = rp; + rp->header.self = *pxd; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the split page + * + * action: + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); + sdtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header of split page */ + ASSERT(sdtlck->index == 0); + slv = & sdtlck->lv[0]; + slv->offset = 0; + slv->length = 1; + sdtlck->index++; + + /* + * initialize/update sibling pointers between sp and rp + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + /* + * initialize new right page + */ + rp->header.flag = sp->header.flag; + + /* compute sorted entry table at start of extent data area */ + rp->header.nextindex = 0; + rp->header.stblindex = 1; + + n = PSIZE >> L2DTSLOTSIZE; + rp->header.maxslot = n; + stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ + + /* init freelist */ + fsi = rp->header.stblindex + stblsize; + rp->header.freelist = fsi; + rp->header.freecnt = rp->header.maxslot - fsi; + + /* + * sequential append at tail: append without split + * + * If splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. Adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * If we're wrong it's no big deal, we'll just do the split the right + * way next time. + * (It may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, + * but it's not. Be my guest.) + */ + if (nextbn == 0 && split->index == sp->header.nextindex) { + /* linelock header + stbl (first slot) of new page */ + rlv = & rdtlck->lv[rdtlck->index]; + rlv->offset = 0; + rlv->length = 2; + rdtlck->index++; + + /* + * initialize freelist of new right page + */ + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* insert entry at the first entry of the new right page */ + dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); + + goto out; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update prev pointer of previous right sibling page; + */ + if (nextbn != 0) { + DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) { + discard_metapage(rmp); + return rc; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header of previous right sibling page */ + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.prev = cpu_to_le64(rbn); + + DT_PUTPAGE(mp); + } + + /* + * split the data between the split and right pages. + */ + skip = split->index; + half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ + left = 0; + + /* + * compute fill factor for split pages + * + * <nxt> traces the next entry to move to rp + * <off> traces the next entry to stay in sp + */ + stbl = (u8 *) & sp->slot[sp->header.stblindex]; + nextindex = sp->header.nextindex; + for (nxt = off = 0; nxt < nextindex; ++off) { + if (off == skip) + /* check for fill factor with new entry size */ + n = split->nslot; + else { + si = stbl[nxt]; + switch (sp->header.flag & BT_TYPE) { + case BT_LEAF: + ldtentry = (struct ldtentry *) & sp->slot[si]; + if (DO_INDEX(ip)) + n = NDTLEAF(ldtentry->namlen); + else + n = NDTLEAF_LEGACY(ldtentry-> + namlen); + break; + + case BT_INTERNAL: + idtentry = (struct idtentry *) & sp->slot[si]; + n = NDTINTERNAL(idtentry->namlen); + break; + + default: + break; + } + + ++nxt; /* advance to next entry to move in sp */ + } + + left += n; + if (left >= half) + break; + } + + /* <nxt> poins to the 1st entry to move */ + + /* + * move entries to right page + * + * dtMoveEntry() initializes rp and reserves entry for insertion + * + * split page moved out entries are linelocked; + * new/right page moved in entries are linelocked; + */ + /* linelock header + stbl of new right page */ + rlv = & rdtlck->lv[rdtlck->index]; + rlv->offset = 0; + rlv->length = 5; + rdtlck->index++; + + dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); + + sp->header.nextindex = nxt; + + /* + * finalize freelist of new right page + */ + fsi = rp->header.freelist; + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* + * Update directory index table for entries now in right page + */ + if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { + s64 lblock; + + mp = NULL; + stbl = DT_GETSTBL(rp); + for (n = 0; n < rp->header.nextindex; n++) { + ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; + modify_index(tid, ip, le32_to_cpu(ldtentry->index), + rbn, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + + /* + * the skipped index was on the left page, + */ + if (skip <= off) { + /* insert the new entry in the split page */ + dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); + + /* linelock stbl of split page */ + if (sdtlck->index >= sdtlck->maxcnt) + sdtlck = (struct dt_lock *) txLinelock(sdtlck); + slv = & sdtlck->lv[sdtlck->index]; + n = skip >> L2DTSLOTSIZE; + slv->offset = sp->header.stblindex + n; + slv->length = + ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + sdtlck->index++; + } + /* + * the skipped index was on the right page, + */ + else { + /* adjust the skip index to reflect the new position */ + skip -= nxt; + + /* insert the new entry in the right page */ + dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); + } + + out: + *rmpp = rmp; + *rpxdp = *pxd; + + return rc; +} + + +/* + * dtExtendPage() + * + * function: extend 1st/only directory leaf page + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return extended page pinned; + */ +static int dtExtendPage(tid_t tid, + struct inode *ip, struct dtsplit * split, struct btstack * btstack) +{ + struct super_block *sb = ip->i_sb; + int rc; + struct metapage *smp, *pmp, *mp; + dtpage_t *sp, *pp; + struct pxdlist *pxdlist; + pxd_t *pxd, *tpxd; + int xlen, xsize; + int newstblindex, newstblsize; + int oldstblindex, oldstblsize; + int fsi, last; + struct dtslot *f; + struct btframe *parent; + int n; + struct dt_lock *dtlck; + s64 xaddr, txaddr; + struct tlock *tlck; + struct pxd_lock *pxdlock; + struct lv *lv; + uint type; + struct ldtentry *ldtentry; + u8 *stbl; + + /* get page to extend */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + /* get parent/root page */ + parent = BT_POP(btstack); + DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); + if (rc) + return (rc); + + /* + * extend the extent + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + + xaddr = addressPXD(pxd); + tpxd = &sp->header.self; + txaddr = addressPXD(tpxd); + /* in-place extension */ + if (xaddr == txaddr) { + type = tlckEXTEND; + } + /* relocation */ + else { + type = tlckNEW; + + /* save moved extent descriptor for later free */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = sp->header.self; + pxdlock->index = 1; + + /* + * Update directory index table to reflect new page address + */ + if (DO_INDEX(ip)) { + s64 lblock; + + mp = NULL; + stbl = DT_GETSTBL(sp); + for (n = 0; n < sp->header.nextindex; n++) { + ldtentry = + (struct ldtentry *) & sp->slot[stbl[n]]; + modify_index(tid, ip, + le32_to_cpu(ldtentry->index), + xaddr, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + } + + /* + * extend the page + */ + sp->header.self = *pxd; + + jfs_info("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p", ip, smp, sp); + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the extended/leaf page + */ + tlck = txLock(tid, ip, smp, tlckDTREE | type); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[0]; + + /* update buffer extent descriptor of extended page */ + xlen = lengthPXD(pxd); + xsize = xlen << JFS_SBI(sb)->l2bsize; + + /* + * copy old stbl to new stbl at start of extended area + */ + oldstblindex = sp->header.stblindex; + oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; + newstblindex = sp->header.maxslot; + n = xsize >> L2DTSLOTSIZE; + newstblsize = (n + 31) >> L2DTSLOTSIZE; + memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], + sp->header.nextindex); + + /* + * in-line extension: linelock old area of extended page + */ + if (type == tlckEXTEND) { + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + lv++; + + /* linelock new stbl of extended page */ + lv->offset = newstblindex; + lv->length = newstblsize; + } + /* + * relocation: linelock whole relocated area + */ + else { + lv->offset = 0; + lv->length = sp->header.maxslot + newstblsize; + } + + dtlck->index++; + + sp->header.maxslot = n; + sp->header.stblindex = newstblindex; + /* sp->header.nextindex remains the same */ + + /* + * add old stbl region at head of freelist + */ + fsi = oldstblindex; + f = &sp->slot[fsi]; + last = sp->header.freelist; + for (n = 0; n < oldstblsize; n++, fsi++, f++) { + f->next = last; + last = fsi; + } + sp->header.freelist = last; + sp->header.freecnt += oldstblsize; + + /* + * append free region of newly extended area at tail of freelist + */ + /* init free region of newly extended area */ + fsi = n = newstblindex + newstblsize; + f = &sp->slot[fsi]; + for (fsi++; fsi < sp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* append new free region at tail of old freelist */ + fsi = sp->header.freelist; + if (fsi == -1) + sp->header.freelist = n; + else { + do { + f = &sp->slot[fsi]; + fsi = f->next; + } while (fsi != -1); + + f->next = n; + } + + sp->header.freecnt += sp->header.maxslot - n; + + /* + * insert the new entry + */ + dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); + + BT_MARK_DIRTY(pmp, ip); + /* + * linelock any freeslots residing in old extent + */ + if (type == tlckEXTEND) { + n = sp->header.maxslot >> 2; + if (sp->header.freelist < n) + dtLinelockFreelist(sp, n, &dtlck); + } + + /* + * update parent entry on the parent/root page + */ + /* + * acquire a transaction lock on the parent/root page + */ + tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[dtlck->index]; + + /* linelock parent entry - 1st slot */ + lv->offset = 1; + lv->length = 1; + dtlck->index++; + + /* update the parent pxd for page extension */ + tpxd = (pxd_t *) & pp->slot[1]; + *tpxd = *pxd; + + DT_PUTPAGE(pmp); + return 0; +} + + +/* + * dtSplitRoot() + * + * function: + * split the full root page into + * original/root/split page and new right page + * i.e., root remains fixed in tree anchor (inode) and + * the root is copied to a single new right child page + * since root page << non-root page, and + * the split root page contains a single entry for the + * new right child page. + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return new page pinned; + */ +static int dtSplitRoot(tid_t tid, + struct inode *ip, struct dtsplit * split, struct metapage ** rmpp) +{ + struct super_block *sb = ip->i_sb; + struct metapage *smp; + dtroot_t *sp; + struct metapage *rmp; + dtpage_t *rp; + s64 rbn; + int xlen; + int xsize; + struct dtslot *f; + s8 *stbl; + int fsi, stblsize, n; + struct idtentry *s; + pxd_t *ppxd; + struct pxdlist *pxdlist; + pxd_t *pxd; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int rc; + + /* get split root page */ + smp = split->mp; + sp = &JFS_IP(ip)->i_dtroot; + + /* + * allocate/initialize a single (right) child page + * + * N.B. at first split, a one (or two) block to fit new entry + * is allocated; at subsequent split, a full page is allocated; + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + xlen = lengthPXD(pxd); + xsize = xlen << JFS_SBI(sb)->l2bsize; + rmp = get_metapage(ip, rbn, xsize, 1); + if (!rmp) + return -EIO; + + rp = rmp->data; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + BT_MARK_DIRTY(rmp, ip); + /* + * acquire a transaction lock on the new right page + */ + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); + dtlck = (struct dt_lock *) & tlck->lock; + + rp->header.flag = + (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * move in-line root page into new right page extent + */ + /* linelock header + copied entries + new stbl (1st slot) in new page */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 10; /* 1 + 8 + 1 */ + dtlck->index++; + + n = xsize >> L2DTSLOTSIZE; + rp->header.maxslot = n; + stblsize = (n + 31) >> L2DTSLOTSIZE; + + /* copy old stbl to new stbl at start of extended area */ + rp->header.stblindex = DTROOTMAXSLOT; + stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; + memcpy(stbl, sp->header.stbl, sp->header.nextindex); + rp->header.nextindex = sp->header.nextindex; + + /* copy old data area to start of new data area */ + memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); + + /* + * append free region of newly extended area at tail of freelist + */ + /* init free region of newly extended area */ + fsi = n = DTROOTMAXSLOT + stblsize; + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* append new free region at tail of old freelist */ + fsi = sp->header.freelist; + if (fsi == -1) + rp->header.freelist = n; + else { + rp->header.freelist = fsi; + + do { + f = &rp->slot[fsi]; + fsi = f->next; + } while (fsi != -1); + + f->next = n; + } + + rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; + + /* + * Update directory index table for entries now in right page + */ + if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { + s64 lblock; + struct metapage *mp = NULL; + struct ldtentry *ldtentry; + + stbl = DT_GETSTBL(rp); + for (n = 0; n < rp->header.nextindex; n++) { + ldtentry = (struct ldtentry *) & rp->slot[stbl[n]]; + modify_index(tid, ip, le32_to_cpu(ldtentry->index), + rbn, n, &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); + + /* + * reset parent/root page + * + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. + * + * The btree comparison code guarantees that the left-most key on any + * level of the tree is never used, so it doesn't need to be filled in. + */ + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the root page (in-memory inode) + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock root */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = DTROOTMAXSLOT; + dtlck->index++; + + /* update page header of root */ + if (sp->header.flag & BT_LEAF) { + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + } + + /* init the first entry */ + s = (struct idtentry *) & sp->slot[DTENTRYSTART]; + ppxd = (pxd_t *) s; + *ppxd = *pxd; + s->next = -1; + s->namlen = 0; + + stbl = sp->header.stbl; + stbl[0] = DTENTRYSTART; + sp->header.nextindex = 1; + + /* init freelist */ + fsi = DTENTRYSTART + 1; + f = &sp->slot[fsi]; + + /* init free region of remaining area */ + for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) + f->next = fsi; + f->next = -1; + + sp->header.freelist = DTENTRYSTART + 1; + sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); + + *rmpp = rmp; + + return 0; +} + + +/* + * dtDelete() + * + * function: delete the entry(s) referenced by a key. + * + * parameter: + * + * return: + */ +int dtDelete(tid_t tid, + struct inode *ip, struct component_name * key, ino_t * ino, int flag) +{ + int rc = 0; + s64 bn; + struct metapage *mp, *imp; + dtpage_t *p; + int index; + struct btstack btstack; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + int i; + struct ldtentry *ldtentry; + u8 *stbl; + u32 table_index, next_index; + struct metapage *nmp; + dtpage_t *np; + + /* + * search for the entry to delete: + * + * dtSearch() returns (leaf page pinned, index at which to delete). + */ + if ((rc = dtSearch(ip, key, ino, &btstack, flag))) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* + * We need to find put the index of the next entry into the + * directory index table in order to resume a readdir from this + * entry. + */ + if (DO_INDEX(ip)) { + stbl = DT_GETSTBL(p); + ldtentry = (struct ldtentry *) & p->slot[stbl[index]]; + table_index = le32_to_cpu(ldtentry->index); + if (index == (p->header.nextindex - 1)) { + /* + * Last entry in this leaf page + */ + if ((p->header.flag & BT_ROOT) + || (p->header.next == 0)) + next_index = -1; + else { + /* Read next leaf page */ + DT_GETPAGE(ip, le64_to_cpu(p->header.next), + nmp, PSIZE, np, rc); + if (rc) + next_index = -1; + else { + stbl = DT_GETSTBL(np); + ldtentry = + (struct ldtentry *) & np-> + slot[stbl[0]]; + next_index = + le32_to_cpu(ldtentry->index); + DT_PUTPAGE(nmp); + } + } + } else { + ldtentry = + (struct ldtentry *) & p->slot[stbl[index + 1]]; + next_index = le32_to_cpu(ldtentry->index); + } + free_index(tid, ip, table_index, next_index); + } + /* + * the leaf page becomes empty, delete the page + */ + if (p->header.nextindex == 1) { + /* delete empty page */ + rc = dtDeleteUp(tid, ip, mp, p, &btstack); + } + /* + * the leaf page has other entries remaining: + * + * delete the entry from the leaf page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* + * Do not assume that dtlck->index will be zero. During a + * rename within a directory, this transaction may have + * modified this page already when adding the new entry. + */ + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + i = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + i; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - + i + 1; + dtlck->index++; + } + + /* free the leaf entry */ + dtDeleteEntry(p, index, &dtlck); + + /* + * Update directory index table for entries moved in stbl + */ + if (DO_INDEX(ip) && index < p->header.nextindex) { + s64 lblock; + + imp = NULL; + stbl = DT_GETSTBL(p); + for (i = index; i < p->header.nextindex; i++) { + ldtentry = + (struct ldtentry *) & p->slot[stbl[i]]; + modify_index(tid, ip, + le32_to_cpu(ldtentry->index), + bn, i, &imp, &lblock); + } + if (imp) + release_metapage(imp); + } + + DT_PUTPAGE(mp); + } + + return rc; +} + + +/* + * dtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int dtDeleteUp(tid_t tid, struct inode *ip, + struct metapage * fmp, dtpage_t * fp, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; + dtpage_t *p; + int index, nextindex; + int xlen; + struct btframe *parent; + struct dt_lock *dtlck; + struct tlock *tlck; + struct lv *lv; + struct pxd_lock *pxdlock; + int i; + + /* + * keep the root leaf page which has become empty + */ + if (BT_IS_ROOT(fmp)) { + /* + * reset the root + * + * dtInitRoot() acquires txlock on the root + */ + dtInitRoot(tid, ip, PARENT(ip)); + + DT_PUTPAGE(fmp); + + return 0; + } + + /* + * free the non-root leaf page + */ + /* + * acquire a transaction lock on the page + * + * write FREEXTENT|NOREDOPAGE log record + * N.B. linelock is overlaid as freed extent descriptor, and + * the buffer page is freed; + */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = fp->header.self; + pxdlock->index = 1; + + /* update sibling pointers */ + if ((rc = dtRelink(tid, ip, fp))) { + BT_PUTPAGE(fmp); + return rc; + } + + xlen = lengthPXD(&fp->header.self); + + /* Free quota allocation. */ + dquot_free_block(ip, xlen); + + /* free/invalidate its buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the directory tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* pin the parent page <sp> */ + DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * free the extent of the child page deleted + */ + index = parent->index; + + /* + * delete the entry for the child page from parent + */ + nextindex = p->header.nextindex; + + /* + * the parent has the single entry being deleted: + * + * free the parent page which has become empty. + */ + if (nextindex == 1) { + /* + * keep the root internal page which has become empty + */ + if (p->header.flag & BT_ROOT) { + /* + * reset the root + * + * dtInitRoot() acquires txlock on the root + */ + dtInitRoot(tid, ip, PARENT(ip)); + + DT_PUTPAGE(mp); + + return 0; + } + /* + * free the parent page + */ + else { + /* + * acquire a transaction lock on the page + * + * write FREEXTENT|NOREDOPAGE log record + */ + tlck = + txMaplock(tid, ip, + tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = p->header.self; + pxdlock->index = 1; + + /* update sibling pointers */ + if ((rc = dtRelink(tid, ip, p))) { + DT_PUTPAGE(mp); + return rc; + } + + xlen = lengthPXD(&p->header.self); + + /* Free quota allocation */ + dquot_free_block(ip, xlen); + + /* free/invalidate its buffer page */ + discard_metapage(mp); + + /* propagate up */ + continue; + } + } + + /* + * the parent has other entries remaining: + * + * delete the router entry from the parent page. + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the page + * + * action: router entry deletion + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + i = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + i; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - + i + 1; + dtlck->index++; + } + + /* free the router entry */ + dtDeleteEntry(p, index, &dtlck); + + /* reset key of new leftmost entry of level (for consistency) */ + if (index == 0 && + ((p->header.flag & BT_ROOT) || p->header.prev == 0)) + dtTruncateEntry(p, 0, &dtlck); + + /* unpin the parent page */ + DT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + if (!DO_INDEX(ip)) + ip->i_size -= PSIZE; + + return 0; +} + +#ifdef _NOTYET +/* + * NAME: dtRelocate() + * + * FUNCTION: relocate dtpage (internal or leaf) of directory; + * This function is mainly used by defragfs utility. + */ +int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, + s64 nxaddr) +{ + int rc = 0; + struct metapage *mp, *pmp, *lmp, *rmp; + dtpage_t *p, *pp, *rp = 0, *lp= 0; + s64 bn; + int index; + struct btstack btstack; + pxd_t *pxd; + s64 oxaddr, nextbn, prevbn; + int xlen, xsize; + struct tlock *tlck; + struct dt_lock *dtlck; + struct pxd_lock *pxdlock; + s8 *stbl; + struct lv *lv; + + oxaddr = addressPXD(opxd); + xlen = lengthPXD(opxd); + + jfs_info("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d", + (long long)lmxaddr, (long long)oxaddr, (long long)nxaddr, + xlen); + + /* + * 1. get the internal parent dtpage covering + * router entry for the tartget page to be relocated; + */ + rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); + if (rc) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jfs_info("dtRelocate: parent router entry validated."); + + /* + * 2. relocate the target dtpage + */ + /* read in the target page from src extent */ + DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + /* release the pinned parent page */ + DT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + DT_PUTPAGE(mp); + DT_PUTPAGE(pmp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + DT_PUTPAGE(mp); + DT_PUTPAGE(pmp); + if (rmp) + DT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling dtpages if any; + */ + if (lmp) { + tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + lp->header.next = cpu_to_le64(nxaddr); + DT_PUTPAGE(lmp); + } + + if (rmp) { + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + rp->header.prev = cpu_to_le64(nxaddr); + DT_PUTPAGE(rmp); + } + + /* + * update the target dtpage to be relocated + * + * write LOG_REDOPAGE of LOG_NEW type for dst page + * for the whole target page (logredo() will apply + * after image and update bmap for allocation of the + * dst extent), and update bmap for allocation of + * the dst extent; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW); + dtlck = (struct dt_lock *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + + /* update the self address in the dtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* the dst page is the same as the src page, i.e., + * linelock for afterimage of the whole page; + */ + lv->offset = 0; + lv->length = p->header.maxslot; + dtlck->index++; + + /* update the buffer extent descriptor of the dtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the relocated page */ + DT_PUTPAGE(mp); + jfs_info("dtRelocate: target dtpage relocated."); + + /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec + * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec + * will also force a bmap update ). + */ + + /* + * 3. acquire maplock for the source extent to be freed; + */ + /* for dtpage relocation, write a LOG_NOREDOPAGE record + * for the source dtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * dtpage), and upadte bmap for free of the source dtpage; + */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. update the parent router entry for relocation; + * + * acquire tlck for the parent entry covering the target dtpage; + * write LOG_REDOPAGE to apply after image only; + */ + jfs_info("dtRelocate: update parent router entry."); + tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + lv = & dtlck->lv[dtlck->index]; + + /* update the PXD with the new address */ + stbl = DT_GETSTBL(pp); + pxd = (pxd_t *) & pp->slot[stbl[index]]; + PXDaddress(pxd, nxaddr); + lv->offset = stbl[index]; + lv->length = 1; + dtlck->index++; + + /* unpin the parent dtpage */ + DT_PUTPAGE(pmp); + + return rc; +} + +/* + * NAME: dtSearchNode() + * + * FUNCTION: Search for an dtpage containing a specified address + * This function is mainly used by defragfs utility. + * + * NOTE: Search result on stack, the found page is pinned at exit. + * The result page must be an internal dtpage. + * lmxaddr give the address of the left most page of the + * dtree level, in which the required dtpage resides. + */ +static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, + struct btstack * btstack) +{ + int rc = 0; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int psize = 288; /* initial in-line directory */ + s8 *stbl; + int i; + pxd_t *pxd; + struct btframe *btsp; + + BT_CLR(btstack); /* reset stack */ + + /* + * descend tree to the level with specified leftmost page + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* does the xaddr of leftmost page of the levevl + * matches levevl search key ? + */ + if (p->header.flag & BT_ROOT) { + if (lmxaddr == 0) + break; + } else if (addressPXD(&p->header.self) == lmxaddr) + break; + + /* + * descend down to leftmost child page + */ + if (p->header.flag & BT_LEAF) { + DT_PUTPAGE(mp); + return -ESTALE; + } + + /* get the leftmost entry */ + stbl = DT_GETSTBL(p); + pxd = (pxd_t *) & p->slot[stbl[0]]; + + /* get the child page block address */ + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + /* + * search each page at the current levevl + */ + loop: + stbl = DT_GETSTBL(p); + for (i = 0; i < p->header.nextindex; i++) { + pxd = (pxd_t *) & p->slot[stbl[i]]; + + /* found the specified router entry */ + if (addressPXD(pxd) == addressPXD(kpxd) && + lengthPXD(pxd) == lengthPXD(kpxd)) { + btsp = btstack->top; + btsp->bn = bn; + btsp->index = i; + btsp->mp = mp; + + return 0; + } + } + + /* get the right sibling page if any */ + if (p->header.next) + bn = le64_to_cpu(p->header.next); + else { + DT_PUTPAGE(mp); + return -ESTALE; + } + + /* unpin current page */ + DT_PUTPAGE(mp); + + /* get the right sibling page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + goto loop; +} +#endif /* _NOTYET */ + +/* + * dtRelink() + * + * function: + * link around a freed page. + * + * parameter: + * fp: page to be freed + * + * return: + */ +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) +{ + int rc; + struct metapage *mp; + s64 nextbn, prevbn; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page + * + * action: update prev pointer; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.prev = cpu_to_le64(prevbn); + DT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the prev page + * + * action: update next pointer; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jfs_info("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p", + tlck, ip, mp); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.next = cpu_to_le64(nextbn); + DT_PUTPAGE(mp); + } + + return 0; +} + + +/* + * dtInitRoot() + * + * initialize directory root (inline in inode) + */ +void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + dtroot_t *p; + int fsi; + struct dtslot *f; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + u16 xflag_save; + + /* + * If this was previously an non-empty directory, we need to remove + * the old directory table. + */ + if (DO_INDEX(ip)) { + if (!jfs_dirtable_inline(ip)) { + struct tblock *tblk = tid_to_tblock(tid); + /* + * We're playing games with the tid's xflag. If + * we're removing a regular file, the file's xtree + * is committed with COMMIT_PMAP, but we always + * commit the directories xtree with COMMIT_PWMAP. + */ + xflag_save = tblk->xflag; + tblk->xflag = 0; + /* + * xtTruncate isn't guaranteed to fully truncate + * the xtree. The caller needs to check i_size + * after committing the transaction to see if + * additional truncation is needed. The + * COMMIT_Stale flag tells caller that we + * initiated the truncation. + */ + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + set_cflag(COMMIT_Stale, ip); + + tblk->xflag = xflag_save; + } else + ip->i_size = 1; + + jfs_ip->next_index = 2; + } else + ip->i_size = IDATASIZE; + + /* + * acquire a transaction lock on the root + * + * action: directory initialization; + */ + tlck = txLock(tid, ip, (struct metapage *) & jfs_ip->bxflag, + tlckDTREE | tlckENTRY | tlckBTROOT); + dtlck = (struct dt_lock *) & tlck->lock; + + /* linelock root */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = DTROOTMAXSLOT; + dtlck->index++; + + p = &jfs_ip->i_dtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + + p->header.nextindex = 0; + + /* init freelist */ + fsi = 1; + f = &p->slot[fsi]; + + /* init data area of root */ + for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) + f->next = fsi; + f->next = -1; + + p->header.freelist = 1; + p->header.freecnt = 8; + + /* init '..' entry */ + p->header.idotdot = cpu_to_le32(idotdot); + + return; +} + +/* + * add_missing_indices() + * + * function: Fix dtree page in which one or more entries has an invalid index. + * fsck.jfs should really fix this, but it currently does not. + * Called from jfs_readdir when bad index is detected. + */ +static void add_missing_indices(struct inode *inode, s64 bn) +{ + struct ldtentry *d; + struct dt_lock *dtlck; + int i; + uint index; + struct lv *lv; + struct metapage *mp; + dtpage_t *p; + int rc; + s8 *stbl; + tid_t tid; + struct tlock *tlck; + + tid = txBegin(inode->i_sb, 0); + + DT_GETPAGE(inode, bn, mp, PSIZE, p, rc); + + if (rc) { + printk(KERN_ERR "DT_GETPAGE failed!\n"); + goto end; + } + BT_MARK_DIRTY(mp, inode); + + ASSERT(p->header.flag & BT_LEAF); + + tlck = txLock(tid, inode, mp, tlckDTREE | tlckENTRY); + if (BT_IS_ROOT(mp)) + tlck->type |= tlckBTROOT; + + dtlck = (struct dt_lock *) &tlck->lock; + + stbl = DT_GETSTBL(p); + for (i = 0; i < p->header.nextindex; i++) { + d = (struct ldtentry *) &p->slot[stbl[i]]; + index = le32_to_cpu(d->index); + if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { + d->index = cpu_to_le32(add_index(tid, inode, bn, i)); + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = &dtlck->lv[dtlck->index]; + lv->offset = stbl[i]; + lv->length = 1; + dtlck->index++; + } + } + + DT_PUTPAGE(mp); + (void) txCommit(tid, 1, &inode, 0); +end: + txEnd(tid); +} + +/* + * Buffer to hold directory entry info while traversing a dtree page + * before being fed to the filldir function + */ +struct jfs_dirent { + loff_t position; + int ino; + u16 name_len; + char name[0]; +}; + +/* + * function to determine next variable-sized jfs_dirent in buffer + */ +static inline struct jfs_dirent *next_jfs_dirent(struct jfs_dirent *dirent) +{ + return (struct jfs_dirent *) + ((char *)dirent + + ((sizeof (struct jfs_dirent) + dirent->name_len + 1 + + sizeof (loff_t) - 1) & + ~(sizeof (loff_t) - 1))); +} + +/* + * jfs_readdir() + * + * function: read directory entries sequentially + * from the specified entry offset + * + * parameter: + * + * return: offset = (pn, index) of start entry + * of next jfs_readdir()/dtRead() + */ +int jfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *ip = file_inode(file); + struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; + int rc = 0; + loff_t dtpos; /* legacy OS/2 style position */ + struct dtoffset { + s16 pn; + s16 index; + s32 unused; + } *dtoffset = (struct dtoffset *) &dtpos; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; + s8 *stbl; + struct btstack btstack; + int i, next; + struct ldtentry *d; + struct dtslot *t; + int d_namleft, len, outlen; + unsigned long dirent_buf; + char *name_ptr; + u32 dir_index; + int do_index = 0; + uint loop_count = 0; + struct jfs_dirent *jfs_dirent; + int jfs_dirents; + int overflow, fix_page, page_fixed = 0; + static int unique_pos = 2; /* If we can't fix broken index */ + + if (ctx->pos == DIREND) + return 0; + + if (DO_INDEX(ip)) { + /* + * persistent index is stored in directory entries. + * Special cases: 0 = . + * 1 = .. + * -1 = End of directory + */ + do_index = 1; + + dir_index = (u32) ctx->pos; + + /* + * NFSv4 reserves cookies 1 and 2 for . and .. so the value + * we return to the vfs is one greater than the one we use + * internally. + */ + if (dir_index) + dir_index--; + + if (dir_index > 1) { + struct dir_table_slot dirtab_slot; + + if (dtEmpty(ip) || + (dir_index >= JFS_IP(ip)->next_index)) { + /* Stale position. Directory has shrunk */ + ctx->pos = DIREND; + return 0; + } + repeat: + rc = read_index(ip, dir_index, &dirtab_slot); + if (rc) { + ctx->pos = DIREND; + return rc; + } + if (dirtab_slot.flag == DIR_INDEX_FREE) { + if (loop_count++ > JFS_IP(ip)->next_index) { + jfs_err("jfs_readdir detected infinite loop!"); + ctx->pos = DIREND; + return 0; + } + dir_index = le32_to_cpu(dirtab_slot.addr2); + if (dir_index == -1) { + ctx->pos = DIREND; + return 0; + } + goto repeat; + } + bn = addressDTS(&dirtab_slot); + index = dirtab_slot.slot; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) { + ctx->pos = DIREND; + return 0; + } + if (p->header.flag & BT_INTERNAL) { + jfs_err("jfs_readdir: bad index table"); + DT_PUTPAGE(mp); + ctx->pos = DIREND; + return 0; + } + } else { + if (dir_index == 0) { + /* + * self "." + */ + ctx->pos = 1; + if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) + return 0; + } + /* + * parent ".." + */ + ctx->pos = 2; + if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) + return 0; + + /* + * Find first entry of left-most leaf + */ + if (dtEmpty(ip)) { + ctx->pos = DIREND; + return 0; + } + + if ((rc = dtReadFirst(ip, &btstack))) + return rc; + + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + } + } else { + /* + * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 + * + * pn = 0; index = 1: First entry "." + * pn = 0; index = 2: Second entry ".." + * pn > 0: Real entries, pn=1 -> leftmost page + * pn = index = -1: No more entries + */ + dtpos = ctx->pos; + if (dtpos < 2) { + /* build "." entry */ + ctx->pos = 1; + if (!dir_emit(ctx, ".", 1, ip->i_ino, DT_DIR)) + return 0; + dtoffset->index = 2; + ctx->pos = dtpos; + } + + if (dtoffset->pn == 0) { + if (dtoffset->index == 2) { + /* build ".." entry */ + if (!dir_emit(ctx, "..", 2, PARENT(ip), DT_DIR)) + return 0; + } else { + jfs_err("jfs_readdir called with invalid offset!"); + } + dtoffset->pn = 1; + dtoffset->index = 0; + ctx->pos = dtpos; + } + + if (dtEmpty(ip)) { + ctx->pos = DIREND; + return 0; + } + + if ((rc = dtReadNext(ip, &ctx->pos, &btstack))) { + jfs_err("jfs_readdir: unexpected rc = %d from dtReadNext", + rc); + ctx->pos = DIREND; + return 0; + } + /* get start leaf page and index */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* offset beyond directory eof ? */ + if (bn < 0) { + ctx->pos = DIREND; + return 0; + } + } + + dirent_buf = __get_free_page(GFP_KERNEL); + if (dirent_buf == 0) { + DT_PUTPAGE(mp); + jfs_warn("jfs_readdir: __get_free_page failed!"); + ctx->pos = DIREND; + return -ENOMEM; + } + + while (1) { + jfs_dirent = (struct jfs_dirent *) dirent_buf; + jfs_dirents = 0; + overflow = fix_page = 0; + + stbl = DT_GETSTBL(p); + + for (i = index; i < p->header.nextindex; i++) { + d = (struct ldtentry *) & p->slot[stbl[i]]; + + if (((long) jfs_dirent + d->namlen + 1) > + (dirent_buf + PAGE_SIZE)) { + /* DBCS codepages could overrun dirent_buf */ + index = i; + overflow = 1; + break; + } + + d_namleft = d->namlen; + name_ptr = jfs_dirent->name; + jfs_dirent->ino = le32_to_cpu(d->inumber); + + if (do_index) { + len = min(d_namleft, DTLHDRDATALEN); + jfs_dirent->position = le32_to_cpu(d->index); + /* + * d->index should always be valid, but it + * isn't. fsck.jfs doesn't create the + * directory index for the lost+found + * directory. Rather than let it go, + * we can try to fix it. + */ + if ((jfs_dirent->position < 2) || + (jfs_dirent->position >= + JFS_IP(ip)->next_index)) { + if (!page_fixed && !isReadOnly(ip)) { + fix_page = 1; + /* + * setting overflow and setting + * index to i will cause the + * same page to be processed + * again starting here + */ + overflow = 1; + index = i; + break; + } + jfs_dirent->position = unique_pos++; + } + /* + * We add 1 to the index because we may + * use a value of 2 internally, and NFSv4 + * doesn't like that. + */ + jfs_dirent->position++; + } else { + jfs_dirent->position = dtpos; + len = min(d_namleft, DTLHDRDATALEN_LEGACY); + } + + /* copy the name of head/only segment */ + outlen = jfs_strfromUCS_le(name_ptr, d->name, len, + codepage); + jfs_dirent->name_len = outlen; + + /* copy name in the additional segment(s) */ + next = d->next; + while (next >= 0) { + t = (struct dtslot *) & p->slot[next]; + name_ptr += outlen; + d_namleft -= len; + /* Sanity Check */ + if (d_namleft == 0) { + jfs_error(ip->i_sb, + "JFS:Dtree error: ino = %ld, bn=%lld, index = %d\n", + (long)ip->i_ino, + (long long)bn, + i); + goto skip_one; + } + len = min(d_namleft, DTSLOTDATALEN); + outlen = jfs_strfromUCS_le(name_ptr, t->name, + len, codepage); + jfs_dirent->name_len += outlen; + + next = t->next; + } + + jfs_dirents++; + jfs_dirent = next_jfs_dirent(jfs_dirent); +skip_one: + if (!do_index) + dtoffset->index++; + } + + if (!overflow) { + /* Point to next leaf page */ + if (p->header.flag & BT_ROOT) + bn = 0; + else { + bn = le64_to_cpu(p->header.next); + index = 0; + /* update offset (pn:index) for new page */ + if (!do_index) { + dtoffset->pn++; + dtoffset->index = 0; + } + } + page_fixed = 0; + } + + /* unpin previous leaf page */ + DT_PUTPAGE(mp); + + jfs_dirent = (struct jfs_dirent *) dirent_buf; + while (jfs_dirents--) { + ctx->pos = jfs_dirent->position; + if (!dir_emit(ctx, jfs_dirent->name, + jfs_dirent->name_len, + jfs_dirent->ino, DT_UNKNOWN)) + goto out; + jfs_dirent = next_jfs_dirent(jfs_dirent); + } + + if (fix_page) { + add_missing_indices(ip, bn); + page_fixed = 1; + } + + if (!overflow && (bn == 0)) { + ctx->pos = DIREND; + break; + } + + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) { + free_page(dirent_buf); + return rc; + } + } + + out: + free_page(dirent_buf); + + return rc; +} + + +/* + * dtReadFirst() + * + * function: get the leftmost page of the directory + */ +static int dtReadFirst(struct inode *ip, struct btstack * btstack) +{ + int rc = 0; + s64 bn; + int psize = 288; /* initial in-line directory */ + struct metapage *mp; + dtpage_t *p; + s8 *stbl; + struct btframe *btsp; + pxd_t *xd; + + BT_CLR(btstack); /* reset stack */ + + /* + * descend leftmost path of the tree + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* + * leftmost leaf page + */ + if (p->header.flag & BT_LEAF) { + /* return leftmost entry */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = 0; + btsp->mp = mp; + + return 0; + } + + /* + * descend down to leftmost child page + */ + if (BT_STACK_FULL(btstack)) { + DT_PUTPAGE(mp); + jfs_error(ip->i_sb, "btstack overrun\n"); + BT_STACK_DUMP(btstack); + return -EIO; + } + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, 0); + + /* get the leftmost entry */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[0]]; + + /* get the child page block address */ + bn = addressPXD(xd); + psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } +} + + +/* + * dtReadNext() + * + * function: get the page of the specified offset (pn:index) + * + * return: if (offset > eof), bn = -1; + * + * note: if index > nextindex of the target leaf page, + * start with 1st entry of next leaf page; + */ +static int dtReadNext(struct inode *ip, loff_t * offset, + struct btstack * btstack) +{ + int rc = 0; + struct dtoffset { + s16 pn; + s16 index; + s32 unused; + } *dtoffset = (struct dtoffset *) offset; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; + int pn; + s8 *stbl; + struct btframe *btsp, *parent; + pxd_t *xd; + + /* + * get leftmost leaf page pinned + */ + if ((rc = dtReadFirst(ip, btstack))) + return rc; + + /* get leaf page */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* get the start offset (pn:index) */ + pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ + index = dtoffset->index; + + /* start at leftmost page ? */ + if (pn == 0) { + /* offset beyond eof ? */ + if (index < p->header.nextindex) + goto out; + + if (p->header.flag & BT_ROOT) { + bn = -1; + goto out; + } + + /* start with 1st entry of next leaf page */ + dtoffset->pn++; + dtoffset->index = index = 0; + goto a; + } + + /* start at non-leftmost page: scan parent pages for large pn */ + if (p->header.flag & BT_ROOT) { + bn = -1; + goto out; + } + + /* start after next leaf page ? */ + if (pn > 1) + goto b; + + /* get leaf page pn = 1 */ + a: + bn = le64_to_cpu(p->header.next); + + /* unpin leaf page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + goto c; + + /* + * scan last internal page level to get target leaf page + */ + b: + /* unpin leftmost leaf page */ + DT_PUTPAGE(mp); + + /* get left most parent page */ + btsp = btstack->top; + parent = btsp - 1; + bn = parent->bn; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* scan parent pages at last internal page level */ + while (pn >= p->header.nextindex) { + pn -= p->header.nextindex; + + /* get next parent page address */ + bn = le64_to_cpu(p->header.next); + + /* unpin current parent page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + /* get next parent page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* update parent page stack frame */ + parent->bn = bn; + } + + /* get leaf page address */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[pn]]; + bn = addressPXD(xd); + + /* unpin parent page */ + DT_PUTPAGE(mp); + + /* + * get target leaf page + */ + c: + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * leaf page has been completed: + * start with 1st entry of next leaf page + */ + if (index >= p->header.nextindex) { + bn = le64_to_cpu(p->header.next); + + /* unpin leaf page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + /* get next leaf page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* start with 1st entry of next leaf page */ + dtoffset->pn++; + dtoffset->index = 0; + } + + out: + /* return target leaf page pinned */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = dtoffset->index; + btsp->mp = mp; + + return 0; +} + + +/* + * dtCompare() + * + * function: compare search key with an internal entry + * + * return: + * < 0 if k is < record + * = 0 if k is = record + * > 0 if k is > record + */ +static int dtCompare(struct component_name * key, /* search key */ + dtpage_t * p, /* directory page */ + int si) +{ /* entry slot index */ + wchar_t *kname; + __le16 *name; + int klen, namlen, len, rc; + struct idtentry *ih; + struct dtslot *t; + + /* + * force the left-most key on internal pages, at any level of + * the tree, to be less than any search key. + * this obviates having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything that has been stored. + * + * (? if/when dtSearch() narrows down to 1st entry (index = 0), + * at any internal page at any level of the tree, + * it descends to child of the entry anyway - + * ? make the entry as min size dummy entry) + * + * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) + * return (1); + */ + + kname = key->name; + klen = key->namlen; + + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + name = ih->name; + namlen = ih->namlen; + len = min(namlen, DTIHDRDATALEN); + + /* compare with head/only segment */ + len = min(klen, len); + if ((rc = UniStrncmp_le(kname, name, len))) + return rc; + + klen -= len; + namlen -= len; + + /* compare with additional segment(s) */ + kname += len; + while (klen > 0 && namlen > 0) { + /* compare with next name segment */ + t = (struct dtslot *) & p->slot[si]; + len = min(namlen, DTSLOTDATALEN); + len = min(klen, len); + name = t->name; + if ((rc = UniStrncmp_le(kname, name, len))) + return rc; + + klen -= len; + namlen -= len; + kname += len; + si = t->next; + } + + return (klen - namlen); +} + + + + +/* + * ciCompare() + * + * function: compare search key with an (leaf/internal) entry + * + * return: + * < 0 if k is < record + * = 0 if k is = record + * > 0 if k is > record + */ +static int ciCompare(struct component_name * key, /* search key */ + dtpage_t * p, /* directory page */ + int si, /* entry slot index */ + int flag) +{ + wchar_t *kname, x; + __le16 *name; + int klen, namlen, len, rc; + struct ldtentry *lh; + struct idtentry *ih; + struct dtslot *t; + int i; + + /* + * force the left-most key on internal pages, at any level of + * the tree, to be less than any search key. + * this obviates having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything that has been stored. + * + * (? if/when dtSearch() narrows down to 1st entry (index = 0), + * at any internal page at any level of the tree, + * it descends to child of the entry anyway - + * ? make the entry as min size dummy entry) + * + * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) + * return (1); + */ + + kname = key->name; + klen = key->namlen; + + /* + * leaf page entry + */ + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) & p->slot[si]; + si = lh->next; + name = lh->name; + namlen = lh->namlen; + if (flag & JFS_DIR_INDEX) + len = min(namlen, DTLHDRDATALEN); + else + len = min(namlen, DTLHDRDATALEN_LEGACY); + } + /* + * internal page entry + */ + else { + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + name = ih->name; + namlen = ih->namlen; + len = min(namlen, DTIHDRDATALEN); + } + + /* compare with head/only segment */ + len = min(klen, len); + for (i = 0; i < len; i++, kname++, name++) { + /* only uppercase if case-insensitive support is on */ + if ((flag & JFS_OS2) == JFS_OS2) + x = UniToupper(le16_to_cpu(*name)); + else + x = le16_to_cpu(*name); + if ((rc = *kname - x)) + return rc; + } + + klen -= len; + namlen -= len; + + /* compare with additional segment(s) */ + while (klen > 0 && namlen > 0) { + /* compare with next name segment */ + t = (struct dtslot *) & p->slot[si]; + len = min(namlen, DTSLOTDATALEN); + len = min(klen, len); + name = t->name; + for (i = 0; i < len; i++, kname++, name++) { + /* only uppercase if case-insensitive support is on */ + if ((flag & JFS_OS2) == JFS_OS2) + x = UniToupper(le16_to_cpu(*name)); + else + x = le16_to_cpu(*name); + + if ((rc = *kname - x)) + return rc; + } + + klen -= len; + namlen -= len; + si = t->next; + } + + return (klen - namlen); +} + + +/* + * ciGetLeafPrefixKey() + * + * function: compute prefix of suffix compression + * from two adjacent leaf entries + * across page boundary + * + * return: non-zero on error + * + */ +static int ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, struct component_name * key, int flag) +{ + int klen, namlen; + wchar_t *pl, *pr, *kname; + struct component_name lkey; + struct component_name rkey; + + lkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), + GFP_KERNEL); + if (lkey.name == NULL) + return -ENOMEM; + + rkey.name = kmalloc_array(JFS_NAME_MAX + 1, sizeof(wchar_t), + GFP_KERNEL); + if (rkey.name == NULL) { + kfree(lkey.name); + return -ENOMEM; + } + + /* get left and right key */ + dtGetKey(lp, li, &lkey, flag); + lkey.name[lkey.namlen] = 0; + + if ((flag & JFS_OS2) == JFS_OS2) + ciToUpper(&lkey); + + dtGetKey(rp, ri, &rkey, flag); + rkey.name[rkey.namlen] = 0; + + + if ((flag & JFS_OS2) == JFS_OS2) + ciToUpper(&rkey); + + /* compute prefix */ + klen = 0; + kname = key->name; + namlen = min(lkey.namlen, rkey.namlen); + for (pl = lkey.name, pr = rkey.name; + namlen; pl++, pr++, namlen--, klen++, kname++) { + *kname = *pr; + if (*pl != *pr) { + key->namlen = klen + 1; + goto free_names; + } + } + + /* l->namlen <= r->namlen since l <= r */ + if (lkey.namlen < rkey.namlen) { + *kname = *pr; + key->namlen = klen + 1; + } else /* l->namelen == r->namelen */ + key->namlen = klen; + +free_names: + kfree(lkey.name); + kfree(rkey.name); + return 0; +} + + + +/* + * dtGetKey() + * + * function: get key of the entry + */ +static void dtGetKey(dtpage_t * p, int i, /* entry index */ + struct component_name * key, int flag) +{ + int si; + s8 *stbl; + struct ldtentry *lh; + struct idtentry *ih; + struct dtslot *t; + int namlen, len; + wchar_t *kname; + __le16 *name; + + /* get entry */ + stbl = DT_GETSTBL(p); + si = stbl[i]; + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) & p->slot[si]; + si = lh->next; + namlen = lh->namlen; + name = lh->name; + if (flag & JFS_DIR_INDEX) + len = min(namlen, DTLHDRDATALEN); + else + len = min(namlen, DTLHDRDATALEN_LEGACY); + } else { + ih = (struct idtentry *) & p->slot[si]; + si = ih->next; + namlen = ih->namlen; + name = ih->name; + len = min(namlen, DTIHDRDATALEN); + } + + key->namlen = namlen; + kname = key->name; + + /* + * move head/only segment + */ + UniStrncpy_from_le(kname, name, len); + + /* + * move additional segment(s) + */ + while (si >= 0) { + /* get next segment */ + t = &p->slot[si]; + kname += len; + namlen -= len; + len = min(namlen, DTSLOTDATALEN); + UniStrncpy_from_le(kname, t->name, len); + + si = t->next; + } +} + + +/* + * dtInsertEntry() + * + * function: allocate free slot(s) and + * write a leaf/internal entry + * + * return: entry slot index + */ +static void dtInsertEntry(dtpage_t * p, int index, struct component_name * key, + ddata_t * data, struct dt_lock ** dtlock) +{ + struct dtslot *h, *t; + struct ldtentry *lh = NULL; + struct idtentry *ih = NULL; + int hsi, fsi, klen, len, nextindex; + wchar_t *kname; + __le16 *name; + s8 *stbl; + pxd_t *xd; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + s64 bn = 0; + struct metapage *mp = NULL; + + klen = key->namlen; + kname = key->name; + + /* allocate a free slot */ + hsi = fsi = p->header.freelist; + h = &p->slot[fsi]; + p->header.freelist = h->next; + --p->header.freecnt; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + + lv = & dtlck->lv[dtlck->index]; + lv->offset = hsi; + + /* write head/only segment */ + if (p->header.flag & BT_LEAF) { + lh = (struct ldtentry *) h; + lh->next = h->next; + lh->inumber = cpu_to_le32(data->leaf.ino); + lh->namlen = klen; + name = lh->name; + if (data->leaf.ip) { + len = min(klen, DTLHDRDATALEN); + if (!(p->header.flag & BT_ROOT)) + bn = addressPXD(&p->header.self); + lh->index = cpu_to_le32(add_index(data->leaf.tid, + data->leaf.ip, + bn, index)); + } else + len = min(klen, DTLHDRDATALEN_LEGACY); + } else { + ih = (struct idtentry *) h; + ih->next = h->next; + xd = (pxd_t *) ih; + *xd = data->xd; + ih->namlen = klen; + name = ih->name; + len = min(klen, DTIHDRDATALEN); + } + + UniStrncpy_to_le(name, kname, len); + + n = 1; + xsi = hsi; + + /* write additional segment(s) */ + t = h; + klen -= len; + while (klen) { + /* get free slot */ + fsi = p->header.freelist; + t = &p->slot[fsi]; + p->header.freelist = t->next; + --p->header.freecnt; + + /* is next slot contiguous ? */ + if (fsi != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = fsi; + n = 0; + } + + kname += len; + len = min(klen, DTSLOTDATALEN); + UniStrncpy_to_le(t->name, kname, len); + + n++; + xsi = fsi; + klen -= len; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* terminate last/only segment */ + if (h == t) { + /* single segment entry */ + if (p->header.flag & BT_LEAF) + lh->next = -1; + else + ih->next = -1; + } else + /* multi-segment entry */ + t->next = -1; + + /* if insert into middle, shift right succeeding entries in stbl */ + stbl = DT_GETSTBL(p); + nextindex = p->header.nextindex; + if (index < nextindex) { + memmove(stbl + index + 1, stbl + index, nextindex - index); + + if ((p->header.flag & BT_LEAF) && data->leaf.ip) { + s64 lblock; + + /* + * Need to update slot number for entries that moved + * in the stbl + */ + mp = NULL; + for (n = index + 1; n <= nextindex; n++) { + lh = (struct ldtentry *) & (p->slot[stbl[n]]); + modify_index(data->leaf.tid, data->leaf.ip, + le32_to_cpu(lh->index), bn, n, + &mp, &lblock); + } + if (mp) + release_metapage(mp); + } + } + + stbl[index] = hsi; + + /* advance next available entry index of stbl */ + ++p->header.nextindex; +} + + +/* + * dtMoveEntry() + * + * function: move entries from split/left page to new/right page + * + * nextindex of dst page and freelist/freecnt of both pages + * are updated. + */ +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + struct dt_lock ** sdtlock, struct dt_lock ** ddtlock, + int do_index) +{ + int ssi, next; /* src slot index */ + int di; /* dst entry index */ + int dsi; /* dst slot index */ + s8 *sstbl, *dstbl; /* sorted entry table */ + int snamlen, len; + struct ldtentry *slh, *dlh = NULL; + struct idtentry *sih, *dih = NULL; + struct dtslot *h, *s, *d; + struct dt_lock *sdtlck = *sdtlock, *ddtlck = *ddtlock; + struct lv *slv, *dlv; + int xssi, ns, nd; + int sfsi; + + sstbl = (s8 *) & sp->slot[sp->header.stblindex]; + dstbl = (s8 *) & dp->slot[dp->header.stblindex]; + + dsi = dp->header.freelist; /* first (whole page) free slot */ + sfsi = sp->header.freelist; + + /* linelock destination entry slot */ + dlv = & ddtlck->lv[ddtlck->index]; + dlv->offset = dsi; + + /* linelock source entry slot */ + slv = & sdtlck->lv[sdtlck->index]; + slv->offset = sstbl[si]; + xssi = slv->offset - 1; + + /* + * move entries + */ + ns = nd = 0; + for (di = 0; si < sp->header.nextindex; si++, di++) { + ssi = sstbl[si]; + dstbl[di] = dsi; + + /* is next slot contiguous ? */ + if (ssi != xssi + 1) { + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + + /* open new linelock */ + if (sdtlck->index < sdtlck->maxcnt) + slv++; + else { + sdtlck = (struct dt_lock *) txLinelock(sdtlck); + slv = & sdtlck->lv[0]; + } + + slv->offset = ssi; + ns = 0; + } + + /* + * move head/only segment of an entry + */ + /* get dst slot */ + h = d = &dp->slot[dsi]; + + /* get src slot and move */ + s = &sp->slot[ssi]; + if (sp->header.flag & BT_LEAF) { + /* get source entry */ + slh = (struct ldtentry *) s; + dlh = (struct ldtentry *) h; + snamlen = slh->namlen; + + if (do_index) { + len = min(snamlen, DTLHDRDATALEN); + dlh->index = slh->index; /* little-endian */ + } else + len = min(snamlen, DTLHDRDATALEN_LEGACY); + + memcpy(dlh, slh, 6 + len * 2); + + next = slh->next; + + /* update dst head/only segment next field */ + dsi++; + dlh->next = dsi; + } else { + sih = (struct idtentry *) s; + snamlen = sih->namlen; + + len = min(snamlen, DTIHDRDATALEN); + dih = (struct idtentry *) h; + memcpy(dih, sih, 10 + len * 2); + next = sih->next; + + dsi++; + dih->next = dsi; + } + + /* free src head/only segment */ + s->next = sfsi; + s->cnt = 1; + sfsi = ssi; + + ns++; + nd++; + xssi = ssi; + + /* + * move additional segment(s) of the entry + */ + snamlen -= len; + while ((ssi = next) >= 0) { + /* is next slot contiguous ? */ + if (ssi != xssi + 1) { + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + + /* open new linelock */ + if (sdtlck->index < sdtlck->maxcnt) + slv++; + else { + sdtlck = + (struct dt_lock *) + txLinelock(sdtlck); + slv = & sdtlck->lv[0]; + } + + slv->offset = ssi; + ns = 0; + } + + /* get next source segment */ + s = &sp->slot[ssi]; + + /* get next destination free slot */ + d++; + + len = min(snamlen, DTSLOTDATALEN); + UniStrncpy_le(d->name, s->name, len); + + ns++; + nd++; + xssi = ssi; + + dsi++; + d->next = dsi; + + /* free source segment */ + next = s->next; + s->next = sfsi; + s->cnt = 1; + sfsi = ssi; + + snamlen -= len; + } /* end while */ + + /* terminate dst last/only segment */ + if (h == d) { + /* single segment entry */ + if (dp->header.flag & BT_LEAF) + dlh->next = -1; + else + dih->next = -1; + } else + /* multi-segment entry */ + d->next = -1; + } /* end for */ + + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + *sdtlock = sdtlck; + + dlv->length = nd; + ddtlck->index++; + *ddtlock = ddtlck; + + /* update source header */ + sp->header.freelist = sfsi; + sp->header.freecnt += nd; + + /* update destination header */ + dp->header.nextindex = di; + + dp->header.freelist = dsi; + dp->header.freecnt -= nd; +} + + +/* + * dtDeleteEntry() + * + * function: free a (leaf/internal) entry + * + * log freelist header, stbl, and each segment slot of entry + * (even though last/only segment next field is modified, + * physical image logging requires all segment slots of + * the entry logged to avoid applying previous updates + * to the same slots) + */ +static void dtDeleteEntry(dtpage_t * p, int fi, struct dt_lock ** dtlock) +{ + int fsi; /* free entry slot index */ + s8 *stbl; + struct dtslot *t; + int si, freecnt; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + + /* get free entry slot index */ + stbl = DT_GETSTBL(p); + fsi = stbl[fi]; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = fsi; + + /* get the head/only segment */ + t = &p->slot[fsi]; + if (p->header.flag & BT_LEAF) + si = ((struct ldtentry *) t)->next; + else + si = ((struct idtentry *) t)->next; + t->next = si; + t->cnt = 1; + + n = freecnt = 1; + xsi = fsi; + + /* find the last/only segment */ + while (si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + freecnt++; + + t = &p->slot[si]; + t->cnt = 1; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* update freelist */ + t->next = p->header.freelist; + p->header.freelist = fsi; + p->header.freecnt += freecnt; + + /* if delete from middle, + * shift left the succedding entries in the stbl + */ + si = p->header.nextindex; + if (fi < si - 1) + memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); + + p->header.nextindex--; +} + + +/* + * dtTruncateEntry() + * + * function: truncate a (leaf/internal) entry + * + * log freelist header, stbl, and each segment slot of entry + * (even though last/only segment next field is modified, + * physical image logging requires all segment slots of + * the entry logged to avoid applying previous updates + * to the same slots) + */ +static void dtTruncateEntry(dtpage_t * p, int ti, struct dt_lock ** dtlock) +{ + int tsi; /* truncate entry slot index */ + s8 *stbl; + struct dtslot *t; + int si, freecnt; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int fsi, xsi, n; + + /* get free entry slot index */ + stbl = DT_GETSTBL(p); + tsi = stbl[ti]; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = tsi; + + /* get the head/only segment */ + t = &p->slot[tsi]; + ASSERT(p->header.flag & BT_INTERNAL); + ((struct idtentry *) t)->namlen = 0; + si = ((struct idtentry *) t)->next; + ((struct idtentry *) t)->next = -1; + + n = 1; + freecnt = 0; + fsi = si; + xsi = tsi; + + /* find the last/only segment */ + while (si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + freecnt++; + + t = &p->slot[si]; + t->cnt = 1; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* update freelist */ + if (freecnt == 0) + return; + t->next = p->header.freelist; + p->header.freelist = fsi; + p->header.freecnt += freecnt; +} + + +/* + * dtLinelockFreelist() + */ +static void dtLinelockFreelist(dtpage_t * p, /* directory page */ + int m, /* max slot index */ + struct dt_lock ** dtlock) +{ + int fsi; /* free entry slot index */ + struct dtslot *t; + int si; + struct dt_lock *dtlck = *dtlock; + struct lv *lv; + int xsi, n; + + /* get free entry slot index */ + fsi = p->header.freelist; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[dtlck->index]; + + lv->offset = fsi; + + n = 1; + xsi = fsi; + + t = &p->slot[fsi]; + si = t->next; + + /* find the last/only segment */ + while (si < m && si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (struct dt_lock *) txLinelock(dtlck); + lv = & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + + t = &p->slot[si]; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; +} + + +/* + * NAME: dtModify + * + * FUNCTION: Modify the inode number part of a directory entry + * + * PARAMETERS: + * tid - Transaction id + * ip - Inode of parent directory + * key - Name of entry to be modified + * orig_ino - Original inode number expected in entry + * new_ino - New inode number to put into entry + * flag - JFS_RENAME + * + * RETURNS: + * -ESTALE - If entry found does not match orig_ino passed in + * -ENOENT - If no entry can be found to match key + * 0 - If successfully modified entry + */ +int dtModify(tid_t tid, struct inode *ip, + struct component_name * key, ino_t * orig_ino, ino_t new_ino, int flag) +{ + int rc; + s64 bn; + struct metapage *mp; + dtpage_t *p; + int index; + struct btstack btstack; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + s8 *stbl; + int entry_si; /* entry slot index */ + struct ldtentry *entry; + + /* + * search for the entry to modify: + * + * dtSearch() returns (leaf page pinned, index at which to modify). + */ + if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page of named entry + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (struct dt_lock *) & tlck->lock; + + /* get slot index of the entry */ + stbl = DT_GETSTBL(p); + entry_si = stbl[index]; + + /* linelock entry */ + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = entry_si; + lv->length = 1; + dtlck->index++; + + /* get the head/only segment */ + entry = (struct ldtentry *) & p->slot[entry_si]; + + /* substitute the inode number of the entry */ + entry->inumber = cpu_to_le32(new_ino); + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} diff --git a/fs/jfs/jfs_dtree.h b/fs/jfs/jfs_dtree.h new file mode 100644 index 000000000..fd4169e6e --- /dev/null +++ b/fs/jfs/jfs_dtree.h @@ -0,0 +1,269 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_DTREE +#define _H_JFS_DTREE + +/* + * jfs_dtree.h: directory B+-tree manager + */ + +#include "jfs_btree.h" + +typedef union { + struct { + tid_t tid; + struct inode *ip; + u32 ino; + } leaf; + pxd_t xd; +} ddata_t; + + +/* + * entry segment/slot + * + * an entry consists of type dependent head/only segment/slot and + * additional segments/slots linked vi next field; + * N.B. last/only segment of entry is terminated by next = -1; + */ +/* + * directory page slot + */ +struct dtslot { + s8 next; /* 1: */ + s8 cnt; /* 1: */ + __le16 name[15]; /* 30: */ +}; /* (32) */ + + +#define DATASLOTSIZE 16 +#define L2DATASLOTSIZE 4 +#define DTSLOTSIZE 32 +#define L2DTSLOTSIZE 5 +#define DTSLOTHDRSIZE 2 +#define DTSLOTDATASIZE 30 +#define DTSLOTDATALEN 15 + +/* + * internal node entry head/only segment + */ +struct idtentry { + pxd_t xd; /* 8: child extent descriptor */ + + s8 next; /* 1: */ + u8 namlen; /* 1: */ + __le16 name[11]; /* 22: 2-byte aligned */ +}; /* (32) */ + +#define DTIHDRSIZE 10 +#define DTIHDRDATALEN 11 + +/* compute number of slots for entry */ +#define NDTINTERNAL(klen) (DIV_ROUND_UP((4 + (klen)), 15)) + + +/* + * leaf node entry head/only segment + * + * For legacy filesystems, name contains 13 wchars -- no index field + */ +struct ldtentry { + __le32 inumber; /* 4: 4-byte aligned */ + s8 next; /* 1: */ + u8 namlen; /* 1: */ + __le16 name[11]; /* 22: 2-byte aligned */ + __le32 index; /* 4: index into dir_table */ +}; /* (32) */ + +#define DTLHDRSIZE 6 +#define DTLHDRDATALEN_LEGACY 13 /* Old (OS/2) format */ +#define DTLHDRDATALEN 11 + +/* + * dir_table used for directory traversal during readdir + */ + +/* + * Keep persistent index for directory entries + */ +#define DO_INDEX(INODE) (JFS_SBI((INODE)->i_sb)->mntflag & JFS_DIR_INDEX) + +/* + * Maximum entry in inline directory table + */ +#define MAX_INLINE_DIRTABLE_ENTRY 13 + +struct dir_table_slot { + u8 rsrvd; /* 1: */ + u8 flag; /* 1: 0 if free */ + u8 slot; /* 1: slot within leaf page of entry */ + u8 addr1; /* 1: upper 8 bits of leaf page address */ + __le32 addr2; /* 4: lower 32 bits of leaf page address -OR- + index of next entry when this entry was deleted */ +}; /* (8) */ + +/* + * flag values + */ +#define DIR_INDEX_VALID 1 +#define DIR_INDEX_FREE 0 + +#define DTSaddress(dir_table_slot, address64)\ +{\ + (dir_table_slot)->addr1 = ((u64)address64) >> 32;\ + (dir_table_slot)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} + +#define addressDTS(dts)\ + ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) ) + +/* compute number of slots for entry */ +#define NDTLEAF_LEGACY(klen) (DIV_ROUND_UP((2 + (klen)), 15)) +#define NDTLEAF NDTINTERNAL + + +/* + * directory root page (in-line in on-disk inode): + * + * cf. dtpage_t below. + */ +typedef union { + struct { + struct dasd DASD; /* 16: DASD limit/usage info */ + + u8 flag; /* 1: */ + u8 nextindex; /* 1: next free entry in stbl */ + s8 freecnt; /* 1: free count */ + s8 freelist; /* 1: freelist header */ + + __le32 idotdot; /* 4: parent inode number */ + + s8 stbl[8]; /* 8: sorted entry index table */ + } header; /* (32) */ + + struct dtslot slot[9]; +} dtroot_t; + +#define PARENT(IP) \ + (le32_to_cpu(JFS_IP(IP)->i_dtroot.header.idotdot)) + +#define DTROOTMAXSLOT 9 + +#define dtEmpty(IP) (JFS_IP(IP)->i_dtroot.header.nextindex == 0) + + +/* + * directory regular page: + * + * entry slot array of 32 byte slot + * + * sorted entry slot index table (stbl): + * contiguous slots at slot specified by stblindex, + * 1-byte per entry + * 512 byte block: 16 entry tbl (1 slot) + * 1024 byte block: 32 entry tbl (1 slot) + * 2048 byte block: 64 entry tbl (2 slot) + * 4096 byte block: 128 entry tbl (4 slot) + * + * data area: + * 512 byte block: 16 - 2 = 14 slot + * 1024 byte block: 32 - 2 = 30 slot + * 2048 byte block: 64 - 3 = 61 slot + * 4096 byte block: 128 - 5 = 123 slot + * + * N.B. index is 0-based; index fields refer to slot index + * except nextindex which refers to entry index in stbl; + * end of entry stot list or freelist is marked with -1. + */ +typedef union { + struct { + __le64 next; /* 8: next sibling */ + __le64 prev; /* 8: previous sibling */ + + u8 flag; /* 1: */ + u8 nextindex; /* 1: next entry index in stbl */ + s8 freecnt; /* 1: */ + s8 freelist; /* 1: slot index of head of freelist */ + + u8 maxslot; /* 1: number of slots in page slot[] */ + u8 stblindex; /* 1: slot index of start of stbl */ + u8 rsrvd[2]; /* 2: */ + + pxd_t self; /* 8: self pxd */ + } header; /* (32) */ + + struct dtslot slot[128]; +} dtpage_t; + +#define DTPAGEMAXSLOT 128 + +#define DT8THPGNODEBYTES 512 +#define DT8THPGNODETSLOTS 1 +#define DT8THPGNODESLOTS 16 + +#define DTQTRPGNODEBYTES 1024 +#define DTQTRPGNODETSLOTS 1 +#define DTQTRPGNODESLOTS 32 + +#define DTHALFPGNODEBYTES 2048 +#define DTHALFPGNODETSLOTS 2 +#define DTHALFPGNODESLOTS 64 + +#define DTFULLPGNODEBYTES 4096 +#define DTFULLPGNODETSLOTS 4 +#define DTFULLPGNODESLOTS 128 + +#define DTENTRYSTART 1 + +/* get sorted entry table of the page */ +#define DT_GETSTBL(p) ( ((p)->header.flag & BT_ROOT) ?\ + ((dtroot_t *)(p))->header.stbl : \ + (s8 *)&(p)->slot[(p)->header.stblindex] ) + +/* + * Flags for dtSearch + */ +#define JFS_CREATE 1 +#define JFS_LOOKUP 2 +#define JFS_REMOVE 3 +#define JFS_RENAME 4 + +/* + * Maximum file offset for directories. + */ +#define DIREND INT_MAX + +/* + * external declarations + */ +extern void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot); + +extern int dtSearch(struct inode *ip, struct component_name * key, + ino_t * data, struct btstack * btstack, int flag); + +extern int dtInsert(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * ino, struct btstack * btstack); + +extern int dtDelete(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * data, int flag); + +extern int dtModify(tid_t tid, struct inode *ip, struct component_name * key, + ino_t * orig_ino, ino_t new_ino, int flag); + +extern int jfs_readdir(struct file *file, struct dir_context *ctx); +#endif /* !_H_JFS_DTREE */ diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c new file mode 100644 index 000000000..2ae7d59ab --- /dev/null +++ b/fs/jfs/jfs_extent.c @@ -0,0 +1,651 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/quotaops.h> +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_extent.h" +#include "jfs_debug.h" + +/* + * forward references + */ +static int extBalloc(struct inode *, s64, s64 *, s64 *); +#ifdef _NOTYET +static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); +#endif +static s64 extRoundDown(s64 nb); + +#define DPD(a) (printk("(a): %d\n",(a))) +#define DPC(a) (printk("(a): %c\n",(a))) +#define DPL1(a) \ +{ \ + if ((a) >> 32) \ + printk("(a): %x%08x ",(a)); \ + else \ + printk("(a): %x ",(a) << 32); \ +} +#define DPL(a) \ +{ \ + if ((a) >> 32) \ + printk("(a): %x%08x\n",(a)); \ + else \ + printk("(a): %x\n",(a) << 32); \ +} + +#define DPD1(a) (printk("(a): %d ",(a))) +#define DPX(a) (printk("(a): %08x\n",(a))) +#define DPX1(a) (printk("(a): %08x ",(a))) +#define DPS(a) (printk("%s\n",(a))) +#define DPE(a) (printk("\nENTERING: %s\n",(a))) +#define DPE1(a) (printk("\nENTERING: %s",(a))) +#define DPS1(a) (printk(" %s ",(a))) + + +/* + * NAME: extAlloc() + * + * FUNCTION: allocate an extent for a specified page range within a + * file. + * + * PARAMETERS: + * ip - the inode of the file. + * xlen - requested extent length. + * pno - the starting page number with the file. + * xp - pointer to an xad. on entry, xad describes an + * extent that is used as an allocation hint if the + * xaddr of the xad is non-zero. on successful exit, + * the xad describes the newly allocated extent. + * abnr - bool indicating whether the newly allocated extent + * should be marked as allocated but not recorded. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int +extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 nxlen, nxaddr, xoff, hint, xaddr = 0; + int rc; + int xflag; + + /* This blocks if we are low on resources */ + txBeginAnon(ip->i_sb); + + /* Avoid race with jfs_commit_inode() */ + mutex_lock(&JFS_IP(ip)->commit_mutex); + + /* validate extent length */ + if (xlen > MAXXLEN) + xlen = MAXXLEN; + + /* get the page's starting extent offset */ + xoff = pno << sbi->l2nbperpage; + + /* check if an allocation hint was provided */ + if ((hint = addressXAD(xp))) { + /* get the size of the extent described by the hint */ + nxlen = lengthXAD(xp); + + /* check if the hint is for the portion of the file + * immediately previous to the current allocation + * request and if hint extent has the same abnr + * value as the current request. if so, we can + * extend the hint extent to include the current + * extent if we can allocate the blocks immediately + * following the hint extent. + */ + if (offsetXAD(xp) + nxlen == xoff && + abnr == ((xp->flag & XAD_NOTRECORDED) ? true : false)) + xaddr = hint + nxlen; + + /* adjust the hint to the last block of the extent */ + hint += (nxlen - 1); + } + + /* allocate the disk blocks for the extent. initially, extBalloc() + * will try to allocate disk blocks for the requested size (xlen). + * if this fails (xlen contiguous free blocks not available), it'll + * try to allocate a smaller number of blocks (producing a smaller + * extent), with this smaller number of blocks consisting of the + * requested number of blocks rounded down to the next smaller + * power of 2 number (i.e. 16 -> 8). it'll continue to round down + * and retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + */ + nxlen = xlen; + if ((rc = extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) { + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); + } + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, nxlen); + if (rc) { + dbFree(ip, nxaddr, (s64) nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; + } + + /* determine the value of the extent flag */ + xflag = abnr ? XAD_NOTRECORDED : 0; + + /* if we can extend the hint extent to cover the current request, + * extend it. otherwise, insert a new extent to + * cover the current request. + */ + if (xaddr && xaddr == nxaddr) + rc = xtExtend(0, ip, xoff, (int) nxlen, 0); + else + rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0); + + /* if the extend or insert failed, + * free the newly allocated blocks and return the error. + */ + if (rc) { + dbFree(ip, nxaddr, nxlen); + dquot_free_block(ip, nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); + } + + /* set the results of the extent allocation */ + XADaddress(xp, nxaddr); + XADlength(xp, nxlen); + XADoffset(xp, xoff); + xp->flag = xflag; + + mark_inode_dirty(ip); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + /* + * COMMIT_SyncList flags an anonymous tlock on page that is on + * sync list. + * We need to commit the inode to get the page written disk. + */ + if (test_and_clear_cflag(COMMIT_Synclist,ip)) + jfs_commit_inode(ip, 0); + + return (0); +} + + +#ifdef _NOTYET +/* + * NAME: extRealloc() + * + * FUNCTION: extend the allocation of a file extent containing a + * partial back last page. + * + * PARAMETERS: + * ip - the inode of the file. + * cp - cbuf for the partial backed last page. + * xlen - request size of the resulting extent. + * xp - pointer to an xad. on successful exit, the xad + * describes the newly allocated extent. + * abnr - bool indicating whether the newly allocated extent + * should be marked as allocated but not recorded. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, bool abnr) +{ + struct super_block *sb = ip->i_sb; + s64 xaddr, xlen, nxaddr, delta, xoff; + s64 ntail, nextend, ninsert; + int rc, nbperpage = JFS_SBI(sb)->nbperpage; + int xflag; + + /* This blocks if we are low on resources */ + txBeginAnon(ip->i_sb); + + mutex_lock(&JFS_IP(ip)->commit_mutex); + /* validate extent length */ + if (nxlen > MAXXLEN) + nxlen = MAXXLEN; + + /* get the extend (partial) page's disk block address and + * number of blocks. + */ + xaddr = addressXAD(xp); + xlen = lengthXAD(xp); + xoff = offsetXAD(xp); + + /* if the extend page is abnr and if the request is for + * the extent to be allocated and recorded, + * make the page allocated and recorded. + */ + if ((xp->flag & XAD_NOTRECORDED) && !abnr) { + xp->flag = 0; + if ((rc = xtUpdate(0, ip, xp))) + goto exit; + } + + /* try to allocated the request number of blocks for the + * extent. dbRealloc() first tries to satisfy the request + * by extending the allocation in place. otherwise, it will + * try to allocate a new set of blocks large enough for the + * request. in satisfying a request, dbReAlloc() may allocate + * less than what was request but will always allocate enough + * space as to satisfy the extend page. + */ + if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr))) + goto exit; + + /* Allocat blocks to quota. */ + rc = dquot_alloc_block(ip, nxlen); + if (rc) { + dbFree(ip, nxaddr, (s64) nxlen); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; + } + + delta = nxlen - xlen; + + /* check if the extend page is not abnr but the request is abnr + * and the allocated disk space is for more than one page. if this + * is the case, there is a miss match of abnr between the extend page + * and the one or more pages following the extend page. as a result, + * two extents will have to be manipulated. the first will be that + * of the extent of the extend page and will be manipulated thru + * an xtExtend() or an xtTailgate(), depending upon whether the + * disk allocation occurred as an inplace extension. the second + * extent will be manipulated (created) through an xtInsert() and + * will be for the pages following the extend page. + */ + if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) { + ntail = nbperpage; + nextend = ntail - xlen; + ninsert = nxlen - nbperpage; + + xflag = XAD_NOTRECORDED; + } else { + ntail = nxlen; + nextend = delta; + ninsert = 0; + + xflag = xp->flag; + } + + /* if we were able to extend the disk allocation in place, + * extend the extent. otherwise, move the extent to a + * new disk location. + */ + if (xaddr == nxaddr) { + /* extend the extent */ + if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { + dbFree(ip, xaddr + xlen, delta); + dquot_free_block(ip, nxlen); + goto exit; + } + } else { + /* + * move the extent to a new location: + * + * xtTailgate() accounts for relocated tail extent; + */ + if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { + dbFree(ip, nxaddr, nxlen); + dquot_free_block(ip, nxlen); + goto exit; + } + } + + + /* check if we need to also insert a new extent */ + if (ninsert) { + /* perform the insert. if it fails, free the blocks + * to be inserted and make it appear that we only did + * the xtExtend() or xtTailgate() above. + */ + xaddr = nxaddr + ntail; + if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert, + &xaddr, 0)) { + dbFree(ip, xaddr, (s64) ninsert); + delta = nextend; + nxlen = ntail; + xflag = 0; + } + } + + /* set the return results */ + XADaddress(xp, nxaddr); + XADlength(xp, nxlen); + XADoffset(xp, xoff); + xp->flag = xflag; + + mark_inode_dirty(ip); +exit: + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return (rc); +} +#endif /* _NOTYET */ + + +/* + * NAME: extHint() + * + * FUNCTION: produce an extent allocation hint for a file offset. + * + * PARAMETERS: + * ip - the inode of the file. + * offset - file offset for which the hint is needed. + * xp - pointer to the xad that is to be filled in with + * the hint. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + */ +int extHint(struct inode *ip, s64 offset, xad_t * xp) +{ + struct super_block *sb = ip->i_sb; + int nbperpage = JFS_SBI(sb)->nbperpage; + s64 prev; + int rc = 0; + s64 xaddr; + int xlen; + int xflag; + + /* init the hint as "no hint provided" */ + XADaddress(xp, 0); + + /* determine the starting extent offset of the page previous + * to the page containing the offset. + */ + prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage; + + /* if the offset is in the first page of the file, no hint provided. + */ + if (prev < 0) + goto out; + + rc = xtLookup(ip, prev, nbperpage, &xflag, &xaddr, &xlen, 0); + + if ((rc == 0) && xlen) { + if (xlen != nbperpage) { + jfs_error(ip->i_sb, "corrupt xtree\n"); + rc = -EIO; + } + XADaddress(xp, xaddr); + XADlength(xp, xlen); + XADoffset(xp, prev); + /* + * only preserve the abnr flag within the xad flags + * of the returned hint. + */ + xp->flag = xflag & XAD_NOTRECORDED; + } else + rc = 0; + +out: + return (rc); +} + + +/* + * NAME: extRecord() + * + * FUNCTION: change a page with a file from not recorded to recorded. + * + * PARAMETERS: + * ip - inode of the file. + * cp - cbuf of the file page. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extRecord(struct inode *ip, xad_t * xp) +{ + int rc; + + txBeginAnon(ip->i_sb); + + mutex_lock(&JFS_IP(ip)->commit_mutex); + + /* update the extent */ + rc = xtUpdate(0, ip, xp); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + return rc; +} + + +#ifdef _NOTYET +/* + * NAME: extFill() + * + * FUNCTION: allocate disk space for a file page that represents + * a file hole. + * + * PARAMETERS: + * ip - the inode of the file. + * cp - cbuf of the file page represent the hole. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +int extFill(struct inode *ip, xad_t * xp) +{ + int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; + s64 blkno = offsetXAD(xp) >> ip->i_blkbits; + +// assert(ISSPARSE(ip)); + + /* initialize the extent allocation hint */ + XADaddress(xp, 0); + + /* allocate an extent to fill the hole */ + if ((rc = extAlloc(ip, nbperpage, blkno, xp, false))) + return (rc); + + assert(lengthPXD(xp) == nbperpage); + + return (0); +} +#endif /* _NOTYET */ + + +/* + * NAME: extBalloc() + * + * FUNCTION: allocate disk blocks to form an extent. + * + * initially, we will try to allocate disk blocks for the + * requested size (nblocks). if this fails (nblocks + * contiguous free blocks not available), we'll try to allocate + * a smaller number of blocks (producing a smaller extent), with + * this smaller number of blocks consisting of the requested + * number of blocks rounded down to the next smaller power of 2 + * number (i.e. 16 -> 8). we'll continue to round down and + * retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + * + * PARAMETERS: + * ip - the inode of the file. + * hint - disk block number to be used as an allocation hint. + * *nblocks - pointer to an s64 value. on entry, this value specifies + * the desired number of block to be allocated. on successful + * exit, this value is set to the number of blocks actually + * allocated. + * blkno - pointer to a block address that is filled in on successful + * return with the starting block number of the newly + * allocated block range. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +static int +extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 nb, nblks, daddr, max; + int rc, nbperpage = sbi->nbperpage; + struct bmap *bmp = sbi->bmap; + int ag; + + /* get the number of blocks to initially attempt to allocate. + * we'll first try the number of blocks requested unless this + * number is greater than the maximum number of contiguous free + * blocks in the map. in that case, we'll start off with the + * maximum free. + */ + max = (s64) 1 << bmp->db_maxfreebud; + if (*nblocks >= max && *nblocks > nbperpage) + nb = nblks = (max > nbperpage) ? max : nbperpage; + else + nb = nblks = *nblocks; + + /* try to allocate blocks */ + while ((rc = dbAlloc(ip, hint, nb, &daddr)) != 0) { + /* if something other than an out of space error, + * stop and return this error. + */ + if (rc != -ENOSPC) + return (rc); + + /* decrease the allocation request size */ + nb = min(nblks, extRoundDown(nb)); + + /* give up if we cannot cover a page */ + if (nb < nbperpage) + return (rc); + } + + *nblocks = nb; + *blkno = daddr; + + if (S_ISREG(ip->i_mode) && (ji->fileset == FILESYSTEM_I)) { + ag = BLKTOAG(daddr, sbi); + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag == -1) { + atomic_inc(&bmp->db_active[ag]); + ji->active_ag = ag; + } else if (ji->active_ag != ag) { + atomic_dec(&bmp->db_active[ji->active_ag]); + atomic_inc(&bmp->db_active[ag]); + ji->active_ag = ag; + } + spin_unlock_irq(&ji->ag_lock); + } + + return (0); +} + + +#ifdef _NOTYET +/* + * NAME: extBrealloc() + * + * FUNCTION: attempt to extend an extent's allocation. + * + * Initially, we will try to extend the extent's allocation + * in place. If this fails, we'll try to move the extent + * to a new set of blocks. If moving the extent, we initially + * will try to allocate disk blocks for the requested size + * (newnblks). if this fails (new contiguous free blocks not + * available), we'll try to allocate a smaller number of + * blocks (producing a smaller extent), with this smaller + * number of blocks consisting of the requested number of + * blocks rounded down to the next smaller power of 2 + * number (i.e. 16 -> 8). We'll continue to round down and + * retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + * + * PARAMETERS: + * ip - the inode of the file. + * blkno - starting block number of the extents current allocation. + * nblks - number of blocks within the extents current allocation. + * newnblks - pointer to a s64 value. on entry, this value is the + * the new desired extent size (number of blocks). on + * successful exit, this value is set to the extent's actual + * new size (new number of blocks). + * newblkno - the starting block number of the extents new allocation. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOSPC - insufficient disk resources. + */ +static int +extBrealloc(struct inode *ip, + s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno) +{ + int rc; + + /* try to extend in place */ + if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) { + *newblkno = blkno; + return (0); + } else { + if (rc != -ENOSPC) + return (rc); + } + + /* in place extension not possible. + * try to move the extent to a new set of blocks. + */ + return (extBalloc(ip, blkno, newnblks, newblkno)); +} +#endif /* _NOTYET */ + + +/* + * NAME: extRoundDown() + * + * FUNCTION: round down a specified number of blocks to the next + * smallest power of 2 number. + * + * PARAMETERS: + * nb - the inode of the file. + * + * RETURN VALUES: + * next smallest power of 2 number. + */ +static s64 extRoundDown(s64 nb) +{ + int i; + u64 m, k; + + for (i = 0, m = (u64) 1 << 63; i < 64; i++, m >>= 1) { + if (m & nb) + break; + } + + i = 63 - i; + k = (u64) 1 << i; + k = ((k - 1) & nb) ? k : k >> 1; + + return (k); +} diff --git a/fs/jfs/jfs_extent.h b/fs/jfs/jfs_extent.h new file mode 100644 index 000000000..b567e12c5 --- /dev/null +++ b/fs/jfs/jfs_extent.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_EXTENT +#define _H_JFS_EXTENT + +/* get block allocation allocation hint as location of disk inode */ +#define INOHINT(ip) \ + (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) + +extern int extAlloc(struct inode *, s64, s64, xad_t *, bool); +extern int extFill(struct inode *, xad_t *); +extern int extHint(struct inode *, s64, xad_t *); +extern int extRealloc(struct inode *, s64, xad_t *, bool); +extern int extRecord(struct inode *, xad_t *); + +#endif /* _H_JFS_EXTENT */ diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h new file mode 100644 index 000000000..415bfa906 --- /dev/null +++ b/fs/jfs/jfs_filsys.h @@ -0,0 +1,286 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_FILSYS +#define _H_JFS_FILSYS + +/* + * jfs_filsys.h + * + * file system (implementation-dependent) constants + * + * refer to <limits.h> for system wide implementation-dependent constants + */ + +/* + * file system option (superblock flag) + */ + +/* directory option */ +#define JFS_UNICODE 0x00000001 /* unicode name */ + +/* mount time flags for error handling */ +#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ +#define JFS_ERR_CONTINUE 0x00000004 /* continue */ +#define JFS_ERR_PANIC 0x00000008 /* panic */ + +/* Quota support */ +#define JFS_USRQUOTA 0x00000010 +#define JFS_GRPQUOTA 0x00000020 + +/* mount time flag to disable journaling to disk */ +#define JFS_NOINTEGRITY 0x00000040 + +/* mount time flag to enable TRIM to ssd disks */ +#define JFS_DISCARD 0x00000080 + +/* commit option */ +#define JFS_COMMIT 0x00000f00 /* commit option mask */ +#define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */ +#define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */ +#define JFS_TMPFS 0x00000400 /* temporary file system - + * do not log/commit: + * Never implemented + */ + +/* log logical volume option */ +#define JFS_INLINELOG 0x00000800 /* inline log within file system */ +#define JFS_INLINEMOVE 0x00001000 /* inline log being moved */ + +/* Secondary aggregate inode table */ +#define JFS_BAD_SAIT 0x00010000 /* current secondary ait is bad */ + +/* sparse regular file support */ +#define JFS_SPARSE 0x00020000 /* sparse regular file */ + +/* DASD Limits F226941 */ +#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */ +#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */ + +/* big endian flag */ +#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */ + +/* Directory index */ +#define JFS_DIR_INDEX 0x00200000 /* Persistent index for */ + +/* platform options */ +#define JFS_LINUX 0x10000000 /* Linux support */ +#define JFS_DFS 0x20000000 /* DCE DFS LFS support */ +/* Never implemented */ + +#define JFS_OS2 0x40000000 /* OS/2 support */ +/* case-insensitive name/directory support */ + +#define JFS_AIX 0x80000000 /* AIX support */ + +/* + * buffer cache configuration + */ +/* page size */ +#ifdef PSIZE +#undef PSIZE +#endif +#define PSIZE 4096 /* page size (in byte) */ +#define L2PSIZE 12 /* log2(PSIZE) */ +#define POFFSET 4095 /* offset within page */ + +/* buffer page size */ +#define BPSIZE PSIZE + +/* + * fs fundamental size + * + * PSIZE >= file system block size >= PBSIZE >= DISIZE + */ +#define PBSIZE 512 /* physical block size (in byte) */ +#define L2PBSIZE 9 /* log2(PBSIZE) */ + +#define DISIZE 512 /* on-disk inode size (in byte) */ +#define L2DISIZE 9 /* log2(DISIZE) */ + +#define IDATASIZE 256 /* inode inline data size */ +#define IXATTRSIZE 128 /* inode inline extended attribute size */ + +#define XTPAGE_SIZE 4096 +#define log2_PAGESIZE 12 + +#define IAG_SIZE 4096 +#define IAG_EXTENT_SIZE 4096 +#define INOSPERIAG 4096 /* number of disk inodes per iag */ +#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ +#define INOSPEREXT 32 /* number of disk inode per extent */ +#define L2INOSPEREXT 5 /* l2 number of disk inode per extent */ +#define IXSIZE (DISIZE * INOSPEREXT) /* inode extent size */ +#define INOSPERPAGE 8 /* number of disk inodes per 4K page */ +#define L2INOSPERPAGE 3 /* log2(INOSPERPAGE) */ + +#define IAGFREELIST_LWM 64 + +#define INODE_EXTENT_SIZE IXSIZE /* inode extent size */ +#define NUM_INODE_PER_EXTENT INOSPEREXT +#define NUM_INODE_PER_IAG INOSPERIAG + +#define MINBLOCKSIZE 512 +#define MAXBLOCKSIZE 4096 +#define MAXFILESIZE ((s64)1 << 52) + +#define JFS_LINK_MAX 0xffffffff + +/* Minimum number of bytes supported for a JFS partition */ +#define MINJFS (0x1000000) +#define MINJFSTEXT "16" + +/* + * file system block size -> physical block size + */ +#define LBOFFSET(x) ((x) & (PBSIZE - 1)) +#define LBNUMBER(x) ((x) >> L2PBSIZE) +#define LBLK2PBLK(sb,b) ((b) << (sb->s_blocksize_bits - L2PBSIZE)) +#define PBLK2LBLK(sb,b) ((b) >> (sb->s_blocksize_bits - L2PBSIZE)) +/* size in byte -> last page number */ +#define SIZE2PN(size) ( ((s64)((size) - 1)) >> (L2PSIZE) ) +/* size in byte -> last file system block number */ +#define SIZE2BN(size, l2bsize) ( ((s64)((size) - 1)) >> (l2bsize) ) + +/* + * fixed physical block address (physical block size = 512 byte) + * + * NOTE: since we can't guarantee a physical block size of 512 bytes the use of + * these macros should be removed and the byte offset macros used instead. + */ +#define SUPER1_B 64 /* primary superblock */ +#define AIMAP_B (SUPER1_B + 8) /* 1st extent of aggregate inode map */ +#define AITBL_B (AIMAP_B + 16) /* + * 1st extent of aggregate inode table + */ +#define SUPER2_B (AITBL_B + 32) /* 2ndary superblock pbn */ +#define BMAP_B (SUPER2_B + 8) /* block allocation map */ + +/* + * SIZE_OF_SUPER defines the total amount of space reserved on disk for the + * superblock. This is not the same as the superblock structure, since all of + * this space is not currently being used. + */ +#define SIZE_OF_SUPER PSIZE + +/* + * SIZE_OF_AG_TABLE defines the amount of space reserved to hold the AG table + */ +#define SIZE_OF_AG_TABLE PSIZE + +/* + * SIZE_OF_MAP_PAGE defines the amount of disk space reserved for each page of + * the inode allocation map (to hold iag) + */ +#define SIZE_OF_MAP_PAGE PSIZE + +/* + * fixed byte offset address + */ +#define SUPER1_OFF 0x8000 /* primary superblock */ +#define AIMAP_OFF (SUPER1_OFF + SIZE_OF_SUPER) + /* + * Control page of aggregate inode map + * followed by 1st extent of map + */ +#define AITBL_OFF (AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1)) + /* + * 1st extent of aggregate inode table + */ +#define SUPER2_OFF (AITBL_OFF + INODE_EXTENT_SIZE) + /* + * secondary superblock + */ +#define BMAP_OFF (SUPER2_OFF + SIZE_OF_SUPER) + /* + * block allocation map + */ + +/* + * The following macro is used to indicate the number of reserved disk blocks at + * the front of an aggregate, in terms of physical blocks. This value is + * currently defined to be 32K. This turns out to be the same as the primary + * superblock's address, since it directly follows the reserved blocks. + */ +#define AGGR_RSVD_BLOCKS SUPER1_B + +/* + * The following macro is used to indicate the number of reserved bytes at the + * front of an aggregate. This value is currently defined to be 32K. This + * turns out to be the same as the primary superblock's byte offset, since it + * directly follows the reserved blocks. + */ +#define AGGR_RSVD_BYTES SUPER1_OFF + +/* + * The following macro defines the byte offset for the first inode extent in + * the aggregate inode table. This allows us to find the self inode to find the + * rest of the table. Currently this value is 44K. + */ +#define AGGR_INODE_TABLE_START AITBL_OFF + +/* + * fixed reserved inode number + */ +/* aggregate inode */ +#define AGGR_RESERVED_I 0 /* aggregate inode (reserved) */ +#define AGGREGATE_I 1 /* aggregate inode map inode */ +#define BMAP_I 2 /* aggregate block allocation map inode */ +#define LOG_I 3 /* aggregate inline log inode */ +#define BADBLOCK_I 4 /* aggregate bad block inode */ +#define FILESYSTEM_I 16 /* 1st/only fileset inode in ait: + * fileset inode map inode + */ + +/* per fileset inode */ +#define FILESET_RSVD_I 0 /* fileset inode (reserved) */ +#define FILESET_EXT_I 1 /* fileset inode extension */ +#define ROOT_I 2 /* fileset root inode */ +#define ACL_I 3 /* fileset ACL inode */ + +#define FILESET_OBJECT_I 4 /* the first fileset inode available for a file + * or directory or link... + */ +#define FIRST_FILESET_INO 16 /* the first aggregate inode which describes + * an inode. (To fsck this is also the first + * inode in part 2 of the agg inode table.) + */ + +/* + * directory configuration + */ +#define JFS_NAME_MAX 255 +#define JFS_PATH_MAX BPSIZE + + +/* + * file system state (superblock state) + */ +#define FM_CLEAN 0x00000000 /* file system is unmounted and clean */ +#define FM_MOUNT 0x00000001 /* file system is mounted cleanly */ +#define FM_DIRTY 0x00000002 /* file system was not unmounted and clean + * when mounted or + * commit failure occurred while being mounted: + * fsck() must be run to repair + */ +#define FM_LOGREDO 0x00000004 /* log based recovery (logredo()) failed: + * fsck() must be run to repair + */ +#define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */ +#define FM_STATE_MAX 0x0000000f /* max value of s_state */ + +#endif /* _H_JFS_FILSYS */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c new file mode 100644 index 000000000..93e8c590f --- /dev/null +++ b/fs/jfs/jfs_imap.c @@ -0,0 +1,3170 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_imap.c: inode allocation map manager + * + * Serialization: + * Each AG has a simple lock which is used to control the serialization of + * the AG level lists. This lock should be taken first whenever an AG + * level list will be modified or accessed. + * + * Each IAG is locked by obtaining the buffer for the IAG page. + * + * There is also a inode lock for the inode map inode. A read lock needs to + * be taken whenever an IAG is read from the map or the global level + * information is read. A write lock needs to be taken whenever the global + * level information is modified or an atomic operation needs to be used. + * + * If more than one IAG is read at one time, the read lock may not + * be given up until all of the IAG's are read. Otherwise, a deadlock + * may occur when trying to obtain the read lock while another thread + * holding the read lock is waiting on the IAG already being held. + * + * The control page of the inode map is read into memory by diMount(). + * Thereafter it should only be modified in memory and then it will be + * written out when the filesystem is unmounted by diUnmount(). + */ + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/slab.h> + +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_dinode.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * imap locks + */ +/* iag free list lock */ +#define IAGFREE_LOCK_INIT(imap) mutex_init(&imap->im_freelock) +#define IAGFREE_LOCK(imap) mutex_lock(&imap->im_freelock) +#define IAGFREE_UNLOCK(imap) mutex_unlock(&imap->im_freelock) + +/* per ag iag list locks */ +#define AG_LOCK_INIT(imap,index) mutex_init(&(imap->im_aglock[index])) +#define AG_LOCK(imap,agno) mutex_lock(&imap->im_aglock[agno]) +#define AG_UNLOCK(imap,agno) mutex_unlock(&imap->im_aglock[agno]) + +/* + * forward references + */ +static int diAllocAG(struct inomap *, int, bool, struct inode *); +static int diAllocAny(struct inomap *, int, bool, struct inode *); +static int diAllocBit(struct inomap *, struct iag *, int); +static int diAllocExt(struct inomap *, int, struct inode *); +static int diAllocIno(struct inomap *, int, struct inode *); +static int diFindFree(u32, int); +static int diNewExt(struct inomap *, struct iag *, int); +static int diNewIAG(struct inomap *, int *, int, struct metapage **); +static void duplicateIXtree(struct super_block *, s64, int, s64 *); + +static int diIAGRead(struct inomap * imap, int, struct metapage **); +static int copy_from_dinode(struct dinode *, struct inode *); +static void copy_to_dinode(struct dinode *, struct inode *); + +/* + * NAME: diMount() + * + * FUNCTION: initialize the incore inode map control structures for + * a fileset or aggregate init time. + * + * the inode map's control structure (dinomap) is + * brought in from disk and placed in virtual memory. + * + * PARAMETERS: + * ipimap - pointer to inode map inode for the aggregate or fileset. + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. + */ +int diMount(struct inode *ipimap) +{ + struct inomap *imap; + struct metapage *mp; + int index; + struct dinomap_disk *dinom_le; + + /* + * allocate/initialize the in-memory inode map control structure + */ + /* allocate the in-memory inode map control structure. */ + imap = kmalloc(sizeof(struct inomap), GFP_KERNEL); + if (imap == NULL) { + jfs_err("diMount: kmalloc returned NULL!"); + return -ENOMEM; + } + + /* read the on-disk inode map control structure. */ + + mp = read_metapage(ipimap, + IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + kfree(imap); + return -EIO; + } + + /* copy the on-disk version to the in-memory version. */ + dinom_le = (struct dinomap_disk *) mp->data; + imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag); + imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag); + atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos)); + atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree)); + imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext); + imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext); + for (index = 0; index < MAXAG; index++) { + imap->im_agctl[index].inofree = + le32_to_cpu(dinom_le->in_agctl[index].inofree); + imap->im_agctl[index].extfree = + le32_to_cpu(dinom_le->in_agctl[index].extfree); + imap->im_agctl[index].numinos = + le32_to_cpu(dinom_le->in_agctl[index].numinos); + imap->im_agctl[index].numfree = + le32_to_cpu(dinom_le->in_agctl[index].numfree); + } + + /* release the buffer. */ + release_metapage(mp); + + /* + * allocate/initialize inode allocation map locks + */ + /* allocate and init iag free list lock */ + IAGFREE_LOCK_INIT(imap); + + /* allocate and init ag list locks */ + for (index = 0; index < MAXAG; index++) { + AG_LOCK_INIT(imap, index); + } + + /* bind the inode map inode and inode map control structure + * to each other. + */ + imap->im_ipimap = ipimap; + JFS_IP(ipimap)->i_imap = imap; + + return (0); +} + + +/* + * NAME: diUnmount() + * + * FUNCTION: write to disk the incore inode map control structures for + * a fileset or aggregate at unmount time. + * + * PARAMETERS: + * ipimap - pointer to inode map inode for the aggregate or fileset. + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient free virtual memory. + * -EIO - i/o error. + */ +int diUnmount(struct inode *ipimap, int mounterror) +{ + struct inomap *imap = JFS_IP(ipimap)->i_imap; + + /* + * update the on-disk inode map control structure + */ + + if (!(mounterror || isReadOnly(ipimap))) + diSync(ipimap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipimap->i_mapping, 0); + + /* + * free in-memory control structure + */ + kfree(imap); + + return (0); +} + + +/* + * diSync() + */ +int diSync(struct inode *ipimap) +{ + struct dinomap_disk *dinom_le; + struct inomap *imp = JFS_IP(ipimap)->i_imap; + struct metapage *mp; + int index; + + /* + * write imap global conrol page + */ + /* read the on-disk inode map control structure */ + mp = get_metapage(ipimap, + IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jfs_err("diSync: get_metapage failed!"); + return -EIO; + } + + /* copy the in-memory version to the on-disk version */ + dinom_le = (struct dinomap_disk *) mp->data; + dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag); + dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag); + dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos)); + dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree)); + dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext); + dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext); + for (index = 0; index < MAXAG; index++) { + dinom_le->in_agctl[index].inofree = + cpu_to_le32(imp->im_agctl[index].inofree); + dinom_le->in_agctl[index].extfree = + cpu_to_le32(imp->im_agctl[index].extfree); + dinom_le->in_agctl[index].numinos = + cpu_to_le32(imp->im_agctl[index].numinos); + dinom_le->in_agctl[index].numfree = + cpu_to_le32(imp->im_agctl[index].numfree); + } + + /* write out the control structure */ + write_metapage(mp); + + /* + * write out dirty pages of imap + */ + filemap_write_and_wait(ipimap->i_mapping); + + diWriteSpecial(ipimap, 0); + + return (0); +} + + +/* + * NAME: diRead() + * + * FUNCTION: initialize an incore inode from disk. + * + * on entry, the specifed incore inode should itself + * specify the disk inode number corresponding to the + * incore inode (i.e. i_number should be initialized). + * + * this routine handles incore inode initialization for + * both "special" and "regular" inodes. special inodes + * are those required early in the mount process and + * require special handling since much of the file system + * is not yet initialized. these "special" inodes are + * identified by a NULL inode map inode pointer and are + * actually initialized by a call to diReadSpecial(). + * + * for regular inodes, the iag describing the disk inode + * is read from disk to determine the inode extent address + * for the disk inode. with the inode extent address in + * hand, the page of the extent that contains the disk + * inode is read and the disk inode is copied to the + * incore inode. + * + * PARAMETERS: + * ip - pointer to incore inode to be initialized from disk. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + * -ENOMEM - insufficient memory + * + */ +int diRead(struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int iagno, ino, extno, rc; + struct inode *ipimap; + struct dinode *dp; + struct iag *iagp; + struct metapage *mp; + s64 blkno, agstart; + struct inomap *imap; + int block_offset; + int inodes_left; + unsigned long pageno; + int rel_inode; + + jfs_info("diRead: ino = %ld", ip->i_ino); + + ipimap = sbi->ipimap; + JFS_IP(ip)->ipimap = ipimap; + + /* determine the iag number for this inode (number) */ + iagno = INOTOIAG(ip->i_ino); + + /* read the iag */ + imap = JFS_IP(ipimap)->i_imap; + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + rc = diIAGRead(imap, iagno, &mp); + IREAD_UNLOCK(ipimap); + if (rc) { + jfs_err("diRead: diIAGRead returned %d", rc); + return (rc); + } + + iagp = (struct iag *) mp->data; + + /* determine inode extent that holds the disk inode */ + ino = ip->i_ino & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + + if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) || + (addressPXD(&iagp->inoext[extno]) == 0)) { + release_metapage(mp); + return -ESTALE; + } + + /* get disk block number of the page within the inode extent + * that holds the disk inode. + */ + blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage); + + /* get the ag for the iag */ + agstart = le64_to_cpu(iagp->agstart); + + release_metapage(mp); + + rel_inode = (ino & (INOSPERPAGE - 1)); + pageno = blkno >> sbi->l2nbperpage; + + if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { + /* + * OS/2 didn't always align inode extents on page boundaries + */ + inodes_left = + (sbi->nbperpage - block_offset) << sbi->l2niperblk; + + if (rel_inode < inodes_left) + rel_inode += block_offset << sbi->l2niperblk; + else { + pageno += 1; + rel_inode -= inodes_left; + } + } + + /* read the page of disk inode */ + mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); + if (!mp) { + jfs_err("diRead: read_metapage failed"); + return -EIO; + } + + /* locate the disk inode requested */ + dp = (struct dinode *) mp->data; + dp += rel_inode; + + if (ip->i_ino != le32_to_cpu(dp->di_number)) { + jfs_error(ip->i_sb, "i_ino != di_number\n"); + rc = -EIO; + } else if (le32_to_cpu(dp->di_nlink) == 0) + rc = -ESTALE; + else + /* copy the disk inode to the in-memory inode */ + rc = copy_from_dinode(dp, ip); + + release_metapage(mp); + + /* set the ag for the inode */ + JFS_IP(ip)->agstart = agstart; + JFS_IP(ip)->active_ag = -1; + + return (rc); +} + + +/* + * NAME: diReadSpecial() + * + * FUNCTION: initialize a 'special' inode from disk. + * + * this routines handles aggregate level inodes. The + * inode cache cannot differentiate between the + * aggregate inodes and the filesystem inodes, so we + * handle these here. We don't actually use the aggregate + * inode map, since these inodes are at a fixed location + * and in some cases the aggregate inode map isn't initialized + * yet. + * + * PARAMETERS: + * sb - filesystem superblock + * inum - aggregate inode number + * secondary - 1 if secondary aggregate inode table + * + * RETURN VALUES: + * new inode - success + * NULL - i/o error. + */ +struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + uint address; + struct dinode *dp; + struct inode *ip; + struct metapage *mp; + + ip = new_inode(sb); + if (ip == NULL) { + jfs_err("diReadSpecial: new_inode returned NULL!"); + return ip; + } + + if (secondary) { + address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; + JFS_IP(ip)->ipimap = sbi->ipaimap2; + } else { + address = AITBL_OFF >> L2PSIZE; + JFS_IP(ip)->ipimap = sbi->ipaimap; + } + + ASSERT(inum < INOSPEREXT); + + ip->i_ino = inum; + + address += inum >> 3; /* 8 inodes per 4K page */ + + /* read the page of fixed disk inode (AIT) in raw mode */ + mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); + if (mp == NULL) { + set_nlink(ip, 1); /* Don't want iput() deleting it */ + iput(ip); + return (NULL); + } + + /* get the pointer to the disk inode of interest */ + dp = (struct dinode *) (mp->data); + dp += inum % 8; /* 8 inodes per 4K page */ + + /* copy on-disk inode to in-memory inode */ + if ((copy_from_dinode(dp, ip)) != 0) { + /* handle bad return by returning NULL for ip */ + set_nlink(ip, 1); /* Don't want iput() deleting it */ + iput(ip); + /* release the page */ + release_metapage(mp); + return (NULL); + + } + + ip->i_mapping->a_ops = &jfs_metapage_aops; + mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS); + + /* Allocations to metadata inodes should not affect quotas */ + ip->i_flags |= S_NOQUOTA; + + if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) { + sbi->gengen = le32_to_cpu(dp->di_gengen); + sbi->inostamp = le32_to_cpu(dp->di_inostamp); + } + + /* release the page */ + release_metapage(mp); + + inode_fake_hash(ip); + + return (ip); +} + +/* + * NAME: diWriteSpecial() + * + * FUNCTION: Write the special inode to disk + * + * PARAMETERS: + * ip - special inode + * secondary - 1 if secondary aggregate inode table + * + * RETURN VALUES: none + */ + +void diWriteSpecial(struct inode *ip, int secondary) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + uint address; + struct dinode *dp; + ino_t inum = ip->i_ino; + struct metapage *mp; + + if (secondary) + address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; + else + address = AITBL_OFF >> L2PSIZE; + + ASSERT(inum < INOSPEREXT); + + address += inum >> 3; /* 8 inodes per 4K page */ + + /* read the page of fixed disk inode (AIT) in raw mode */ + mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); + if (mp == NULL) { + jfs_err("diWriteSpecial: failed to read aggregate inode extent!"); + return; + } + + /* get the pointer to the disk inode of interest */ + dp = (struct dinode *) (mp->data); + dp += inum % 8; /* 8 inodes per 4K page */ + + /* copy on-disk inode to in-memory inode */ + copy_to_dinode(dp, ip); + memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288); + + if (inum == FILESYSTEM_I) + dp->di_gengen = cpu_to_le32(sbi->gengen); + + /* write the page */ + write_metapage(mp); +} + +/* + * NAME: diFreeSpecial() + * + * FUNCTION: Free allocated space for special inode + */ +void diFreeSpecial(struct inode *ip) +{ + if (ip == NULL) { + jfs_err("diFreeSpecial called with NULL ip!"); + return; + } + filemap_write_and_wait(ip->i_mapping); + truncate_inode_pages(ip->i_mapping, 0); + iput(ip); +} + + + +/* + * NAME: diWrite() + * + * FUNCTION: write the on-disk inode portion of the in-memory inode + * to its corresponding on-disk inode. + * + * on entry, the specifed incore inode should itself + * specify the disk inode number corresponding to the + * incore inode (i.e. i_number should be initialized). + * + * the inode contains the inode extent address for the disk + * inode. with the inode extent address in hand, the + * page of the extent that contains the disk inode is + * read and the disk inode portion of the incore inode + * is copied to the disk inode. + * + * PARAMETERS: + * tid - transacation id + * ip - pointer to incore inode to be written to the inode extent. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + */ +int diWrite(tid_t tid, struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + s32 ino; + struct dinode *dp; + s64 blkno; + int block_offset; + int inodes_left; + struct metapage *mp; + unsigned long pageno; + int rel_inode; + int dioffset; + struct inode *ipimap; + uint type; + lid_t lid; + struct tlock *ditlck, *tlck; + struct linelock *dilinelock, *ilinelock; + struct lv *lv; + int n; + + ipimap = jfs_ip->ipimap; + + ino = ip->i_ino & (INOSPERIAG - 1); + + if (!addressPXD(&(jfs_ip->ixpxd)) || + (lengthPXD(&(jfs_ip->ixpxd)) != + JFS_IP(ipimap)->i_imap->im_nbperiext)) { + jfs_error(ip->i_sb, "ixpxd invalid\n"); + return -EIO; + } + + /* + * read the page of disk inode containing the specified inode: + */ + /* compute the block address of the page */ + blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage); + + rel_inode = (ino & (INOSPERPAGE - 1)); + pageno = blkno >> sbi->l2nbperpage; + + if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { + /* + * OS/2 didn't always align inode extents on page boundaries + */ + inodes_left = + (sbi->nbperpage - block_offset) << sbi->l2niperblk; + + if (rel_inode < inodes_left) + rel_inode += block_offset << sbi->l2niperblk; + else { + pageno += 1; + rel_inode -= inodes_left; + } + } + /* read the page of disk inode */ + retry: + mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); + if (!mp) + return -EIO; + + /* get the pointer to the disk inode */ + dp = (struct dinode *) mp->data; + dp += rel_inode; + + dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE; + + /* + * acquire transaction lock on the on-disk inode; + * N.B. tlock is acquired on ipimap not ip; + */ + if ((ditlck = + txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL) + goto retry; + dilinelock = (struct linelock *) & ditlck->lock; + + /* + * copy btree root from in-memory inode to on-disk inode + * + * (tlock is taken from inline B+-tree root in in-memory + * inode when the B+-tree root is updated, which is pointed + * by jfs_ip->blid as well as being on tx tlock list) + * + * further processing of btree root is based on the copy + * in in-memory inode, where txLog() will log from, and, + * for xtree root, txUpdateMap() will update map and reset + * XAD_NEW bit; + */ + + if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) { + /* + * This is the special xtree inside the directory for storing + * the directory table + */ + xtpage_t *p, *xp; + xad_t *xad; + + jfs_ip->xtlid = 0; + tlck = lid_to_tlock(lid); + assert(tlck->type & tlckXTREE); + tlck->type |= tlckBTROOT; + tlck->mp = mp; + ilinelock = (struct linelock *) & tlck->lock; + + /* + * copy xtree root from inode to dinode: + */ + p = &jfs_ip->i_xtroot; + xp = (xtpage_t *) &dp->di_dirtable; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], + lv->length << L2XTSLOTSIZE); + } + + /* reset on-disk (metadata page) xtree XAD_NEW bit */ + xad = &xp->xad[XTENTRYSTART]; + for (n = XTENTRYSTART; + n < le16_to_cpu(xp->header.nextindex); n++, xad++) + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + } + + if ((lid = jfs_ip->blid) == 0) + goto inlineData; + jfs_ip->blid = 0; + + tlck = lid_to_tlock(lid); + type = tlck->type; + tlck->type |= tlckBTROOT; + tlck->mp = mp; + ilinelock = (struct linelock *) & tlck->lock; + + /* + * regular file: 16 byte (XAD slot) granularity + */ + if (type & tlckXTREE) { + xtpage_t *p, *xp; + xad_t *xad; + + /* + * copy xtree root from inode to dinode: + */ + p = &jfs_ip->i_xtroot; + xp = &dp->di_xtroot; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], + lv->length << L2XTSLOTSIZE); + } + + /* reset on-disk (metadata page) xtree XAD_NEW bit */ + xad = &xp->xad[XTENTRYSTART]; + for (n = XTENTRYSTART; + n < le16_to_cpu(xp->header.nextindex); n++, xad++) + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + } + /* + * directory: 32 byte (directory entry slot) granularity + */ + else if (type & tlckDTREE) { + dtpage_t *p, *xp; + + /* + * copy dtree root from inode to dinode: + */ + p = (dtpage_t *) &jfs_ip->i_dtroot; + xp = (dtpage_t *) & dp->di_dtroot; + lv = ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->slot[lv->offset], &p->slot[lv->offset], + lv->length << L2DTSLOTSIZE); + } + } else { + jfs_err("diWrite: UFO tlock"); + } + + inlineData: + /* + * copy inline symlink from in-memory inode to on-disk inode + */ + if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) { + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE; + lv->length = 2; + memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE); + dilinelock->index++; + } + /* + * copy inline data from in-memory inode to on-disk inode: + * 128 byte slot granularity + */ + if (test_cflag(COMMIT_Inlineea, ip)) { + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE; + lv->length = 1; + memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE); + dilinelock->index++; + + clear_cflag(COMMIT_Inlineea, ip); + } + + /* + * lock/copy inode base: 128 byte slot granularity + */ + lv = & dilinelock->lv[dilinelock->index]; + lv->offset = dioffset >> L2INODESLOTSIZE; + copy_to_dinode(dp, ip); + if (test_and_clear_cflag(COMMIT_Dirtable, ip)) { + lv->length = 2; + memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96); + } else + lv->length = 1; + dilinelock->index++; + + /* release the buffer holding the updated on-disk inode. + * the buffer will be later written by commit processing. + */ + write_metapage(mp); + + return (rc); +} + + +/* + * NAME: diFree(ip) + * + * FUNCTION: free a specified inode from the inode working map + * for a fileset or aggregate. + * + * if the inode to be freed represents the first (only) + * free inode within the iag, the iag will be placed on + * the ag free inode list. + * + * freeing the inode will cause the inode extent to be + * freed if the inode is the only allocated inode within + * the extent. in this case all the disk resource backing + * up the inode extent will be freed. in addition, the iag + * will be placed on the ag extent free list if the extent + * is the first free extent in the iag. if freeing the + * extent also means that no free inodes will exist for + * the iag, the iag will also be removed from the ag free + * inode list. + * + * the iag describing the inode will be freed if the extent + * is to be freed and it is the only backed extent within + * the iag. in this case, the iag will be removed from the + * ag free extent list and ag free inode list and placed on + * the inode map's free iag list. + * + * a careful update approach is used to provide consistency + * in the face of updates to multiple buffers. under this + * approach, all required buffers are obtained before making + * any updates and are held until all updates are complete. + * + * PARAMETERS: + * ip - inode to be freed. + * + * RETURN VALUES: + * 0 - success + * -EIO - i/o error. + */ +int diFree(struct inode *ip) +{ + int rc; + ino_t inum = ip->i_ino; + struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp; + struct metapage *mp, *amp, *bmp, *cmp, *dmp; + int iagno, ino, extno, bitno, sword, agno; + int back, fwd; + u32 bitmap, mask; + struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap; + struct inomap *imap = JFS_IP(ipimap)->i_imap; + pxd_t freepxd; + tid_t tid; + struct inode *iplist[3]; + struct tlock *tlck; + struct pxd_lock *pxdlock; + + /* + * This is just to suppress compiler warnings. The same logic that + * references these variables is used to initialize them. + */ + aiagp = biagp = ciagp = diagp = NULL; + + /* get the iag number containing the inode. + */ + iagno = INOTOIAG(inum); + + /* make sure that the iag is contained within + * the map. + */ + if (iagno >= imap->im_nextiag) { + print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, + imap, 32, 0); + jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n", + (uint) inum, iagno, imap->im_nextiag); + return -EIO; + } + + /* get the allocation group for this ino. + */ + agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb)); + + /* Lock the AG specific inode map information + */ + AG_LOCK(imap, agno); + + /* Obtain read lock in imap inode. Don't release it until we have + * read all of the IAG's that we are going to. + */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + + /* read the iag. + */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + iagp = (struct iag *) mp->data; + + /* get the inode number and extent number of the inode within + * the iag and the inode number within the extent. + */ + ino = inum & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + mask = HIGHORDER >> bitno; + + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + jfs_error(ip->i_sb, "wmap shows inode already free\n"); + } + + if (!addressPXD(&iagp->inoext[extno])) { + release_metapage(mp); + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + jfs_error(ip->i_sb, "invalid inoext\n"); + return -EIO; + } + + /* compute the bitmap for the extent reflecting the freed inode. + */ + bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask; + + if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) { + release_metapage(mp); + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + jfs_error(ip->i_sb, "numfree > numinos\n"); + return -EIO; + } + /* + * inode extent still has some inodes or below low water mark: + * keep the inode extent; + */ + if (bitmap || + imap->im_agctl[agno].numfree < 96 || + (imap->im_agctl[agno].numfree < 288 && + (((imap->im_agctl[agno].numfree * 100) / + imap->im_agctl[agno].numinos) <= 25))) { + /* if the iag currently has no free inodes (i.e., + * the inode being freed is the first free inode of iag), + * insert the iag at head of the inode free list for the ag. + */ + if (iagp->nfreeinos == 0) { + /* check if there are any iags on the ag inode + * free list. if so, read the first one so that + * we can link the current iag onto the list at + * the head. + */ + if ((fwd = imap->im_agctl[agno].inofree) >= 0) { + /* read the iag that currently is the head + * of the list. + */ + if ((rc = diIAGRead(imap, fwd, &))) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + release_metapage(mp); + return (rc); + } + aiagp = (struct iag *) amp->data; + + /* make current head point back to the iag. + */ + aiagp->inofreeback = cpu_to_le32(iagno); + + write_metapage(amp); + } + + /* iag points forward to current head and iag + * becomes the new head of the list. + */ + iagp->inofreefwd = + cpu_to_le32(imap->im_agctl[agno].inofree); + iagp->inofreeback = cpu_to_le32(-1); + imap->im_agctl[agno].inofree = iagno; + } + IREAD_UNLOCK(ipimap); + + /* update the free inode summary map for the extent if + * freeing the inode means the extent will now have free + * inodes (i.e., the inode being freed is the first free + * inode of extent), + */ + if (iagp->wmap[extno] == cpu_to_le32(ONES)) { + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + iagp->inosmap[sword] &= + cpu_to_le32(~(HIGHORDER >> bitno)); + } + + /* update the bitmap. + */ + iagp->wmap[extno] = cpu_to_le32(bitmap); + + /* update the free inode counts at the iag, ag and + * map level. + */ + le32_add_cpu(&iagp->nfreeinos, 1); + imap->im_agctl[agno].numfree += 1; + atomic_inc(&imap->im_numfree); + + /* release the AG inode map lock + */ + AG_UNLOCK(imap, agno); + + /* write the iag */ + write_metapage(mp); + + return (0); + } + + + /* + * inode extent has become free and above low water mark: + * free the inode extent; + */ + + /* + * prepare to update iag list(s) (careful update step 1) + */ + amp = bmp = cmp = dmp = NULL; + fwd = back = -1; + + /* check if the iag currently has no free extents. if so, + * it will be placed on the head of the ag extent free list. + */ + if (iagp->nfreeexts == 0) { + /* check if the ag extent free list has any iags. + * if so, read the iag at the head of the list now. + * this (head) iag will be updated later to reflect + * the addition of the current iag at the head of + * the list. + */ + if ((fwd = imap->im_agctl[agno].extfree) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + goto error_out; + aiagp = (struct iag *) amp->data; + } + } else { + /* iag has free extents. check if the addition of a free + * extent will cause all extents to be free within this + * iag. if so, the iag will be removed from the ag extent + * free list and placed on the inode map's free iag list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { + /* in preparation for removing the iag from the + * ag extent free list, read the iags preceding + * and following the iag on the ag extent free + * list. + */ + if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + goto error_out; + aiagp = (struct iag *) amp->data; + } + + if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { + if ((rc = diIAGRead(imap, back, &bmp))) + goto error_out; + biagp = (struct iag *) bmp->data; + } + } + } + + /* remove the iag from the ag inode free list if freeing + * this extent cause the iag to have no free inodes. + */ + if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { + int inofreeback = le32_to_cpu(iagp->inofreeback); + int inofreefwd = le32_to_cpu(iagp->inofreefwd); + + /* in preparation for removing the iag from the + * ag inode free list, read the iags preceding + * and following the iag on the ag inode free + * list. before reading these iags, we must make + * sure that we already don't have them in hand + * from up above, since re-reading an iag (buffer) + * we are currently holding would cause a deadlock. + */ + if (inofreefwd >= 0) { + + if (inofreefwd == fwd) + ciagp = (struct iag *) amp->data; + else if (inofreefwd == back) + ciagp = (struct iag *) bmp->data; + else { + if ((rc = + diIAGRead(imap, inofreefwd, &cmp))) + goto error_out; + ciagp = (struct iag *) cmp->data; + } + assert(ciagp != NULL); + } + + if (inofreeback >= 0) { + if (inofreeback == fwd) + diagp = (struct iag *) amp->data; + else if (inofreeback == back) + diagp = (struct iag *) bmp->data; + else { + if ((rc = + diIAGRead(imap, inofreeback, &dmp))) + goto error_out; + diagp = (struct iag *) dmp->data; + } + assert(diagp != NULL); + } + } + + IREAD_UNLOCK(ipimap); + + /* + * invalidate any page of the inode extent freed from buffer cache; + */ + freepxd = iagp->inoext[extno]; + invalidate_pxd_metapages(ip, freepxd); + + /* + * update iag list(s) (careful update step 2) + */ + /* add the iag to the ag extent free list if this is the + * first free extent for the iag. + */ + if (iagp->nfreeexts == 0) { + if (fwd >= 0) + aiagp->extfreeback = cpu_to_le32(iagno); + + iagp->extfreefwd = + cpu_to_le32(imap->im_agctl[agno].extfree); + iagp->extfreeback = cpu_to_le32(-1); + imap->im_agctl[agno].extfree = iagno; + } else { + /* remove the iag from the ag extent list if all extents + * are now free and place it on the inode map iag free list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { + if (fwd >= 0) + aiagp->extfreeback = iagp->extfreeback; + + if (back >= 0) + biagp->extfreefwd = iagp->extfreefwd; + else + imap->im_agctl[agno].extfree = + le32_to_cpu(iagp->extfreefwd); + + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + + IAGFREE_LOCK(imap); + iagp->iagfree = cpu_to_le32(imap->im_freeiag); + imap->im_freeiag = iagno; + IAGFREE_UNLOCK(imap); + } + } + + /* remove the iag from the ag inode free list if freeing + * this extent causes the iag to have no free inodes. + */ + if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { + if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) + ciagp->inofreeback = iagp->inofreeback; + + if ((int) le32_to_cpu(iagp->inofreeback) >= 0) + diagp->inofreefwd = iagp->inofreefwd; + else + imap->im_agctl[agno].inofree = + le32_to_cpu(iagp->inofreefwd); + + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + } + + /* update the inode extent address and working map + * to reflect the free extent. + * the permanent map should have been updated already + * for the inode being freed. + */ + if (iagp->pmap[extno] != 0) { + jfs_error(ip->i_sb, "the pmap does not show inode free\n"); + } + iagp->wmap[extno] = 0; + PXDlength(&iagp->inoext[extno], 0); + PXDaddress(&iagp->inoext[extno], 0); + + /* update the free extent and free inode summary maps + * to reflect the freed extent. + * the inode summary map is marked to indicate no inodes + * available for the freed extent. + */ + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + mask = HIGHORDER >> bitno; + iagp->inosmap[sword] |= cpu_to_le32(mask); + iagp->extsmap[sword] &= cpu_to_le32(~mask); + + /* update the number of free inodes and number of free extents + * for the iag. + */ + le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1)); + le32_add_cpu(&iagp->nfreeexts, 1); + + /* update the number of free inodes and backed inodes + * at the ag and inode map level. + */ + imap->im_agctl[agno].numfree -= (INOSPEREXT - 1); + imap->im_agctl[agno].numinos -= INOSPEREXT; + atomic_sub(INOSPEREXT - 1, &imap->im_numfree); + atomic_sub(INOSPEREXT, &imap->im_numinos); + + if (amp) + write_metapage(amp); + if (bmp) + write_metapage(bmp); + if (cmp) + write_metapage(cmp); + if (dmp) + write_metapage(dmp); + + /* + * start transaction to update block allocation map + * for the inode extent freed; + * + * N.B. AG_LOCK is released and iag will be released below, and + * other thread may allocate inode from/reusing the ixad freed + * BUT with new/different backing inode extent from the extent + * to be freed by the transaction; + */ + tid = txBegin(ipimap->i_sb, COMMIT_FORCE); + mutex_lock(&JFS_IP(ipimap)->commit_mutex); + + /* acquire tlock of the iag page of the freed ixad + * to force the page NOHOMEOK (even though no data is + * logged from the iag page) until NOREDOPAGE|FREEXTENT log + * for the free of the extent is committed; + * write FREEXTENT|NOREDOPAGE log record + * N.B. linelock is overlaid as freed extent descriptor; + */ + tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = freepxd; + pxdlock->index = 1; + + write_metapage(mp); + + iplist[0] = ipimap; + + /* + * logredo needs the IAG number and IAG extent index in order + * to ensure that the IMap is consistent. The least disruptive + * way to pass these values through to the transaction manager + * is in the iplist array. + * + * It's not pretty, but it works. + */ + iplist[1] = (struct inode *) (size_t)iagno; + iplist[2] = (struct inode *) (size_t)extno; + + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + /* unlock the AG inode map information */ + AG_UNLOCK(imap, agno); + + return (0); + + error_out: + IREAD_UNLOCK(ipimap); + + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + if (cmp) + release_metapage(cmp); + if (dmp) + release_metapage(dmp); + + AG_UNLOCK(imap, agno); + + release_metapage(mp); + + return (rc); +} + +/* + * There are several places in the diAlloc* routines where we initialize + * the inode. + */ +static inline void +diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + + ip->i_ino = (iagno << L2INOSPERIAG) + ino; + jfs_ip->ixpxd = iagp->inoext[extno]; + jfs_ip->agstart = le64_to_cpu(iagp->agstart); + jfs_ip->active_ag = -1; +} + + +/* + * NAME: diAlloc(pip,dir,ip) + * + * FUNCTION: allocate a disk inode from the inode working map + * for a fileset or aggregate. + * + * PARAMETERS: + * pip - pointer to incore inode for the parent inode. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +int diAlloc(struct inode *pip, bool dir, struct inode *ip) +{ + int rc, ino, iagno, addext, extno, bitno, sword; + int nwords, rem, i, agno; + u32 mask, inosmap, extsmap; + struct inode *ipimap; + struct metapage *mp; + ino_t inum; + struct iag *iagp; + struct inomap *imap; + + /* get the pointers to the inode map inode and the + * corresponding imap control structure. + */ + ipimap = JFS_SBI(pip->i_sb)->ipimap; + imap = JFS_IP(ipimap)->i_imap; + JFS_IP(ip)->ipimap = ipimap; + JFS_IP(ip)->fileset = FILESYSTEM_I; + + /* for a directory, the allocation policy is to start + * at the ag level using the preferred ag. + */ + if (dir) { + agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); + AG_LOCK(imap, agno); + goto tryag; + } + + /* for files, the policy starts off by trying to allocate from + * the same iag containing the parent disk inode: + * try to allocate the new disk inode close to the parent disk + * inode, using parent disk inode number + 1 as the allocation + * hint. (we use a left-to-right policy to attempt to avoid + * moving backward on the disk.) compute the hint within the + * file system and the iag. + */ + + /* get the ag number of this iag */ + agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb)); + + if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { + /* + * There is an open file actively growing. We want to + * allocate new inodes from a different ag to avoid + * fragmentation problems. + */ + agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); + AG_LOCK(imap, agno); + goto tryag; + } + + inum = pip->i_ino + 1; + ino = inum & (INOSPERIAG - 1); + + /* back off the hint if it is outside of the iag */ + if (ino == 0) + inum = pip->i_ino; + + /* lock the AG inode map information */ + AG_LOCK(imap, agno); + + /* Get read lock on imap inode */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + + /* get the iag number and read the iag */ + iagno = INOTOIAG(inum); + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + iagp = (struct iag *) mp->data; + + /* determine if new inode extent is allowed to be added to the iag. + * new inode extent can be added to the iag if the ag + * has less than 32 free disk inodes and the iag has free extents. + */ + addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); + + /* + * try to allocate from the IAG + */ + /* check if the inode may be allocated from the iag + * (i.e. the inode has free inodes or new extent can be added). + */ + if (iagp->nfreeinos || addext) { + /* determine the extent number of the hint. + */ + extno = ino >> L2INOSPEREXT; + + /* check if the extent containing the hint has backed + * inodes. if so, try to allocate within this extent. + */ + if (addressPXD(&iagp->inoext[extno])) { + bitno = ino & (INOSPEREXT - 1); + if ((bitno = + diFindFree(le32_to_cpu(iagp->wmap[extno]), + bitno)) + < INOSPEREXT) { + ino = (extno << L2INOSPEREXT) + bitno; + + /* a free inode (bit) was found within this + * extent, so allocate it. + */ + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(ipimap); + if (rc) { + assert(rc == -EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, ino, extno, + iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + /* free the AG lock and return. + */ + AG_UNLOCK(imap, agno); + return (rc); + } + + if (!addext) + extno = + (extno == + EXTSPERIAG - 1) ? 0 : extno + 1; + } + + /* + * no free inodes within the extent containing the hint. + * + * try to allocate from the backed extents following + * hint or, if appropriate (i.e. addext is true), allocate + * an extent of free inodes at or following the extent + * containing the hint. + * + * the free inode and free extent summary maps are used + * here, so determine the starting summary map position + * and the number of words we'll have to examine. again, + * the approach is to allocate following the hint, so we + * might have to initially ignore prior bits of the summary + * map that represent extents prior to the extent containing + * the hint and later revisit these bits. + */ + bitno = extno & (EXTSPERSUM - 1); + nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1; + sword = extno >> L2EXTSPERSUM; + + /* mask any prior bits for the starting words of the + * summary map. + */ + mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno)); + inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask; + extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask; + + /* scan the free inode and free extent summary maps for + * free resources. + */ + for (i = 0; i < nwords; i++) { + /* check if this word of the free inode summary + * map describes an extent with free inodes. + */ + if (~inosmap) { + /* an extent with free inodes has been + * found. determine the extent number + * and the inode number within the extent. + */ + rem = diFindFree(inosmap, 0); + extno = (sword << L2EXTSPERSUM) + rem; + rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), + 0); + if (rem >= INOSPEREXT) { + IREAD_UNLOCK(ipimap); + release_metapage(mp); + AG_UNLOCK(imap, agno); + jfs_error(ip->i_sb, + "can't find free bit in wmap\n"); + return -EIO; + } + + /* determine the inode number within the + * iag and allocate the inode from the + * map. + */ + ino = (extno << L2INOSPEREXT) + rem; + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(ipimap); + if (rc) + assert(rc == -EIO); + else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, ino, extno, + iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + /* free the AG lock and return. + */ + AG_UNLOCK(imap, agno); + return (rc); + + } + + /* check if we may allocate an extent of free + * inodes and whether this word of the free + * extents summary map describes a free extent. + */ + if (addext && ~extsmap) { + /* a free extent has been found. determine + * the extent number. + */ + rem = diFindFree(extsmap, 0); + extno = (sword << L2EXTSPERSUM) + rem; + + /* allocate an extent of free inodes. + */ + if ((rc = diNewExt(imap, iagp, extno))) { + /* if there is no disk space for a + * new extent, try to allocate the + * disk inode from somewhere else. + */ + if (rc == -ENOSPC) + break; + + assert(rc == -EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, + extno << L2INOSPEREXT, + extno, iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + /* free the imap inode & the AG lock & return. + */ + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + + /* move on to the next set of summary map words. + */ + sword = (sword == SMAPSZ - 1) ? 0 : sword + 1; + inosmap = le32_to_cpu(iagp->inosmap[sword]); + extsmap = le32_to_cpu(iagp->extsmap[sword]); + } + } + /* unlock imap inode */ + IREAD_UNLOCK(ipimap); + + /* nothing doing in this iag, so release it. */ + release_metapage(mp); + + tryag: + /* + * try to allocate anywhere within the same AG as the parent inode. + */ + rc = diAllocAG(imap, agno, dir, ip); + + AG_UNLOCK(imap, agno); + + if (rc != -ENOSPC) + return (rc); + + /* + * try to allocate in any AG. + */ + return (diAllocAny(imap, agno, dir, ip)); +} + + +/* + * NAME: diAllocAG(imap,agno,dir,ip) + * + * FUNCTION: allocate a disk inode from the allocation group. + * + * this routine first determines if a new extent of free + * inodes should be added for the allocation group, with + * the current request satisfied from this extent. if this + * is the case, an attempt will be made to do just that. if + * this attempt fails or it has been determined that a new + * extent should not be added, an attempt is made to satisfy + * the request by allocating an existing (backed) free inode + * from the allocation group. + * + * PRE CONDITION: Already have the AG lock for this AG. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group to allocate from. + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to the new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int +diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) +{ + int rc, addext, numfree, numinos; + + /* get the number of free and the number of backed disk + * inodes currently within the ag. + */ + numfree = imap->im_agctl[agno].numfree; + numinos = imap->im_agctl[agno].numinos; + + if (numfree > numinos) { + jfs_error(ip->i_sb, "numfree > numinos\n"); + return -EIO; + } + + /* determine if we should allocate a new extent of free inodes + * within the ag: for directory inodes, add a new extent + * if there are a small number of free inodes or number of free + * inodes is a small percentage of the number of backed inodes. + */ + if (dir) + addext = (numfree < 64 || + (numfree < 256 + && ((numfree * 100) / numinos) <= 20)); + else + addext = (numfree == 0); + + /* + * try to allocate a new extent of free inodes. + */ + if (addext) { + /* if free space is not available for this new extent, try + * below to allocate a free and existing (already backed) + * inode from the ag. + */ + if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC) + return (rc); + } + + /* + * try to allocate an existing free inode from the ag. + */ + return (diAllocIno(imap, agno, ip)); +} + + +/* + * NAME: diAllocAny(imap,agno,dir,iap) + * + * FUNCTION: allocate a disk inode from any other allocation group. + * + * this routine is called when an allocation attempt within + * the primary allocation group has failed. if attempts to + * allocate an inode from any allocation group other than the + * specified primary group. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - primary allocation group (to avoid). + * dir - 'true' if the new disk inode is for a directory. + * ip - pointer to a new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int +diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) +{ + int ag, rc; + int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag; + + + /* try to allocate from the ags following agno up to + * the maximum ag number. + */ + for (ag = agno + 1; ag <= maxag; ag++) { + AG_LOCK(imap, ag); + + rc = diAllocAG(imap, ag, dir, ip); + + AG_UNLOCK(imap, ag); + + if (rc != -ENOSPC) + return (rc); + } + + /* try to allocate from the ags in front of agno. + */ + for (ag = 0; ag < agno; ag++) { + AG_LOCK(imap, ag); + + rc = diAllocAG(imap, ag, dir, ip); + + AG_UNLOCK(imap, ag); + + if (rc != -ENOSPC) + return (rc); + } + + /* no free disk inodes. + */ + return -ENOSPC; +} + + +/* + * NAME: diAllocIno(imap,agno,ip) + * + * FUNCTION: allocate a disk inode from the allocation group's free + * inode list, returning an error if this free list is + * empty (i.e. no iags on the list). + * + * allocation occurs from the first iag on the list using + * the iag's free inode summary map to find the leftmost + * free inode in the iag. + * + * PRE CONDITION: Already have AG lock for this AG. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group. + * ip - pointer to new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) +{ + int iagno, ino, rc, rem, extno, sword; + struct metapage *mp; + struct iag *iagp; + + /* check if there are iags on the ag's free inode list. + */ + if ((iagno = imap->im_agctl[agno].inofree) < 0) + return -ENOSPC; + + /* obtain read lock on imap inode */ + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); + + /* read the iag at the head of the list. + */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(imap->im_ipimap); + return (rc); + } + iagp = (struct iag *) mp->data; + + /* better be free inodes in this iag if it is on the + * list. + */ + if (!iagp->nfreeinos) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n"); + return -EIO; + } + + /* scan the free inode summary map to find an extent + * with free inodes. + */ + for (sword = 0;; sword++) { + if (sword >= SMAPSZ) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, + "free inode not found in summary map\n"); + return -EIO; + } + + if (~iagp->inosmap[sword]) + break; + } + + /* found a extent with free inodes. determine + * the extent number. + */ + rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0); + if (rem >= EXTSPERSUM) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, "no free extent found\n"); + return -EIO; + } + extno = (sword << L2EXTSPERSUM) + rem; + + /* find the first free inode in the extent. + */ + rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0); + if (rem >= INOSPEREXT) { + IREAD_UNLOCK(imap->im_ipimap); + release_metapage(mp); + jfs_error(ip->i_sb, "free inode not found\n"); + return -EIO; + } + + /* compute the inode number within the iag. + */ + ino = (extno << L2INOSPEREXT) + rem; + + /* allocate the inode. + */ + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(imap->im_ipimap); + if (rc) { + release_metapage(mp); + return (rc); + } + + /* set the results of the allocation and write the iag. + */ + diInitInode(ip, iagno, ino, extno, iagp); + write_metapage(mp); + + return (0); +} + + +/* + * NAME: diAllocExt(imap,agno,ip) + * + * FUNCTION: add a new extent of free inodes to an iag, allocating + * an inode from this extent to satisfy the current allocation + * request. + * + * this routine first tries to find an existing iag with free + * extents through the ag free extent list. if list is not + * empty, the head of the list will be selected as the home + * of the new extent of free inodes. otherwise (the list is + * empty), a new iag will be allocated for the ag to contain + * the extent. + * + * once an iag has been selected, the free extent summary map + * is used to locate a free extent within the iag and diNewExt() + * is called to initialize the extent, with initialization + * including the allocation of the first inode of the extent + * for the purpose of satisfying this request. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group number. + * ip - pointer to new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) +{ + int rem, iagno, sword, extno, rc; + struct metapage *mp; + struct iag *iagp; + + /* check if the ag has any iags with free extents. if not, + * allocate a new iag for the ag. + */ + if ((iagno = imap->im_agctl[agno].extfree) < 0) { + /* If successful, diNewIAG will obtain the read lock on the + * imap inode. + */ + if ((rc = diNewIAG(imap, &iagno, agno, &mp))) { + return (rc); + } + iagp = (struct iag *) mp->data; + + /* set the ag number if this a brand new iag + */ + iagp->agstart = + cpu_to_le64(AGTOBLK(agno, imap->im_ipimap)); + } else { + /* read the iag. + */ + IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "error reading iag\n"); + return rc; + } + iagp = (struct iag *) mp->data; + } + + /* using the free extent summary map, find a free extent. + */ + for (sword = 0;; sword++) { + if (sword >= SMAPSZ) { + release_metapage(mp); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "free ext summary map not found\n"); + return -EIO; + } + if (~iagp->extsmap[sword]) + break; + } + + /* determine the extent number of the free extent. + */ + rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0); + if (rem >= EXTSPERSUM) { + release_metapage(mp); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "free extent not found\n"); + return -EIO; + } + extno = (sword << L2EXTSPERSUM) + rem; + + /* initialize the new extent. + */ + rc = diNewExt(imap, iagp, extno); + IREAD_UNLOCK(imap->im_ipimap); + if (rc) { + /* something bad happened. if a new iag was allocated, + * place it back on the inode map's iag free list, and + * clear the ag number information. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + IAGFREE_LOCK(imap); + iagp->iagfree = cpu_to_le32(imap->im_freeiag); + imap->im_freeiag = iagno; + IAGFREE_UNLOCK(imap); + } + write_metapage(mp); + return (rc); + } + + /* set the results of the allocation and write the iag. + */ + diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp); + + write_metapage(mp); + + return (0); +} + + +/* + * NAME: diAllocBit(imap,iagp,ino) + * + * FUNCTION: allocate a backed inode from an iag. + * + * this routine performs the mechanics of allocating a + * specified inode from a backed extent. + * + * if the inode to be allocated represents the last free + * inode within the iag, the iag will be removed from the + * ag free inode list. + * + * a careful update approach is used to provide consistency + * in the face of updates to multiple buffers. under this + * approach, all required buffers are obtained before making + * any updates and are held all are updates are complete. + * + * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on + * this AG. Must have read lock on imap inode. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * ino - inode number to be allocated within the iag. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) +{ + int extno, bitno, agno, sword, rc; + struct metapage *amp = NULL, *bmp = NULL; + struct iag *aiagp = NULL, *biagp = NULL; + u32 mask; + + /* check if this is the last free inode within the iag. + * if so, it will have to be removed from the ag free + * inode list, so get the iags preceding and following + * it on the list. + */ + if (iagp->nfreeinos == cpu_to_le32(1)) { + if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) { + if ((rc = + diIAGRead(imap, le32_to_cpu(iagp->inofreefwd), + &))) + return (rc); + aiagp = (struct iag *) amp->data; + } + + if ((int) le32_to_cpu(iagp->inofreeback) >= 0) { + if ((rc = + diIAGRead(imap, + le32_to_cpu(iagp->inofreeback), + &bmp))) { + if (amp) + release_metapage(amp); + return (rc); + } + biagp = (struct iag *) bmp->data; + } + } + + /* get the ag number, extent number, inode number within + * the extent. + */ + agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb)); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + + /* compute the mask for setting the map. + */ + mask = HIGHORDER >> bitno; + + /* the inode should be free and backed. + */ + if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) || + ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) || + (addressPXD(&iagp->inoext[extno]) == 0)) { + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + + jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n"); + return -EIO; + } + + /* mark the inode as allocated in the working map. + */ + iagp->wmap[extno] |= cpu_to_le32(mask); + + /* check if all inodes within the extent are now + * allocated. if so, update the free inode summary + * map to reflect this. + */ + if (iagp->wmap[extno] == cpu_to_le32(ONES)) { + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno); + } + + /* if this was the last free inode in the iag, remove the + * iag from the ag free inode list. + */ + if (iagp->nfreeinos == cpu_to_le32(1)) { + if (amp) { + aiagp->inofreeback = iagp->inofreeback; + write_metapage(amp); + } + + if (bmp) { + biagp->inofreefwd = iagp->inofreefwd; + write_metapage(bmp); + } else { + imap->im_agctl[agno].inofree = + le32_to_cpu(iagp->inofreefwd); + } + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + } + + /* update the free inode count at the iag, ag, inode + * map levels. + */ + le32_add_cpu(&iagp->nfreeinos, -1); + imap->im_agctl[agno].numfree -= 1; + atomic_dec(&imap->im_numfree); + + return (0); +} + + +/* + * NAME: diNewExt(imap,iagp,extno) + * + * FUNCTION: initialize a new extent of inodes for an iag, allocating + * the first inode of the extent for use for the current + * allocation request. + * + * disk resources are allocated for the new extent of inodes + * and the inodes themselves are initialized to reflect their + * existence within the extent (i.e. their inode numbers and + * inode extent addresses are set) and their initial state + * (mode and link count are set to zero). + * + * if the iag is new, it is not yet on an ag extent free list + * but will now be placed on this list. + * + * if the allocation of the new extent causes the iag to + * have no free extent, the iag will be removed from the + * ag extent free list. + * + * if the iag has no free backed inodes, it will be placed + * on the ag free inode list, since the addition of the new + * extent will now cause it to have free inodes. + * + * a careful update approach is used to provide consistency + * (i.e. list consistency) in the face of updates to multiple + * buffers. under this approach, all required buffers are + * obtained before making any updates and are held until all + * updates are complete. + * + * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on + * this AG. Must have read lock on imap inode. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * extno - extent number. + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + */ +static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) +{ + int agno, iagno, fwd, back, freei = 0, sword, rc; + struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL; + struct metapage *amp, *bmp, *cmp, *dmp; + struct inode *ipimap; + s64 blkno, hint; + int i, j; + u32 mask; + ino_t ino; + struct dinode *dp; + struct jfs_sb_info *sbi; + + /* better have free extents. + */ + if (!iagp->nfreeexts) { + jfs_error(imap->im_ipimap->i_sb, "no free extents\n"); + return -EIO; + } + + /* get the inode map inode. + */ + ipimap = imap->im_ipimap; + sbi = JFS_SBI(ipimap->i_sb); + + amp = bmp = cmp = NULL; + + /* get the ag and iag numbers for this iag. + */ + agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); + iagno = le32_to_cpu(iagp->iagnum); + + /* check if this is the last free extent within the + * iag. if so, the iag must be removed from the ag + * free extent list, so get the iags preceding and + * following the iag on this list. + */ + if (iagp->nfreeexts == cpu_to_le32(1)) { + if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + return (rc); + aiagp = (struct iag *) amp->data; + } + + if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { + if ((rc = diIAGRead(imap, back, &bmp))) + goto error_out; + biagp = (struct iag *) bmp->data; + } + } else { + /* the iag has free extents. if all extents are free + * (as is the case for a newly allocated iag), the iag + * must be added to the ag free extent list, so get + * the iag at the head of the list in preparation for + * adding this iag to this list. + */ + fwd = back = -1; + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if ((fwd = imap->im_agctl[agno].extfree) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + goto error_out; + aiagp = (struct iag *) amp->data; + } + } + } + + /* check if the iag has no free inodes. if so, the iag + * will have to be added to the ag free inode list, so get + * the iag at the head of the list in preparation for + * adding this iag to this list. in doing this, we must + * check if we already have the iag at the head of + * the list in hand. + */ + if (iagp->nfreeinos == 0) { + freei = imap->im_agctl[agno].inofree; + + if (freei >= 0) { + if (freei == fwd) { + ciagp = aiagp; + } else if (freei == back) { + ciagp = biagp; + } else { + if ((rc = diIAGRead(imap, freei, &cmp))) + goto error_out; + ciagp = (struct iag *) cmp->data; + } + if (ciagp == NULL) { + jfs_error(imap->im_ipimap->i_sb, + "ciagp == NULL\n"); + rc = -EIO; + goto error_out; + } + } + } + + /* allocate disk space for the inode extent. + */ + if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0)) + hint = ((s64) agno << sbi->bmap->db_agl2size) - 1; + else + hint = addressPXD(&iagp->inoext[extno - 1]) + + lengthPXD(&iagp->inoext[extno - 1]) - 1; + + if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno))) + goto error_out; + + /* compute the inode number of the first inode within the + * extent. + */ + ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT); + + /* initialize the inodes within the newly allocated extent a + * page at a time. + */ + for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) { + /* get a buffer for this page of disk inodes. + */ + dmp = get_metapage(ipimap, blkno + i, PSIZE, 1); + if (dmp == NULL) { + rc = -EIO; + goto error_out; + } + dp = (struct dinode *) dmp->data; + + /* initialize the inode number, mode, link count and + * inode extent address. + */ + for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) { + dp->di_inostamp = cpu_to_le32(sbi->inostamp); + dp->di_number = cpu_to_le32(ino); + dp->di_fileset = cpu_to_le32(FILESYSTEM_I); + dp->di_mode = 0; + dp->di_nlink = 0; + PXDaddress(&(dp->di_ixpxd), blkno); + PXDlength(&(dp->di_ixpxd), imap->im_nbperiext); + } + write_metapage(dmp); + } + + /* if this is the last free extent within the iag, remove the + * iag from the ag free extent list. + */ + if (iagp->nfreeexts == cpu_to_le32(1)) { + if (fwd >= 0) + aiagp->extfreeback = iagp->extfreeback; + + if (back >= 0) + biagp->extfreefwd = iagp->extfreefwd; + else + imap->im_agctl[agno].extfree = + le32_to_cpu(iagp->extfreefwd); + + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + } else { + /* if the iag has all free extents (newly allocated iag), + * add the iag to the ag free extent list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if (fwd >= 0) + aiagp->extfreeback = cpu_to_le32(iagno); + + iagp->extfreefwd = cpu_to_le32(fwd); + iagp->extfreeback = cpu_to_le32(-1); + imap->im_agctl[agno].extfree = iagno; + } + } + + /* if the iag has no free inodes, add the iag to the + * ag free inode list. + */ + if (iagp->nfreeinos == 0) { + if (freei >= 0) + ciagp->inofreeback = cpu_to_le32(iagno); + + iagp->inofreefwd = + cpu_to_le32(imap->im_agctl[agno].inofree); + iagp->inofreeback = cpu_to_le32(-1); + imap->im_agctl[agno].inofree = iagno; + } + + /* initialize the extent descriptor of the extent. */ + PXDlength(&iagp->inoext[extno], imap->im_nbperiext); + PXDaddress(&iagp->inoext[extno], blkno); + + /* initialize the working and persistent map of the extent. + * the working map will be initialized such that + * it indicates the first inode of the extent is allocated. + */ + iagp->wmap[extno] = cpu_to_le32(HIGHORDER); + iagp->pmap[extno] = 0; + + /* update the free inode and free extent summary maps + * for the extent to indicate the extent has free inodes + * and no longer represents a free extent. + */ + sword = extno >> L2EXTSPERSUM; + mask = HIGHORDER >> (extno & (EXTSPERSUM - 1)); + iagp->extsmap[sword] |= cpu_to_le32(mask); + iagp->inosmap[sword] &= cpu_to_le32(~mask); + + /* update the free inode and free extent counts for the + * iag. + */ + le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1)); + le32_add_cpu(&iagp->nfreeexts, -1); + + /* update the free and backed inode counts for the ag. + */ + imap->im_agctl[agno].numfree += (INOSPEREXT - 1); + imap->im_agctl[agno].numinos += INOSPEREXT; + + /* update the free and backed inode counts for the inode map. + */ + atomic_add(INOSPEREXT - 1, &imap->im_numfree); + atomic_add(INOSPEREXT, &imap->im_numinos); + + /* write the iags. + */ + if (amp) + write_metapage(amp); + if (bmp) + write_metapage(bmp); + if (cmp) + write_metapage(cmp); + + return (0); + + error_out: + + /* release the iags. + */ + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + if (cmp) + release_metapage(cmp); + + return (rc); +} + + +/* + * NAME: diNewIAG(imap,iagnop,agno) + * + * FUNCTION: allocate a new iag for an allocation group. + * + * first tries to allocate the iag from the inode map + * iagfree list: + * if the list has free iags, the head of the list is removed + * and returned to satisfy the request. + * if the inode map's iag free list is empty, the inode map + * is extended to hold a new iag. this new iag is initialized + * and returned to satisfy the request. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagnop - pointer to an iag number set with the number of the + * newly allocated iag upon successful return. + * agno - allocation group number. + * bpp - Buffer pointer to be filled in with new IAG's buffer + * + * RETURN VALUES: + * 0 - success. + * -ENOSPC - insufficient disk resources. + * -EIO - i/o error. + * + * serialization: + * AG lock held on entry/exit; + * write lock on the map is held inside; + * read lock on the map is held on successful completion; + * + * note: new iag transaction: + * . synchronously write iag; + * . write log of xtree and inode of imap; + * . commit; + * . synchronous write of xtree (right to left, bottom to top); + * . at start of logredo(): init in-memory imap with one additional iag page; + * . at end of logredo(): re-read imap inode to determine + * new imap size; + */ +static int +diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) +{ + int rc; + int iagno, i, xlen; + struct inode *ipimap; + struct super_block *sb; + struct jfs_sb_info *sbi; + struct metapage *mp; + struct iag *iagp; + s64 xaddr = 0; + s64 blkno; + tid_t tid; + struct inode *iplist[1]; + + /* pick up pointers to the inode map and mount inodes */ + ipimap = imap->im_ipimap; + sb = ipimap->i_sb; + sbi = JFS_SBI(sb); + + /* acquire the free iag lock */ + IAGFREE_LOCK(imap); + + /* if there are any iags on the inode map free iag list, + * allocate the iag from the head of the list. + */ + if (imap->im_freeiag >= 0) { + /* pick up the iag number at the head of the list */ + iagno = imap->im_freeiag; + + /* determine the logical block number of the iag */ + blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); + } else { + /* no free iags. the inode map will have to be extented + * to include a new iag. + */ + + /* acquire inode map lock */ + IWRITE_LOCK(ipimap, RDWRLOCK_IMAP); + + if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { + IWRITE_UNLOCK(ipimap); + IAGFREE_UNLOCK(imap); + jfs_error(imap->im_ipimap->i_sb, + "ipimap->i_size is wrong\n"); + return -EIO; + } + + + /* get the next available iag number */ + iagno = imap->im_nextiag; + + /* make sure that we have not exceeded the maximum inode + * number limit. + */ + if (iagno > (MAXIAGS - 1)) { + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = -ENOSPC; + goto out; + } + + /* + * synchronously append new iag page. + */ + /* determine the logical address of iag page to append */ + blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); + + /* Allocate extent for new iag page */ + xlen = sbi->nbperpage; + if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) { + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + goto out; + } + + /* + * start transaction of update of the inode map + * addressing structure pointing to the new iag page; + */ + tid = txBegin(sb, COMMIT_FORCE); + mutex_lock(&JFS_IP(ipimap)->commit_mutex); + + /* update the inode map addressing structure to point to it */ + if ((rc = + xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) { + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + /* Free the blocks allocated for the iag since it was + * not successfully added to the inode map + */ + dbFree(ipimap, xaddr, (s64) xlen); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + goto out; + } + + /* update the inode map's inode to reflect the extension */ + ipimap->i_size += PSIZE; + inode_add_bytes(ipimap, PSIZE); + + /* assign a buffer for the page */ + mp = get_metapage(ipimap, blkno, PSIZE, 0); + if (!mp) { + /* + * This is very unlikely since we just created the + * extent, but let's try to handle it correctly + */ + xtTruncate(tid, ipimap, ipimap->i_size - PSIZE, + COMMIT_PWMAP); + + txAbort(tid, 0); + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = -EIO; + goto out; + } + iagp = (struct iag *) mp->data; + + /* init the iag */ + memset(iagp, 0, sizeof(struct iag)); + iagp->iagnum = cpu_to_le32(iagno); + iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); + iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); + iagp->iagfree = cpu_to_le32(-1); + iagp->nfreeinos = 0; + iagp->nfreeexts = cpu_to_le32(EXTSPERIAG); + + /* initialize the free inode summary map (free extent + * summary map initialization handled by bzero). + */ + for (i = 0; i < SMAPSZ; i++) + iagp->inosmap[i] = cpu_to_le32(ONES); + + /* + * Write and sync the metapage + */ + flush_metapage(mp); + + /* + * txCommit(COMMIT_FORCE) will synchronously write address + * index pages and inode after commit in careful update order + * of address index pages (right to left, bottom up); + */ + iplist[0] = ipimap; + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); + + duplicateIXtree(sb, blkno, xlen, &xaddr); + + /* update the next available iag number */ + imap->im_nextiag += 1; + + /* Add the iag to the iag free list so we don't lose the iag + * if a failure happens now. + */ + imap->im_freeiag = iagno; + + /* Until we have logredo working, we want the imap inode & + * control page to be up to date. + */ + diSync(ipimap); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + } + + /* obtain read lock on map */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + + /* read the iag */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + rc = -EIO; + goto out; + } + iagp = (struct iag *) mp->data; + + /* remove the iag from the iag free list */ + imap->im_freeiag = le32_to_cpu(iagp->iagfree); + iagp->iagfree = cpu_to_le32(-1); + + /* set the return iag number and buffer pointer */ + *iagnop = iagno; + *mpp = mp; + + out: + /* release the iag free lock */ + IAGFREE_UNLOCK(imap); + + return (rc); +} + +/* + * NAME: diIAGRead() + * + * FUNCTION: get the buffer for the specified iag within a fileset + * or aggregate inode map. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagno - iag number. + * bpp - point to buffer pointer to be filled in on successful + * exit. + * + * SERIALIZATION: + * must have read lock on imap inode + * (When called by diExtendFS, the filesystem is quiesced, therefore + * the read lock is unnecessary.) + * + * RETURN VALUES: + * 0 - success. + * -EIO - i/o error. + */ +static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) +{ + struct inode *ipimap = imap->im_ipimap; + s64 blkno; + + /* compute the logical block number of the iag. */ + blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage); + + /* read the iag. */ + *mpp = read_metapage(ipimap, blkno, PSIZE, 0); + if (*mpp == NULL) { + return -EIO; + } + + return (0); +} + +/* + * NAME: diFindFree() + * + * FUNCTION: find the first free bit in a word starting at + * the specified bit position. + * + * PARAMETERS: + * word - word to be examined. + * start - starting bit position. + * + * RETURN VALUES: + * bit position of first free bit in the word or 32 if + * no free bits were found. + */ +static int diFindFree(u32 word, int start) +{ + int bitno; + assert(start < 32); + /* scan the word for the first free bit. */ + for (word <<= start, bitno = start; bitno < 32; + bitno++, word <<= 1) { + if ((word & HIGHORDER) == 0) + break; + } + return (bitno); +} + +/* + * NAME: diUpdatePMap() + * + * FUNCTION: Update the persistent map in an IAG for the allocation or + * freeing of the specified inode. + * + * PRE CONDITIONS: Working map has already been updated for allocate. + * + * PARAMETERS: + * ipimap - Incore inode map inode + * inum - Number of inode to mark in permanent map + * is_free - If 'true' indicates inode should be marked freed, otherwise + * indicates inode should be marked allocated. + * + * RETURN VALUES: + * 0 for success + */ +int +diUpdatePMap(struct inode *ipimap, + unsigned long inum, bool is_free, struct tblock * tblk) +{ + int rc; + struct iag *iagp; + struct metapage *mp; + int iagno, ino, extno, bitno; + struct inomap *imap; + u32 mask; + struct jfs_log *log; + int lsn, difft, diffp; + unsigned long flags; + + imap = JFS_IP(ipimap)->i_imap; + /* get the iag number containing the inode */ + iagno = INOTOIAG(inum); + /* make sure that the iag is contained within the map */ + if (iagno >= imap->im_nextiag) { + jfs_error(ipimap->i_sb, "the iag is outside the map\n"); + return -EIO; + } + /* read the iag */ + IREAD_LOCK(ipimap, RDWRLOCK_IMAP); + rc = diIAGRead(imap, iagno, &mp); + IREAD_UNLOCK(ipimap); + if (rc) + return (rc); + metapage_wait_for_io(mp); + iagp = (struct iag *) mp->data; + /* get the inode number and extent number of the inode within + * the iag and the inode number within the extent. + */ + ino = inum & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + mask = HIGHORDER >> bitno; + /* + * mark the inode free in persistent map: + */ + if (is_free) { + /* The inode should have been allocated both in working + * map and in persistent map; + * the inode will be freed from working map at the release + * of last reference release; + */ + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + jfs_error(ipimap->i_sb, + "inode %ld not marked as allocated in wmap!\n", + inum); + } + if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { + jfs_error(ipimap->i_sb, + "inode %ld not marked as allocated in pmap!\n", + inum); + } + /* update the bitmap for the extent of the freed inode */ + iagp->pmap[extno] &= cpu_to_le32(~mask); + } + /* + * mark the inode allocated in persistent map: + */ + else { + /* The inode should be already allocated in the working map + * and should be free in persistent map; + */ + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + release_metapage(mp); + jfs_error(ipimap->i_sb, + "the inode is not allocated in the working map\n"); + return -EIO; + } + if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { + release_metapage(mp); + jfs_error(ipimap->i_sb, + "the inode is not free in the persistent map\n"); + return -EIO; + } + /* update the bitmap for the extent of the allocated inode */ + iagp->pmap[extno] |= cpu_to_le32(mask); + } + /* + * update iag lsn + */ + lsn = tblk->lsn; + log = JFS_SBI(tblk->sb)->log; + LOGSYNC_LOCK(log, flags); + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(difft, lsn, log); + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + /* move mp after tblock in logsync list */ + list_move(&mp->synclist, &tblk->synclist); + } + /* inherit younger/larger clsn */ + assert(mp->clsn); + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else { + mp->log = log; + mp->lsn = lsn; + /* insert mp after tblock in logsync list */ + log->count++; + list_add(&mp->synclist, &tblk->synclist); + mp->clsn = tblk->clsn; + } + LOGSYNC_UNLOCK(log, flags); + write_metapage(mp); + return (0); +} + +/* + * diExtendFS() + * + * function: update imap for extendfs(); + * + * note: AG size has been increased s.t. each k old contiguous AGs are + * coalesced into a new AG; + */ +int diExtendFS(struct inode *ipimap, struct inode *ipbmap) +{ + int rc, rcx = 0; + struct inomap *imap = JFS_IP(ipimap)->i_imap; + struct iag *iagp = NULL, *hiagp = NULL; + struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap; + struct metapage *bp, *hbp; + int i, n, head; + int numinos, xnuminos = 0, xnumfree = 0; + s64 agstart; + + jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d", + imap->im_nextiag, atomic_read(&imap->im_numinos), + atomic_read(&imap->im_numfree)); + + /* + * reconstruct imap + * + * coalesce contiguous k (newAGSize/oldAGSize) AGs; + * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; + * note: new AG size = old AG size * (2**x). + */ + + /* init per AG control information im_agctl[] */ + for (i = 0; i < MAXAG; i++) { + imap->im_agctl[i].inofree = -1; + imap->im_agctl[i].extfree = -1; + imap->im_agctl[i].numinos = 0; /* number of backed inodes */ + imap->im_agctl[i].numfree = 0; /* number of free backed inodes */ + } + + /* + * process each iag page of the map. + * + * rebuild AG Free Inode List, AG Free Inode Extent List; + */ + for (i = 0; i < imap->im_nextiag; i++) { + if ((rc = diIAGRead(imap, i, &bp))) { + rcx = rc; + continue; + } + iagp = (struct iag *) bp->data; + if (le32_to_cpu(iagp->iagnum) != i) { + release_metapage(bp); + jfs_error(ipimap->i_sb, "unexpected value of iagnum\n"); + return -EIO; + } + + /* leave free iag in the free iag list */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + release_metapage(bp); + continue; + } + + agstart = le64_to_cpu(iagp->agstart); + n = agstart >> mp->db_agl2size; + iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size); + + /* compute backed inodes */ + numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) + << L2INOSPEREXT; + if (numinos > 0) { + /* merge AG backed inodes */ + imap->im_agctl[n].numinos += numinos; + xnuminos += numinos; + } + + /* if any backed free inodes, insert at AG free inode list */ + if ((int) le32_to_cpu(iagp->nfreeinos) > 0) { + if ((head = imap->im_agctl[n].inofree) == -1) { + iagp->inofreefwd = cpu_to_le32(-1); + iagp->inofreeback = cpu_to_le32(-1); + } else { + if ((rc = diIAGRead(imap, head, &hbp))) { + rcx = rc; + goto nextiag; + } + hiagp = (struct iag *) hbp->data; + hiagp->inofreeback = iagp->iagnum; + iagp->inofreefwd = cpu_to_le32(head); + iagp->inofreeback = cpu_to_le32(-1); + write_metapage(hbp); + } + + imap->im_agctl[n].inofree = + le32_to_cpu(iagp->iagnum); + + /* merge AG backed free inodes */ + imap->im_agctl[n].numfree += + le32_to_cpu(iagp->nfreeinos); + xnumfree += le32_to_cpu(iagp->nfreeinos); + } + + /* if any free extents, insert at AG free extent list */ + if (le32_to_cpu(iagp->nfreeexts) > 0) { + if ((head = imap->im_agctl[n].extfree) == -1) { + iagp->extfreefwd = cpu_to_le32(-1); + iagp->extfreeback = cpu_to_le32(-1); + } else { + if ((rc = diIAGRead(imap, head, &hbp))) { + rcx = rc; + goto nextiag; + } + hiagp = (struct iag *) hbp->data; + hiagp->extfreeback = iagp->iagnum; + iagp->extfreefwd = cpu_to_le32(head); + iagp->extfreeback = cpu_to_le32(-1); + write_metapage(hbp); + } + + imap->im_agctl[n].extfree = + le32_to_cpu(iagp->iagnum); + } + + nextiag: + write_metapage(bp); + } + + if (xnuminos != atomic_read(&imap->im_numinos) || + xnumfree != atomic_read(&imap->im_numfree)) { + jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n"); + return -EIO; + } + + return rcx; +} + + +/* + * duplicateIXtree() + * + * serialization: IWRITE_LOCK held on entry/exit + * + * note: shadow page with regular inode (rel.2); + */ +static void duplicateIXtree(struct super_block *sb, s64 blkno, + int xlen, s64 *xaddr) +{ + struct jfs_superblock *j_sb; + struct buffer_head *bh; + struct inode *ip; + tid_t tid; + + /* if AIT2 ipmap2 is bad, do not try to update it */ + if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT) /* s_flag */ + return; + ip = diReadSpecial(sb, FILESYSTEM_I, 1); + if (ip == NULL) { + JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; + if (readSuper(sb, &bh)) + return; + j_sb = (struct jfs_superblock *)bh->b_data; + j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + return; + } + + /* start transaction */ + tid = txBegin(sb, COMMIT_FORCE); + /* update the inode map addressing structure to point to it */ + if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) { + JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; + txAbort(tid, 1); + goto cleanup; + + } + /* update the inode map's inode to reflect the extension */ + ip->i_size += PSIZE; + inode_add_bytes(ip, PSIZE); + txCommit(tid, 1, &ip, COMMIT_FORCE); + cleanup: + txEnd(tid); + diFreeSpecial(ip); +} + +/* + * NAME: copy_from_dinode() + * + * FUNCTION: Copies inode info from disk inode to in-memory inode + * + * RETURN VALUES: + * 0 - success + * -ENOMEM - insufficient memory + */ +static int copy_from_dinode(struct dinode * dip, struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + + jfs_ip->fileset = le32_to_cpu(dip->di_fileset); + jfs_ip->mode2 = le32_to_cpu(dip->di_mode); + jfs_set_inode_flags(ip); + + ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff; + if (sbi->umask != -1) { + ip->i_mode = (ip->i_mode & ~0777) | (0777 & ~sbi->umask); + /* For directories, add x permission if r is allowed by umask */ + if (S_ISDIR(ip->i_mode)) { + if (ip->i_mode & 0400) + ip->i_mode |= 0100; + if (ip->i_mode & 0040) + ip->i_mode |= 0010; + if (ip->i_mode & 0004) + ip->i_mode |= 0001; + } + } + set_nlink(ip, le32_to_cpu(dip->di_nlink)); + + jfs_ip->saved_uid = make_kuid(&init_user_ns, le32_to_cpu(dip->di_uid)); + if (!uid_valid(sbi->uid)) + ip->i_uid = jfs_ip->saved_uid; + else { + ip->i_uid = sbi->uid; + } + + jfs_ip->saved_gid = make_kgid(&init_user_ns, le32_to_cpu(dip->di_gid)); + if (!gid_valid(sbi->gid)) + ip->i_gid = jfs_ip->saved_gid; + else { + ip->i_gid = sbi->gid; + } + + ip->i_size = le64_to_cpu(dip->di_size); + ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec); + ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); + ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); + ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); + ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); + ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); + ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); + ip->i_generation = le32_to_cpu(dip->di_gen); + + jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */ + jfs_ip->acl = dip->di_acl; /* as are dxd's */ + jfs_ip->ea = dip->di_ea; + jfs_ip->next_index = le32_to_cpu(dip->di_next_index); + jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec); + jfs_ip->acltype = le32_to_cpu(dip->di_acltype); + + if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) { + jfs_ip->dev = le32_to_cpu(dip->di_rdev); + ip->i_rdev = new_decode_dev(jfs_ip->dev); + } + + if (S_ISDIR(ip->i_mode)) { + memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384); + } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) { + memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288); + } else + memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128); + + /* Zero the in-memory-only stuff */ + jfs_ip->cflag = 0; + jfs_ip->btindex = 0; + jfs_ip->btorder = 0; + jfs_ip->bxflag = 0; + jfs_ip->blid = 0; + jfs_ip->atlhead = 0; + jfs_ip->atltail = 0; + jfs_ip->xtlid = 0; + return (0); +} + +/* + * NAME: copy_to_dinode() + * + * FUNCTION: Copies inode info from in-memory inode to disk inode + */ +static void copy_to_dinode(struct dinode * dip, struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + + dip->di_fileset = cpu_to_le32(jfs_ip->fileset); + dip->di_inostamp = cpu_to_le32(sbi->inostamp); + dip->di_number = cpu_to_le32(ip->i_ino); + dip->di_gen = cpu_to_le32(ip->i_generation); + dip->di_size = cpu_to_le64(ip->i_size); + dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); + dip->di_nlink = cpu_to_le32(ip->i_nlink); + if (!uid_valid(sbi->uid)) + dip->di_uid = cpu_to_le32(i_uid_read(ip)); + else + dip->di_uid =cpu_to_le32(from_kuid(&init_user_ns, + jfs_ip->saved_uid)); + if (!gid_valid(sbi->gid)) + dip->di_gid = cpu_to_le32(i_gid_read(ip)); + else + dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns, + jfs_ip->saved_gid)); + /* + * mode2 is only needed for storing the higher order bits. + * Trust i_mode for the lower order ones + */ + if (sbi->umask == -1) + dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | + ip->i_mode); + else /* Leave the original permissions alone */ + dip->di_mode = cpu_to_le32(jfs_ip->mode2); + + dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); + dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); + dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec); + dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec); + dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); + dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); + dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ + dip->di_acl = jfs_ip->acl; /* as are dxd's */ + dip->di_ea = jfs_ip->ea; + dip->di_next_index = cpu_to_le32(jfs_ip->next_index); + dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime); + dip->di_otime.tv_nsec = 0; + dip->di_acltype = cpu_to_le32(jfs_ip->acltype); + if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) + dip->di_rdev = cpu_to_le32(jfs_ip->dev); +} diff --git a/fs/jfs/jfs_imap.h b/fs/jfs/jfs_imap.h new file mode 100644 index 000000000..610a0e9d8 --- /dev/null +++ b/fs/jfs/jfs_imap.h @@ -0,0 +1,175 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_IMAP +#define _H_JFS_IMAP + +#include "jfs_txnmgr.h" + +/* + * jfs_imap.h: disk inode manager + */ + +#define EXTSPERIAG 128 /* number of disk inode extent per iag */ +#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ +#define SMAPSZ 4 /* number of words per summary map */ +#define EXTSPERSUM 32 /* number of extents per summary map entry */ +#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ +#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ +#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ +#define MAXAG 128 /* maximum number of allocation groups */ + +#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ +#define SMAPSIZE 16 /* bytes in the IAG summary maps */ + +/* convert inode number to iag number */ +#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) + +/* convert iag number to logical block number of the iag page */ +#define IAGTOLBLK(iagno,l2nbperpg) (((iagno) + 1) << (l2nbperpg)) + +/* get the starting block number of the 4K page of an inode extent + * that contains ino. + */ +#define INOPBLK(pxd,ino,l2nbperpg) (addressPXD((pxd)) + \ + ((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg))) + +/* + * inode allocation map: + * + * inode allocation map consists of + * . the inode map control page and + * . inode allocation group pages (per 4096 inodes) + * which are addressed by standard JFS xtree. + */ +/* + * inode allocation group page (per 4096 inodes of an AG) + */ +struct iag { + __le64 agstart; /* 8: starting block of ag */ + __le32 iagnum; /* 4: inode allocation group number */ + __le32 inofreefwd; /* 4: ag inode free list forward */ + __le32 inofreeback; /* 4: ag inode free list back */ + __le32 extfreefwd; /* 4: ag inode extent free list forward */ + __le32 extfreeback; /* 4: ag inode extent free list back */ + __le32 iagfree; /* 4: iag free list */ + + /* summary map: 1 bit per inode extent */ + __le32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; + * note: this indicates free and backed + * inodes, if the extent is not backed the + * value will be 1. if the extent is + * backed but all inodes are being used the + * value will be 1. if the extent is + * backed but at least one of the inodes is + * free the value will be 0. + */ + __le32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ + __le32 nfreeinos; /* 4: number of free inodes */ + __le32 nfreeexts; /* 4: number of free extents */ + /* (72) */ + u8 pad[1976]; /* 1976: pad to 2048 bytes */ + /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ + __le32 wmap[EXTSPERIAG]; /* 512: working allocation map */ + __le32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ + pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ +}; /* (4096) */ + +/* + * per AG control information (in inode map control page) + */ +struct iagctl_disk { + __le32 inofree; /* 4: free inode list anchor */ + __le32 extfree; /* 4: free extent list anchor */ + __le32 numinos; /* 4: number of backed inodes */ + __le32 numfree; /* 4: number of free inodes */ +}; /* (16) */ + +struct iagctl { + int inofree; /* free inode list anchor */ + int extfree; /* free extent list anchor */ + int numinos; /* number of backed inodes */ + int numfree; /* number of free inodes */ +}; + +/* + * per fileset/aggregate inode map control page + */ +struct dinomap_disk { + __le32 in_freeiag; /* 4: free iag list anchor */ + __le32 in_nextiag; /* 4: next free iag number */ + __le32 in_numinos; /* 4: num of backed inodes */ + __le32 in_numfree; /* 4: num of free backed inodes */ + __le32 in_nbperiext; /* 4: num of blocks per inode extent */ + __le32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ + __le32 in_diskblock; /* 4: for standalone test driver */ + __le32 in_maxag; /* 4: for standalone test driver */ + u8 pad[2016]; /* 2016: pad to 2048 */ + struct iagctl_disk in_agctl[MAXAG]; /* 2048: AG control information */ +}; /* (4096) */ + +struct dinomap { + int in_freeiag; /* free iag list anchor */ + int in_nextiag; /* next free iag number */ + int in_numinos; /* num of backed inodes */ + int in_numfree; /* num of free backed inodes */ + int in_nbperiext; /* num of blocks per inode extent */ + int in_l2nbperiext; /* l2 of in_nbperiext */ + int in_diskblock; /* for standalone test driver */ + int in_maxag; /* for standalone test driver */ + struct iagctl in_agctl[MAXAG]; /* AG control information */ +}; + +/* + * In-core inode map control page + */ +struct inomap { + struct dinomap im_imap; /* 4096: inode allocation control */ + struct inode *im_ipimap; /* 4: ptr to inode for imap */ + struct mutex im_freelock; /* 4: iag free list lock */ + struct mutex im_aglock[MAXAG]; /* 512: per AG locks */ + u32 *im_DBGdimap; + atomic_t im_numinos; /* num of backed inodes */ + atomic_t im_numfree; /* num of free backed inodes */ +}; + +#define im_freeiag im_imap.in_freeiag +#define im_nextiag im_imap.in_nextiag +#define im_agctl im_imap.in_agctl +#define im_nbperiext im_imap.in_nbperiext +#define im_l2nbperiext im_imap.in_l2nbperiext + +/* for standalone testdriver + */ +#define im_diskblock im_imap.in_diskblock +#define im_maxag im_imap.in_maxag + +extern int diFree(struct inode *); +extern int diAlloc(struct inode *, bool, struct inode *); +extern int diSync(struct inode *); +/* external references */ +extern int diUpdatePMap(struct inode *ipimap, unsigned long inum, + bool is_free, struct tblock * tblk); +extern int diExtendFS(struct inode *ipimap, struct inode *ipbmap); +extern int diMount(struct inode *); +extern int diUnmount(struct inode *, int); +extern int diRead(struct inode *); +extern struct inode *diReadSpecial(struct super_block *, ino_t, int); +extern void diWriteSpecial(struct inode *, int); +extern void diFreeSpecial(struct inode *); +extern int diWrite(tid_t tid, struct inode *); +#endif /* _H_JFS_IMAP */ diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h new file mode 100644 index 000000000..912a3af23 --- /dev/null +++ b/fs/jfs/jfs_incore.h @@ -0,0 +1,229 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_INCORE +#define _H_JFS_INCORE + +#include <linux/mutex.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/bitops.h> +#include "jfs_types.h" +#include "jfs_xtree.h" +#include "jfs_dtree.h" + +/* + * JFS magic number + */ +#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */ + +/* + * JFS-private inode information + */ +struct jfs_inode_info { + int fileset; /* fileset number (always 16)*/ + uint mode2; /* jfs-specific mode */ + kuid_t saved_uid; /* saved for uid mount option */ + kgid_t saved_gid; /* saved for gid mount option */ + pxd_t ixpxd; /* inode extent descriptor */ + dxd_t acl; /* dxd describing acl */ + dxd_t ea; /* dxd describing ea */ + time64_t otime; /* time created */ + uint next_index; /* next available directory entry index */ + int acltype; /* Type of ACL */ + short btorder; /* access order */ + short btindex; /* btpage entry index*/ + struct inode *ipimap; /* inode map */ + unsigned long cflag; /* commit flags */ + u64 agstart; /* agstart of the containing IAG */ + u16 bxflag; /* xflag of pseudo buffer? */ + unchar pad; + signed char active_ag; /* ag currently allocating from */ + lid_t blid; /* lid of pseudo buffer? */ + lid_t atlhead; /* anonymous tlock list head */ + lid_t atltail; /* anonymous tlock list tail */ + spinlock_t ag_lock; /* protects active_ag */ + struct list_head anon_inode_list; /* inodes having anonymous txns */ + /* + * rdwrlock serializes xtree between reads & writes and synchronizes + * changes to special inodes. It's use would be redundant on + * directories since the i_mutex taken in the VFS is sufficient. + */ + struct rw_semaphore rdwrlock; + /* + * commit_mutex serializes transaction processing on an inode. + * It must be taken after beginning a transaction (txBegin), since + * dirty inodes may be committed while a new transaction on the + * inode is blocked in txBegin or TxBeginAnon + */ + struct mutex commit_mutex; + /* xattr_sem allows us to access the xattrs without taking i_mutex */ + struct rw_semaphore xattr_sem; + lid_t xtlid; /* lid of xtree lock on directory */ + union { + struct { + xtpage_t _xtroot; /* 288: xtree root */ + struct inomap *_imap; /* 4: inode map header */ + } file; + struct { + struct dir_table_slot _table[12]; /* 96: dir index */ + dtroot_t _dtroot; /* 288: dtree root */ + } dir; + struct { + unchar _unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + /* _inline may overflow into _inline_ea when needed */ + unchar _inline[128]; /* 128: inline symlink */ + /* _inline_ea may overlay the last part of + * file._xtroot if maxentry = XTROOTINITSLOT + */ + unchar _inline_ea[128]; /* 128: inline extended attr */ + } link; + } u; +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + u32 dev; /* will die when we get wide dev_t */ + struct inode vfs_inode; +}; +#define i_xtroot u.file._xtroot +#define i_imap u.file._imap +#define i_dirtable u.dir._table +#define i_dtroot u.dir._dtroot +#define i_inline u.link._inline +#define i_inline_ea u.link._inline_ea + +#define IREAD_LOCK(ip, subclass) \ + down_read_nested(&JFS_IP(ip)->rdwrlock, subclass) +#define IREAD_UNLOCK(ip) up_read(&JFS_IP(ip)->rdwrlock) +#define IWRITE_LOCK(ip, subclass) \ + down_write_nested(&JFS_IP(ip)->rdwrlock, subclass) +#define IWRITE_UNLOCK(ip) up_write(&JFS_IP(ip)->rdwrlock) + +/* + * cflag + */ +enum cflags { + COMMIT_Nolink, /* inode committed with zero link count */ + COMMIT_Inlineea, /* commit inode inline EA */ + COMMIT_Freewmap, /* free WMAP at iClose() */ + COMMIT_Dirty, /* Inode is really dirty */ + COMMIT_Dirtable, /* commit changes to di_dirtable */ + COMMIT_Stale, /* data extent is no longer valid */ + COMMIT_Synclist, /* metadata pages on group commit synclist */ +}; + +/* + * commit_mutex nesting subclasses: + */ +enum commit_mutex_class +{ + COMMIT_MUTEX_PARENT, + COMMIT_MUTEX_CHILD, + COMMIT_MUTEX_SECOND_PARENT, /* Renaming */ + COMMIT_MUTEX_VICTIM /* Inode being unlinked due to rename */ +}; + +/* + * rdwrlock subclasses: + * The dmap inode may be locked while a normal inode or the imap inode are + * locked. + */ +enum rdwrlock_class +{ + RDWRLOCK_NORMAL, + RDWRLOCK_IMAP, + RDWRLOCK_DMAP +}; + +#define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) +#define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag)) +#define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag)) +#define test_and_clear_cflag(flag, ip) \ + test_and_clear_bit(flag, &(JFS_IP(ip)->cflag)) +/* + * JFS-private superblock information. + */ +struct jfs_sb_info { + struct super_block *sb; /* Point back to vfs super block */ + unsigned long mntflag; /* aggregate attributes */ + struct inode *ipbmap; /* block map inode */ + struct inode *ipaimap; /* aggregate inode map inode */ + struct inode *ipaimap2; /* secondary aimap inode */ + struct inode *ipimap; /* aggregate inode map inode */ + struct jfs_log *log; /* log */ + struct list_head log_list; /* volumes associated with a journal */ + short bsize; /* logical block size */ + short l2bsize; /* log2 logical block size */ + short nbperpage; /* blocks per page */ + short l2nbperpage; /* log2 blocks per page */ + short l2niperblk; /* log2 inodes per page */ + dev_t logdev; /* external log device */ + uint aggregate; /* volume identifier in log record */ + pxd_t logpxd; /* pxd describing log */ + pxd_t fsckpxd; /* pxd describing fsck wkspc */ + pxd_t ait2; /* pxd describing AIT copy */ + char uuid[16]; /* 128-bit uuid for volume */ + char loguuid[16]; /* 128-bit uuid for log */ + /* + * commit_state is used for synchronization of the jfs_commit + * threads. It is protected by LAZY_LOCK(). + */ + int commit_state; /* commit state */ + /* Formerly in ipimap */ + uint gengen; /* inode generation generator*/ + uint inostamp; /* shows inode belongs to fileset*/ + + /* Formerly in ipbmap */ + struct bmap *bmap; /* incore bmap descriptor */ + struct nls_table *nls_tab; /* current codepage */ + struct inode *direct_inode; /* metadata inode */ + uint state; /* mount/recovery state */ + unsigned long flag; /* mount time flags */ + uint p_state; /* state prior to going no integrity */ + kuid_t uid; /* uid to override on-disk uid */ + kgid_t gid; /* gid to override on-disk gid */ + uint umask; /* umask to override on-disk umask */ + uint minblks_trim; /* minimum blocks, for online trim */ +}; + +/* jfs_sb_info commit_state */ +#define IN_LAZYCOMMIT 1 + +static inline struct jfs_inode_info *JFS_IP(struct inode *inode) +{ + return container_of(inode, struct jfs_inode_info, vfs_inode); +} + +static inline int jfs_dirtable_inline(struct inode *inode) +{ + return (JFS_IP(inode)->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1)); +} + +static inline struct jfs_sb_info *JFS_SBI(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline int isReadOnly(struct inode *inode) +{ + if (JFS_SBI(inode->i_sb)->log) + return 0; + return 1; +} +#endif /* _H_JFS_INCORE */ diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c new file mode 100644 index 000000000..4572b7cf1 --- /dev/null +++ b/fs/jfs/jfs_inode.c @@ -0,0 +1,147 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/quotaops.h> +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_imap.h" +#include "jfs_dinode.h" +#include "jfs_debug.h" + + +void jfs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = JFS_IP(inode)->mode2; + unsigned int new_fl = 0; + + if (flags & JFS_IMMUTABLE_FL) + new_fl |= S_IMMUTABLE; + if (flags & JFS_APPEND_FL) + new_fl |= S_APPEND; + if (flags & JFS_NOATIME_FL) + new_fl |= S_NOATIME; + if (flags & JFS_DIRSYNC_FL) + new_fl |= S_DIRSYNC; + if (flags & JFS_SYNC_FL) + new_fl |= S_SYNC; + inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND | S_NOATIME | + S_DIRSYNC | S_SYNC); +} + +/* + * NAME: ialloc() + * + * FUNCTION: Allocate a new inode + * + */ +struct inode *ialloc(struct inode *parent, umode_t mode) +{ + struct super_block *sb = parent->i_sb; + struct inode *inode; + struct jfs_inode_info *jfs_inode; + int rc; + + inode = new_inode(sb); + if (!inode) { + jfs_warn("ialloc: new_inode returned NULL!"); + return ERR_PTR(-ENOMEM); + } + + jfs_inode = JFS_IP(inode); + + rc = diAlloc(parent, S_ISDIR(mode), inode); + if (rc) { + jfs_warn("ialloc: diAlloc returned %d!", rc); + goto fail_put; + } + + if (insert_inode_locked(inode) < 0) { + rc = -EINVAL; + goto fail_put; + } + + inode_init_owner(inode, parent, mode); + /* + * New inodes need to save sane values on disk when + * uid & gid mount options are used + */ + jfs_inode->saved_uid = inode->i_uid; + jfs_inode->saved_gid = inode->i_gid; + + /* + * Allocate inode to quota. + */ + rc = dquot_initialize(inode); + if (rc) + goto fail_drop; + rc = dquot_alloc_inode(inode); + if (rc) + goto fail_drop; + + /* inherit flags from parent */ + jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; + + if (S_ISDIR(mode)) { + jfs_inode->mode2 |= IDIRECTORY; + jfs_inode->mode2 &= ~JFS_DIRSYNC_FL; + } + else { + jfs_inode->mode2 |= INLINEEA | ISPARSE; + if (S_ISLNK(mode)) + jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); + } + jfs_inode->mode2 |= inode->i_mode; + + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + jfs_inode->otime = inode->i_ctime.tv_sec; + inode->i_generation = JFS_SBI(sb)->gengen++; + + jfs_inode->cflag = 0; + + /* Zero remaining fields */ + memset(&jfs_inode->acl, 0, sizeof(dxd_t)); + memset(&jfs_inode->ea, 0, sizeof(dxd_t)); + jfs_inode->next_index = 0; + jfs_inode->acltype = 0; + jfs_inode->btorder = 0; + jfs_inode->btindex = 0; + jfs_inode->bxflag = 0; + jfs_inode->blid = 0; + jfs_inode->atlhead = 0; + jfs_inode->atltail = 0; + jfs_inode->xtlid = 0; + jfs_set_inode_flags(inode); + + jfs_info("ialloc returns inode = 0x%p", inode); + + return inode; + +fail_drop: + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + clear_nlink(inode); + discard_new_inode(inode); + return ERR_PTR(rc); + +fail_put: + iput(inode); + return ERR_PTR(rc); +} diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h new file mode 100644 index 000000000..7b0b3a407 --- /dev/null +++ b/fs/jfs/jfs_inode.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_INODE +#define _H_JFS_INODE + +struct fid; + +extern struct inode *ialloc(struct inode *, umode_t); +extern int jfs_fsync(struct file *, loff_t, loff_t, int); +extern long jfs_ioctl(struct file *, unsigned int, unsigned long); +extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); +extern struct inode *jfs_iget(struct super_block *, unsigned long); +extern int jfs_commit_inode(struct inode *, int); +extern int jfs_write_inode(struct inode *, struct writeback_control *); +extern void jfs_evict_inode(struct inode *); +extern void jfs_dirty_inode(struct inode *, int); +extern void jfs_truncate(struct inode *); +extern void jfs_truncate_nolock(struct inode *, loff_t); +extern void jfs_free_zero_link(struct inode *); +extern struct dentry *jfs_get_parent(struct dentry *dentry); +extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type); +extern void jfs_set_inode_flags(struct inode *); +extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int jfs_setattr(struct dentry *, struct iattr *); + +extern const struct address_space_operations jfs_aops; +extern const struct inode_operations jfs_dir_inode_operations; +extern const struct file_operations jfs_dir_operations; +extern const struct inode_operations jfs_file_inode_operations; +extern const struct file_operations jfs_file_operations; +extern const struct inode_operations jfs_symlink_inode_operations; +extern const struct inode_operations jfs_fast_symlink_inode_operations; +extern const struct dentry_operations jfs_ci_dentry_operations; +#endif /* _H_JFS_INODE */ diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h new file mode 100644 index 000000000..ecf048822 --- /dev/null +++ b/fs/jfs/jfs_lock.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2001 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_LOCK +#define _H_JFS_LOCK + +#include <linux/spinlock.h> +#include <linux/mutex.h> +#include <linux/sched.h> + +/* + * jfs_lock.h + */ + +/* + * Conditional sleep where condition is protected by spinlock + * + * lock_cmd and unlock_cmd take and release the spinlock + */ +#define __SLEEP_COND(wq, cond, lock_cmd, unlock_cmd) \ +do { \ + DECLARE_WAITQUEUE(__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE);\ + if (cond) \ + break; \ + unlock_cmd; \ + io_schedule(); \ + lock_cmd; \ + } \ + __set_current_state(TASK_RUNNING); \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#endif /* _H_JFS_LOCK */ diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c new file mode 100644 index 000000000..356d1fcf7 --- /dev/null +++ b/fs/jfs/jfs_logmgr.c @@ -0,0 +1,2514 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_logmgr.c: log manager + * + * for related information, see transaction manager (jfs_txnmgr.c), and + * recovery manager (jfs_logredo.c). + * + * note: for detail, RTFS. + * + * log buffer manager: + * special purpose buffer manager supporting log i/o requirements. + * per log serial pageout of logpage + * queuing i/o requests and redrive i/o at iodone + * maintain current logpage buffer + * no caching since append only + * appropriate jfs buffer cache buffers as needed + * + * group commit: + * transactions which wrote COMMIT records in the same in-memory + * log page during the pageout of previous/current log page(s) are + * committed together by the pageout of the page. + * + * TBD lazy commit: + * transactions are committed asynchronously when the log page + * containing it COMMIT is paged out when it becomes full; + * + * serialization: + * . a per log lock serialize log write. + * . a per log lock serialize group commit. + * . a per log lock serialize log open/close; + * + * TBD log integrity: + * careful-write (ping-pong) of last logpage to recover from crash + * in overwrite. + * detection of split (out-of-order) write of physical sectors + * of last logpage via timestamp at end of each sector + * with its mirror data array at trailer). + * + * alternatives: + * lsn - 64-bit monotonically increasing integer vs + * 32-bit lspn and page eor. + */ + +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/interrupt.h> +#include <linux/completion.h> +#include <linux/kthread.h> +#include <linux/buffer_head.h> /* for sync_blockdev() */ +#include <linux/bio.h> +#include <linux/freezer.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <linux/mutex.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + + +/* + * lbuf's ready to be redriven. Protected by log_redrive_lock (jfsIO thread) + */ +static struct lbuf *log_redrive_list; +static DEFINE_SPINLOCK(log_redrive_lock); + + +/* + * log read/write serialization (per log) + */ +#define LOG_LOCK_INIT(log) mutex_init(&(log)->loglock) +#define LOG_LOCK(log) mutex_lock(&((log)->loglock)) +#define LOG_UNLOCK(log) mutex_unlock(&((log)->loglock)) + + +/* + * log group commit serialization (per log) + */ + +#define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock) +#define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock) +#define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock) +#define LOGGC_WAKEUP(tblk) wake_up_all(&(tblk)->gcwait) + +/* + * log sync serialization (per log) + */ +#define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE) +#define LOGSYNC_BARRIER(logsize) ((logsize)/4) +/* +#define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE) +#define LOGSYNC_BARRIER(logsize) ((logsize)/2) +*/ + + +/* + * log buffer cache synchronization + */ +static DEFINE_SPINLOCK(jfsLCacheLock); + +#define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags) +#define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags) + +/* + * See __SLEEP_COND in jfs_locks.h + */ +#define LCACHE_SLEEP_COND(wq, cond, flags) \ +do { \ + if (cond) \ + break; \ + __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \ +} while (0) + +#define LCACHE_WAKEUP(event) wake_up(event) + + +/* + * lbuf buffer cache (lCache) control + */ +/* log buffer manager pageout control (cumulative, inclusive) */ +#define lbmREAD 0x0001 +#define lbmWRITE 0x0002 /* enqueue at tail of write queue; + * init pageout if at head of queue; + */ +#define lbmRELEASE 0x0004 /* remove from write queue + * at completion of pageout; + * do not free/recycle it yet: + * caller will free it; + */ +#define lbmSYNC 0x0008 /* do not return to freelist + * when removed from write queue; + */ +#define lbmFREE 0x0010 /* return to freelist + * at completion of pageout; + * the buffer may be recycled; + */ +#define lbmDONE 0x0020 +#define lbmERROR 0x0040 +#define lbmGC 0x0080 /* lbmIODone to perform post-GC processing + * of log page + */ +#define lbmDIRECT 0x0100 + +/* + * Global list of active external journals + */ +static LIST_HEAD(jfs_external_logs); +static struct jfs_log *dummy_log; +static DEFINE_MUTEX(jfs_log_mutex); + +/* + * forward references + */ +static int lmWriteRecord(struct jfs_log * log, struct tblock * tblk, + struct lrd * lrd, struct tlock * tlck); + +static int lmNextPage(struct jfs_log * log); +static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, + int activate); + +static int open_inline_log(struct super_block *sb); +static int open_dummy_log(struct super_block *sb); +static int lbmLogInit(struct jfs_log * log); +static void lbmLogShutdown(struct jfs_log * log); +static struct lbuf *lbmAllocate(struct jfs_log * log, int); +static void lbmFree(struct lbuf * bp); +static void lbmfree(struct lbuf * bp); +static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp); +static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, int cant_block); +static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag); +static int lbmIOWait(struct lbuf * bp, int flag); +static bio_end_io_t lbmIODone; +static void lbmStartIO(struct lbuf * bp); +static void lmGCwrite(struct jfs_log * log, int cant_block); +static int lmLogSync(struct jfs_log * log, int hard_sync); + + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +static struct lmStat { + uint commit; /* # of commit */ + uint pagedone; /* # of page written */ + uint submitted; /* # of pages submitted */ + uint full_page; /* # of full pages submitted */ + uint partial_page; /* # of partial pages submitted */ +} lmStat; +#endif + +static void write_special_inodes(struct jfs_log *log, + int (*writer)(struct address_space *)) +{ + struct jfs_sb_info *sbi; + + list_for_each_entry(sbi, &log->sb_list, log_list) { + writer(sbi->ipbmap->i_mapping); + writer(sbi->ipimap->i_mapping); + writer(sbi->direct_inode->i_mapping); + } +} + +/* + * NAME: lmLog() + * + * FUNCTION: write a log record; + * + * PARAMETER: + * + * RETURN: lsn - offset to the next log record to write (end-of-log); + * -1 - error; + * + * note: todo: log error handler + */ +int lmLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + int lsn; + int diffp, difft; + struct metapage *mp = NULL; + unsigned long flags; + + jfs_info("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p", + log, tblk, lrd, tlck); + + LOG_LOCK(log); + + /* log by (out-of-transaction) JFS ? */ + if (tblk == NULL) + goto writeRecord; + + /* log from page ? */ + if (tlck == NULL || + tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL) + goto writeRecord; + + /* + * initialize/update page/transaction recovery lsn + */ + lsn = log->lsn; + + LOGSYNC_LOCK(log, flags); + + /* + * initialize page lsn if first log write of the page + */ + if (mp->lsn == 0) { + mp->log = log; + mp->lsn = lsn; + log->count++; + + /* insert page at tail of logsynclist */ + list_add_tail(&mp->synclist, &log->synclist); + } + + /* + * initialize/update lsn of tblock of the page + * + * transaction inherits oldest lsn of pages associated + * with allocation/deallocation of resources (their + * log records are used to reconstruct allocation map + * at recovery time: inode for inode allocation map, + * B+-tree index of extent descriptors for block + * allocation map); + * allocation map pages inherit transaction lsn at + * commit time to allow forwarding log syncpt past log + * records associated with allocation/deallocation of + * resources only after persistent map of these map pages + * have been updated and propagated to home. + */ + /* + * initialize transaction lsn: + */ + if (tblk->lsn == 0) { + /* inherit lsn of its first page logged */ + tblk->lsn = mp->lsn; + log->count++; + + /* insert tblock after the page on logsynclist */ + list_add(&tblk->synclist, &mp->synclist); + } + /* + * update transaction lsn: + */ + else { + /* inherit oldest/smallest lsn of page */ + logdiff(diffp, mp->lsn, log); + logdiff(difft, tblk->lsn, log); + if (diffp < difft) { + /* update tblock lsn with page lsn */ + tblk->lsn = mp->lsn; + + /* move tblock after page on logsynclist */ + list_move(&tblk->synclist, &mp->synclist); + } + } + + LOGSYNC_UNLOCK(log, flags); + + /* + * write the log record + */ + writeRecord: + lsn = lmWriteRecord(log, tblk, lrd, tlck); + + /* + * forward log syncpt if log reached next syncpt trigger + */ + logdiff(diffp, lsn, log); + if (diffp >= log->nextsync) + lsn = lmLogSync(log, 0); + + /* update end-of-log lsn */ + log->lsn = lsn; + + LOG_UNLOCK(log); + + /* return end-of-log address */ + return lsn; +} + +/* + * NAME: lmWriteRecord() + * + * FUNCTION: move the log record to current log page + * + * PARAMETER: cd - commit descriptor + * + * RETURN: end-of-log address + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int +lmWriteRecord(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + int lsn = 0; /* end-of-log address */ + struct lbuf *bp; /* dst log page buffer */ + struct logpage *lp; /* dst log page */ + caddr_t dst; /* destination address in log page */ + int dstoffset; /* end-of-log offset in log page */ + int freespace; /* free space in log page */ + caddr_t p; /* src meta-data page */ + caddr_t src; + int srclen; + int nbytes; /* number of bytes to move */ + int i; + int len; + struct linelock *linelock; + struct lv *lv; + struct lvd *lvd; + int l2linesize; + + len = 0; + + /* retrieve destination log page to write */ + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = log->eor; + + /* any log data to write ? */ + if (tlck == NULL) + goto moveLrd; + + /* + * move log record data + */ + /* retrieve source meta-data page to log */ + if (tlck->flag & tlckPAGELOCK) { + p = (caddr_t) (tlck->mp->data); + linelock = (struct linelock *) & tlck->lock; + } + /* retrieve source in-memory inode to log */ + else if (tlck->flag & tlckINODELOCK) { + if (tlck->type & tlckDTREE) + p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot; + else + p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot; + linelock = (struct linelock *) & tlck->lock; + } +#ifdef _JFS_WIP + else if (tlck->flag & tlckINLINELOCK) { + + inlinelock = (struct inlinelock *) & tlck; + p = (caddr_t) & inlinelock->pxd; + linelock = (struct linelock *) & tlck; + } +#endif /* _JFS_WIP */ + else { + jfs_err("lmWriteRecord: UFO tlck:0x%p", tlck); + return 0; /* Probably should trap */ + } + l2linesize = linelock->l2linesize; + + moveData: + ASSERT(linelock->index <= linelock->maxcnt); + + lv = linelock->lv; + for (i = 0; i < linelock->index; i++, lv++) { + if (lv->length == 0) + continue; + + /* is page full ? */ + if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) { + /* page become full: move on to next page */ + lmNextPage(log); + + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + } + + /* + * move log vector data + */ + src = (u8 *) p + (lv->offset << l2linesize); + srclen = lv->length << l2linesize; + len += srclen; + while (srclen > 0) { + freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; + nbytes = min(freespace, srclen); + dst = (caddr_t) lp + dstoffset; + memcpy(dst, src, nbytes); + dstoffset += nbytes; + + /* is page not full ? */ + if (dstoffset < LOGPSIZE - LOGPTLRSIZE) + break; + + /* page become full: move on to next page */ + lmNextPage(log); + + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + + srclen -= nbytes; + src += nbytes; + } + + /* + * move log vector descriptor + */ + len += 4; + lvd = (struct lvd *) ((caddr_t) lp + dstoffset); + lvd->offset = cpu_to_le16(lv->offset); + lvd->length = cpu_to_le16(lv->length); + dstoffset += 4; + jfs_info("lmWriteRecord: lv offset:%d length:%d", + lv->offset, lv->length); + } + + if ((i = linelock->next)) { + linelock = (struct linelock *) lid_to_tlock(i); + goto moveData; + } + + /* + * move log record descriptor + */ + moveLrd: + lrd->length = cpu_to_le16(len); + + src = (caddr_t) lrd; + srclen = LOGRDSIZE; + + while (srclen > 0) { + freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; + nbytes = min(freespace, srclen); + dst = (caddr_t) lp + dstoffset; + memcpy(dst, src, nbytes); + + dstoffset += nbytes; + srclen -= nbytes; + + /* are there more to move than freespace of page ? */ + if (srclen) + goto pageFull; + + /* + * end of log record descriptor + */ + + /* update last log record eor */ + log->eor = dstoffset; + bp->l_eor = dstoffset; + lsn = (log->page << L2LOGPSIZE) + dstoffset; + + if (lrd->type & cpu_to_le16(LOG_COMMIT)) { + tblk->clsn = lsn; + jfs_info("wr: tclsn:0x%x, beor:0x%x", tblk->clsn, + bp->l_eor); + + INCREMENT(lmStat.commit); /* # of commit */ + + /* + * enqueue tblock for group commit: + * + * enqueue tblock of non-trivial/synchronous COMMIT + * at tail of group commit queue + * (trivial/asynchronous COMMITs are ignored by + * group commit.) + */ + LOGGC_LOCK(log); + + /* init tblock gc state */ + tblk->flag = tblkGC_QUEUE; + tblk->bp = log->bp; + tblk->pn = log->page; + tblk->eor = log->eor; + + /* enqueue transaction to commit queue */ + list_add_tail(&tblk->cqueue, &log->cqueue); + + LOGGC_UNLOCK(log); + } + + jfs_info("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x", + le16_to_cpu(lrd->type), log->bp, log->page, dstoffset); + + /* page not full ? */ + if (dstoffset < LOGPSIZE - LOGPTLRSIZE) + return lsn; + + pageFull: + /* page become full: move on to next page */ + lmNextPage(log); + + bp = (struct lbuf *) log->bp; + lp = (struct logpage *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + src += nbytes; + } + + return lsn; +} + + +/* + * NAME: lmNextPage() + * + * FUNCTION: write current page and allocate next page. + * + * PARAMETER: log + * + * RETURN: 0 + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int lmNextPage(struct jfs_log * log) +{ + struct logpage *lp; + int lspn; /* log sequence page number */ + int pn; /* current page number */ + struct lbuf *bp; + struct lbuf *nextbp; + struct tblock *tblk; + + /* get current log page number and log sequence page number */ + pn = log->page; + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + lspn = le32_to_cpu(lp->h.page); + + LOGGC_LOCK(log); + + /* + * write or queue the full page at the tail of write queue + */ + /* get the tail tblk on commit queue */ + if (list_empty(&log->cqueue)) + tblk = NULL; + else + tblk = list_entry(log->cqueue.prev, struct tblock, cqueue); + + /* every tblk who has COMMIT record on the current page, + * and has not been committed, must be on commit queue + * since tblk is queued at commit queueu at the time + * of writing its COMMIT record on the page before + * page becomes full (even though the tblk thread + * who wrote COMMIT record may have been suspended + * currently); + */ + + /* is page bound with outstanding tail tblk ? */ + if (tblk && tblk->pn == pn) { + /* mark tblk for end-of-page */ + tblk->flag |= tblkGC_EOP; + + if (log->cflag & logGC_PAGEOUT) { + /* if page is not already on write queue, + * just enqueue (no lbmWRITE to prevent redrive) + * buffer to wqueue to ensure correct serial order + * of the pages since log pages will be added + * continuously + */ + if (bp->l_wqnext == NULL) + lbmWrite(log, bp, 0, 0); + } else { + /* + * No current GC leader, initiate group commit + */ + log->cflag |= logGC_PAGEOUT; + lmGCwrite(log, 0); + } + } + /* page is not bound with outstanding tblk: + * init write or mark it to be redriven (lbmWRITE) + */ + else { + /* finalize the page */ + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0); + } + LOGGC_UNLOCK(log); + + /* + * allocate/initialize next page + */ + /* if log wraps, the first data page of log is 2 + * (0 never used, 1 is superblock). + */ + log->page = (pn == log->size - 1) ? 2 : pn + 1; + log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */ + + /* allocate/initialize next log page buffer */ + nextbp = lbmAllocate(log, log->page); + nextbp->l_eor = log->eor; + log->bp = nextbp; + + /* initialize next log page */ + lp = (struct logpage *) nextbp->l_ldata; + lp->h.page = lp->t.page = cpu_to_le32(lspn + 1); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); + + return 0; +} + + +/* + * NAME: lmGroupCommit() + * + * FUNCTION: group commit + * initiate pageout of the pages with COMMIT in the order of + * page number - redrive pageout of the page at the head of + * pageout queue until full page has been written. + * + * RETURN: + * + * NOTE: + * LOGGC_LOCK serializes log group commit queue, and + * transaction blocks on the commit queue. + * N.B. LOG_LOCK is NOT held during lmGroupCommit(). + */ +int lmGroupCommit(struct jfs_log * log, struct tblock * tblk) +{ + int rc = 0; + + LOGGC_LOCK(log); + + /* group committed already ? */ + if (tblk->flag & tblkGC_COMMITTED) { + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; + } + jfs_info("lmGroup Commit: tblk = 0x%p, gcrtc = %d", tblk, log->gcrtc); + + if (tblk->xflag & COMMIT_LAZY) + tblk->flag |= tblkGC_LAZY; + + if ((!(log->cflag & logGC_PAGEOUT)) && (!list_empty(&log->cqueue)) && + (!(tblk->xflag & COMMIT_LAZY) || test_bit(log_FLUSH, &log->flag) + || jfs_tlocks_low)) { + /* + * No pageout in progress + * + * start group commit as its group leader. + */ + log->cflag |= logGC_PAGEOUT; + + lmGCwrite(log, 0); + } + + if (tblk->xflag & COMMIT_LAZY) { + /* + * Lazy transactions can leave now + */ + LOGGC_UNLOCK(log); + return 0; + } + + /* lmGCwrite gives up LOGGC_LOCK, check again */ + + if (tblk->flag & tblkGC_COMMITTED) { + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; + } + + /* upcount transaction waiting for completion + */ + log->gcrtc++; + tblk->flag |= tblkGC_READY; + + __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED), + LOGGC_LOCK(log), LOGGC_UNLOCK(log)); + + /* removed from commit queue */ + if (tblk->flag & tblkGC_ERROR) + rc = -EIO; + + LOGGC_UNLOCK(log); + return rc; +} + +/* + * NAME: lmGCwrite() + * + * FUNCTION: group commit write + * initiate write of log page, building a group of all transactions + * with commit records on that page. + * + * RETURN: None + * + * NOTE: + * LOGGC_LOCK must be held by caller. + * N.B. LOG_LOCK is NOT held during lmGroupCommit(). + */ +static void lmGCwrite(struct jfs_log * log, int cant_write) +{ + struct lbuf *bp; + struct logpage *lp; + int gcpn; /* group commit page number */ + struct tblock *tblk; + struct tblock *xtblk = NULL; + + /* + * build the commit group of a log page + * + * scan commit queue and make a commit group of all + * transactions with COMMIT records on the same log page. + */ + /* get the head tblk on the commit queue */ + gcpn = list_entry(log->cqueue.next, struct tblock, cqueue)->pn; + + list_for_each_entry(tblk, &log->cqueue, cqueue) { + if (tblk->pn != gcpn) + break; + + xtblk = tblk; + + /* state transition: (QUEUE, READY) -> COMMIT */ + tblk->flag |= tblkGC_COMMIT; + } + tblk = xtblk; /* last tblk of the page */ + + /* + * pageout to commit transactions on the log page. + */ + bp = (struct lbuf *) tblk->bp; + lp = (struct logpage *) bp->l_ldata; + /* is page already full ? */ + if (tblk->flag & tblkGC_EOP) { + /* mark page to free at end of group commit of the page */ + tblk->flag &= ~tblkGC_EOP; + tblk->flag |= tblkGC_FREE; + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC, + cant_write); + INCREMENT(lmStat.full_page); + } + /* page is not yet full */ + else { + bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */ + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write); + INCREMENT(lmStat.partial_page); + } +} + +/* + * NAME: lmPostGC() + * + * FUNCTION: group commit post-processing + * Processes transactions after their commit records have been written + * to disk, redriving log I/O if necessary. + * + * RETURN: None + * + * NOTE: + * This routine is called a interrupt time by lbmIODone + */ +static void lmPostGC(struct lbuf * bp) +{ + unsigned long flags; + struct jfs_log *log = bp->l_log; + struct logpage *lp; + struct tblock *tblk, *temp; + + //LOGGC_LOCK(log); + spin_lock_irqsave(&log->gclock, flags); + /* + * current pageout of group commit completed. + * + * remove/wakeup transactions from commit queue who were + * group committed with the current log page + */ + list_for_each_entry_safe(tblk, temp, &log->cqueue, cqueue) { + if (!(tblk->flag & tblkGC_COMMIT)) + break; + /* if transaction was marked GC_COMMIT then + * it has been shipped in the current pageout + * and made it to disk - it is committed. + */ + + if (bp->l_flag & lbmERROR) + tblk->flag |= tblkGC_ERROR; + + /* remove it from the commit queue */ + list_del(&tblk->cqueue); + tblk->flag &= ~tblkGC_QUEUE; + + if (tblk == log->flush_tblk) { + /* we can stop flushing the log now */ + clear_bit(log_FLUSH, &log->flag); + log->flush_tblk = NULL; + } + + jfs_info("lmPostGC: tblk = 0x%p, flag = 0x%x", tblk, + tblk->flag); + + if (!(tblk->xflag & COMMIT_FORCE)) + /* + * Hand tblk over to lazy commit thread + */ + txLazyUnlock(tblk); + else { + /* state transition: COMMIT -> COMMITTED */ + tblk->flag |= tblkGC_COMMITTED; + + if (tblk->flag & tblkGC_READY) + log->gcrtc--; + + LOGGC_WAKEUP(tblk); + } + + /* was page full before pageout ? + * (and this is the last tblk bound with the page) + */ + if (tblk->flag & tblkGC_FREE) + lbmFree(bp); + /* did page become full after pageout ? + * (and this is the last tblk bound with the page) + */ + else if (tblk->flag & tblkGC_EOP) { + /* finalize the page */ + lp = (struct logpage *) bp->l_ldata; + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + jfs_info("lmPostGC: calling lbmWrite"); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, + 1); + } + + } + + /* are there any transactions who have entered lnGroupCommit() + * (whose COMMITs are after that of the last log page written. + * They are waiting for new group commit (above at (SLEEP 1)) + * or lazy transactions are on a full (queued) log page, + * select the latest ready transaction as new group leader and + * wake her up to lead her group. + */ + if ((!list_empty(&log->cqueue)) && + ((log->gcrtc > 0) || (tblk->bp->l_wqnext != NULL) || + test_bit(log_FLUSH, &log->flag) || jfs_tlocks_low)) + /* + * Call lmGCwrite with new group leader + */ + lmGCwrite(log, 1); + + /* no transaction are ready yet (transactions are only just + * queued (GC_QUEUE) and not entered for group commit yet). + * the first transaction entering group commit + * will elect herself as new group leader. + */ + else + log->cflag &= ~logGC_PAGEOUT; + + //LOGGC_UNLOCK(log); + spin_unlock_irqrestore(&log->gclock, flags); + return; +} + +/* + * NAME: lmLogSync() + * + * FUNCTION: write log SYNCPT record for specified log + * if new sync address is available + * (normally the case if sync() is executed by back-ground + * process). + * calculate new value of i_nextsync which determines when + * this code is called again. + * + * PARAMETERS: log - log structure + * hard_sync - 1 to force all metadata to be written + * + * RETURN: 0 + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int lmLogSync(struct jfs_log * log, int hard_sync) +{ + int logsize; + int written; /* written since last syncpt */ + int free; /* free space left available */ + int delta; /* additional delta to write normally */ + int more; /* additional write granted */ + struct lrd lrd; + int lsn; + struct logsyncblk *lp; + unsigned long flags; + + /* push dirty metapages out to disk */ + if (hard_sync) + write_special_inodes(log, filemap_fdatawrite); + else + write_special_inodes(log, filemap_flush); + + /* + * forward syncpt + */ + /* if last sync is same as last syncpt, + * invoke sync point forward processing to update sync. + */ + + if (log->sync == log->syncpt) { + LOGSYNC_LOCK(log, flags); + if (list_empty(&log->synclist)) + log->sync = log->lsn; + else { + lp = list_entry(log->synclist.next, + struct logsyncblk, synclist); + log->sync = lp->lsn; + } + LOGSYNC_UNLOCK(log, flags); + + } + + /* if sync is different from last syncpt, + * write a SYNCPT record with syncpt = sync. + * reset syncpt = sync + */ + if (log->sync != log->syncpt) { + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = cpu_to_le32(log->sync); + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + + log->syncpt = log->sync; + } else + lsn = log->lsn; + + /* + * setup next syncpt trigger (SWAG) + */ + logsize = log->logsize; + + logdiff(written, lsn, log); + free = logsize - written; + delta = LOGSYNC_DELTA(logsize); + more = min(free / 2, delta); + if (more < 2 * LOGPSIZE) { + jfs_warn("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n"); + /* + * log wrapping + * + * option 1 - panic ? No.! + * option 2 - shutdown file systems + * associated with log ? + * option 3 - extend log ? + * option 4 - second chance + * + * mark log wrapped, and continue. + * when all active transactions are completed, + * mark log valid for recovery. + * if crashed during invalid state, log state + * implies invalid log, forcing fsck(). + */ + /* mark log state log wrap in log superblock */ + /* log->state = LOGWRAP; */ + + /* reset sync point computation */ + log->syncpt = log->sync = lsn; + log->nextsync = delta; + } else + /* next syncpt trigger = written + more */ + log->nextsync = written + more; + + /* if number of bytes written from last sync point is more + * than 1/4 of the log size, stop new transactions from + * starting until all current transactions are completed + * by setting syncbarrier flag. + */ + if (!test_bit(log_SYNCBARRIER, &log->flag) && + (written > LOGSYNC_BARRIER(logsize)) && log->active) { + set_bit(log_SYNCBARRIER, &log->flag); + jfs_info("log barrier on: lsn=0x%x syncpt=0x%x", lsn, + log->syncpt); + /* + * We may have to initiate group commit + */ + jfs_flush_journal(log, 0); + } + + return lsn; +} + +/* + * NAME: jfs_syncpt + * + * FUNCTION: write log SYNCPT record for specified log + * + * PARAMETERS: log - log structure + * hard_sync - set to 1 to force metadata to be written + */ +void jfs_syncpt(struct jfs_log *log, int hard_sync) +{ LOG_LOCK(log); + if (!test_bit(log_QUIESCE, &log->flag)) + lmLogSync(log, hard_sync); + LOG_UNLOCK(log); +} + +/* + * NAME: lmLogOpen() + * + * FUNCTION: open the log on first open; + * insert filesystem in the active list of the log. + * + * PARAMETER: ipmnt - file system mount inode + * iplog - log inode (out) + * + * RETURN: + * + * serialization: + */ +int lmLogOpen(struct super_block *sb) +{ + int rc; + struct block_device *bdev; + struct jfs_log *log; + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (sbi->flag & JFS_NOINTEGRITY) + return open_dummy_log(sb); + + if (sbi->mntflag & JFS_INLINELOG) + return open_inline_log(sb); + + mutex_lock(&jfs_log_mutex); + list_for_each_entry(log, &jfs_external_logs, journal_list) { + if (log->bdev->bd_dev == sbi->logdev) { + if (memcmp(log->uuid, sbi->loguuid, + sizeof(log->uuid))) { + jfs_warn("wrong uuid on JFS journal"); + mutex_unlock(&jfs_log_mutex); + return -EINVAL; + } + /* + * add file system to log active file system list + */ + if ((rc = lmLogFileSystem(log, sbi, 1))) { + mutex_unlock(&jfs_log_mutex); + return rc; + } + goto journal_found; + } + } + + if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) { + mutex_unlock(&jfs_log_mutex); + return -ENOMEM; + } + INIT_LIST_HEAD(&log->sb_list); + init_waitqueue_head(&log->syncwait); + + /* + * external log as separate logical volume + * + * file systems to log may have n-to-1 relationship; + */ + + bdev = blkdev_get_by_dev(sbi->logdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, + log); + if (IS_ERR(bdev)) { + rc = PTR_ERR(bdev); + goto free; + } + + log->bdev = bdev; + memcpy(log->uuid, sbi->loguuid, sizeof(log->uuid)); + + /* + * initialize log: + */ + if ((rc = lmLogInit(log))) + goto close; + + list_add(&log->journal_list, &jfs_external_logs); + + /* + * add file system to log active file system list + */ + if ((rc = lmLogFileSystem(log, sbi, 1))) + goto shutdown; + +journal_found: + LOG_LOCK(log); + list_add(&sbi->log_list, &log->sb_list); + sbi->log = log; + LOG_UNLOCK(log); + + mutex_unlock(&jfs_log_mutex); + return 0; + + /* + * unwind on error + */ + shutdown: /* unwind lbmLogInit() */ + list_del(&log->journal_list); + lbmLogShutdown(log); + + close: /* close external log device */ + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + free: /* free log descriptor */ + mutex_unlock(&jfs_log_mutex); + kfree(log); + + jfs_warn("lmLogOpen: exit(%d)", rc); + return rc; +} + +static int open_inline_log(struct super_block *sb) +{ + struct jfs_log *log; + int rc; + + if (!(log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL))) + return -ENOMEM; + INIT_LIST_HEAD(&log->sb_list); + init_waitqueue_head(&log->syncwait); + + set_bit(log_INLINELOG, &log->flag); + log->bdev = sb->s_bdev; + log->base = addressPXD(&JFS_SBI(sb)->logpxd); + log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> + (L2LOGPSIZE - sb->s_blocksize_bits); + log->l2bsize = sb->s_blocksize_bits; + ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits); + + /* + * initialize log. + */ + if ((rc = lmLogInit(log))) { + kfree(log); + jfs_warn("lmLogOpen: exit(%d)", rc); + return rc; + } + + list_add(&JFS_SBI(sb)->log_list, &log->sb_list); + JFS_SBI(sb)->log = log; + + return rc; +} + +static int open_dummy_log(struct super_block *sb) +{ + int rc; + + mutex_lock(&jfs_log_mutex); + if (!dummy_log) { + dummy_log = kzalloc(sizeof(struct jfs_log), GFP_KERNEL); + if (!dummy_log) { + mutex_unlock(&jfs_log_mutex); + return -ENOMEM; + } + INIT_LIST_HEAD(&dummy_log->sb_list); + init_waitqueue_head(&dummy_log->syncwait); + dummy_log->no_integrity = 1; + /* Make up some stuff */ + dummy_log->base = 0; + dummy_log->size = 1024; + rc = lmLogInit(dummy_log); + if (rc) { + kfree(dummy_log); + dummy_log = NULL; + mutex_unlock(&jfs_log_mutex); + return rc; + } + } + + LOG_LOCK(dummy_log); + list_add(&JFS_SBI(sb)->log_list, &dummy_log->sb_list); + JFS_SBI(sb)->log = dummy_log; + LOG_UNLOCK(dummy_log); + mutex_unlock(&jfs_log_mutex); + + return 0; +} + +/* + * NAME: lmLogInit() + * + * FUNCTION: log initialization at first log open. + * + * logredo() (or logformat()) should have been run previously. + * initialize the log from log superblock. + * set the log state in the superblock to LOGMOUNT and + * write SYNCPT log record. + * + * PARAMETER: log - log structure + * + * RETURN: 0 - if ok + * -EINVAL - bad log magic number or superblock dirty + * error returned from logwait() + * + * serialization: single first open thread + */ +int lmLogInit(struct jfs_log * log) +{ + int rc = 0; + struct lrd lrd; + struct logsuper *logsuper; + struct lbuf *bpsuper; + struct lbuf *bp; + struct logpage *lp; + int lsn = 0; + + jfs_info("lmLogInit: log:0x%p", log); + + /* initialize the group commit serialization lock */ + LOGGC_LOCK_INIT(log); + + /* allocate/initialize the log write serialization lock */ + LOG_LOCK_INIT(log); + + LOGSYNC_LOCK_INIT(log); + + INIT_LIST_HEAD(&log->synclist); + + INIT_LIST_HEAD(&log->cqueue); + log->flush_tblk = NULL; + + log->count = 0; + + /* + * initialize log i/o + */ + if ((rc = lbmLogInit(log))) + return rc; + + if (!test_bit(log_INLINELOG, &log->flag)) + log->l2bsize = L2LOGPSIZE; + + /* check for disabled journaling to disk */ + if (log->no_integrity) { + /* + * Journal pages will still be filled. When the time comes + * to actually do the I/O, the write is not done, and the + * endio routine is called directly. + */ + bp = lbmAllocate(log , 0); + log->bp = bp; + bp->l_pn = bp->l_eor = 0; + } else { + /* + * validate log superblock + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + goto errout10; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + + if (logsuper->magic != cpu_to_le32(LOGMAGIC)) { + jfs_warn("*** Log Format Error ! ***"); + rc = -EINVAL; + goto errout20; + } + + /* logredo() should have been run successfully. */ + if (logsuper->state != cpu_to_le32(LOGREDONE)) { + jfs_warn("*** Log Is Dirty ! ***"); + rc = -EINVAL; + goto errout20; + } + + /* initialize log from log superblock */ + if (test_bit(log_INLINELOG,&log->flag)) { + if (log->size != le32_to_cpu(logsuper->size)) { + rc = -EINVAL; + goto errout20; + } + jfs_info("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x", + log, (unsigned long long)log->base, log->size); + } else { + if (memcmp(logsuper->uuid, log->uuid, 16)) { + jfs_warn("wrong uuid on JFS log device"); + rc = -EINVAL; + goto errout20; + } + log->size = le32_to_cpu(logsuper->size); + log->l2bsize = le32_to_cpu(logsuper->l2bsize); + jfs_info("lmLogInit: external log:0x%p base:0x%Lx size:0x%x", + log, (unsigned long long)log->base, log->size); + } + + log->page = le32_to_cpu(logsuper->end) / LOGPSIZE; + log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page); + + /* + * initialize for log append write mode + */ + /* establish current/end-of-log page/buffer */ + if ((rc = lbmRead(log, log->page, &bp))) + goto errout20; + + lp = (struct logpage *) bp->l_ldata; + + jfs_info("lmLogInit: lsn:0x%x page:%d eor:%d:%d", + le32_to_cpu(logsuper->end), log->page, log->eor, + le16_to_cpu(lp->h.eor)); + + log->bp = bp; + bp->l_pn = log->page; + bp->l_eor = log->eor; + + /* if current page is full, move on to next page */ + if (log->eor >= LOGPSIZE - LOGPTLRSIZE) + lmNextPage(log); + + /* + * initialize log syncpoint + */ + /* + * write the first SYNCPT record with syncpoint = 0 + * (i.e., log redo up to HERE !); + * remove current page from lbm write queue at end of pageout + * (to write log superblock update), but do not release to + * freelist; + */ + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = 0; + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + bp = log->bp; + bp->l_ceor = bp->l_eor; + lp = (struct logpage *) bp->l_ldata; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0); + if ((rc = lbmIOWait(bp, 0))) + goto errout30; + + /* + * update/write superblock + */ + logsuper->state = cpu_to_le32(LOGMOUNT); + log->serial = le32_to_cpu(logsuper->serial) + 1; + logsuper->serial = cpu_to_le32(log->serial); + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + if ((rc = lbmIOWait(bpsuper, lbmFREE))) + goto errout30; + } + + /* initialize logsync parameters */ + log->logsize = (log->size - 2) << L2LOGPSIZE; + log->lsn = lsn; + log->syncpt = lsn; + log->sync = log->syncpt; + log->nextsync = LOGSYNC_DELTA(log->logsize); + + jfs_info("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x", + log->lsn, log->syncpt, log->sync); + + /* + * initialize for lazy/group commit + */ + log->clsn = lsn; + + return 0; + + /* + * unwind on error + */ + errout30: /* release log page */ + log->wqueue = NULL; + bp->l_wqnext = NULL; + lbmFree(bp); + + errout20: /* release log superblock */ + lbmFree(bpsuper); + + errout10: /* unwind lbmLogInit() */ + lbmLogShutdown(log); + + jfs_warn("lmLogInit: exit(%d)", rc); + return rc; +} + + +/* + * NAME: lmLogClose() + * + * FUNCTION: remove file system <ipmnt> from active list of log <iplog> + * and close it on last close. + * + * PARAMETER: sb - superblock + * + * RETURN: errors from subroutines + * + * serialization: + */ +int lmLogClose(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + struct block_device *bdev; + int rc = 0; + + jfs_info("lmLogClose: log:0x%p", log); + + mutex_lock(&jfs_log_mutex); + LOG_LOCK(log); + list_del(&sbi->log_list); + LOG_UNLOCK(log); + sbi->log = NULL; + + /* + * We need to make sure all of the "written" metapages + * actually make it to disk + */ + sync_blockdev(sb->s_bdev); + + if (test_bit(log_INLINELOG, &log->flag)) { + /* + * in-line log in host file system + */ + rc = lmLogShutdown(log); + kfree(log); + goto out; + } + + if (!log->no_integrity) + lmLogFileSystem(log, sbi, 0); + + if (!list_empty(&log->sb_list)) + goto out; + + /* + * TODO: ensure that the dummy_log is in a state to allow + * lbmLogShutdown to deallocate all the buffers and call + * kfree against dummy_log. For now, leave dummy_log & its + * buffers in memory, and resuse if another no-integrity mount + * is requested. + */ + if (log->no_integrity) + goto out; + + /* + * external log as separate logical volume + */ + list_del(&log->journal_list); + bdev = log->bdev; + rc = lmLogShutdown(log); + + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + + kfree(log); + + out: + mutex_unlock(&jfs_log_mutex); + jfs_info("lmLogClose: exit(%d)", rc); + return rc; +} + + +/* + * NAME: jfs_flush_journal() + * + * FUNCTION: initiate write of any outstanding transactions to the journal + * and optionally wait until they are all written to disk + * + * wait == 0 flush until latest txn is committed, don't wait + * wait == 1 flush until latest txn is committed, wait + * wait > 1 flush until all txn's are complete, wait + */ +void jfs_flush_journal(struct jfs_log *log, int wait) +{ + int i; + struct tblock *target = NULL; + + /* jfs_write_inode may call us during read-only mount */ + if (!log) + return; + + jfs_info("jfs_flush_journal: log:0x%p wait=%d", log, wait); + + LOGGC_LOCK(log); + + if (!list_empty(&log->cqueue)) { + /* + * This ensures that we will keep writing to the journal as long + * as there are unwritten commit records + */ + target = list_entry(log->cqueue.prev, struct tblock, cqueue); + + if (test_bit(log_FLUSH, &log->flag)) { + /* + * We're already flushing. + * if flush_tblk is NULL, we are flushing everything, + * so leave it that way. Otherwise, update it to the + * latest transaction + */ + if (log->flush_tblk) + log->flush_tblk = target; + } else { + /* Only flush until latest transaction is committed */ + log->flush_tblk = target; + set_bit(log_FLUSH, &log->flag); + + /* + * Initiate I/O on outstanding transactions + */ + if (!(log->cflag & logGC_PAGEOUT)) { + log->cflag |= logGC_PAGEOUT; + lmGCwrite(log, 0); + } + } + } + if ((wait > 1) || test_bit(log_SYNCBARRIER, &log->flag)) { + /* Flush until all activity complete */ + set_bit(log_FLUSH, &log->flag); + log->flush_tblk = NULL; + } + + if (wait && target && !(target->flag & tblkGC_COMMITTED)) { + DECLARE_WAITQUEUE(__wait, current); + + add_wait_queue(&target->gcwait, &__wait); + set_current_state(TASK_UNINTERRUPTIBLE); + LOGGC_UNLOCK(log); + schedule(); + LOGGC_LOCK(log); + remove_wait_queue(&target->gcwait, &__wait); + } + LOGGC_UNLOCK(log); + + if (wait < 2) + return; + + write_special_inodes(log, filemap_fdatawrite); + + /* + * If there was recent activity, we may need to wait + * for the lazycommit thread to catch up + */ + if ((!list_empty(&log->cqueue)) || !list_empty(&log->synclist)) { + for (i = 0; i < 200; i++) { /* Too much? */ + msleep(250); + write_special_inodes(log, filemap_fdatawrite); + if (list_empty(&log->cqueue) && + list_empty(&log->synclist)) + break; + } + } + assert(list_empty(&log->cqueue)); + +#ifdef CONFIG_JFS_DEBUG + if (!list_empty(&log->synclist)) { + struct logsyncblk *lp; + + printk(KERN_ERR "jfs_flush_journal: synclist not empty\n"); + list_for_each_entry(lp, &log->synclist, synclist) { + if (lp->xflag & COMMIT_PAGE) { + struct metapage *mp = (struct metapage *)lp; + print_hex_dump(KERN_ERR, "metapage: ", + DUMP_PREFIX_ADDRESS, 16, 4, + mp, sizeof(struct metapage), 0); + print_hex_dump(KERN_ERR, "page: ", + DUMP_PREFIX_ADDRESS, 16, + sizeof(long), mp->page, + sizeof(struct page), 0); + } else + print_hex_dump(KERN_ERR, "tblock:", + DUMP_PREFIX_ADDRESS, 16, 4, + lp, sizeof(struct tblock), 0); + } + } +#else + WARN_ON(!list_empty(&log->synclist)); +#endif + clear_bit(log_FLUSH, &log->flag); +} + +/* + * NAME: lmLogShutdown() + * + * FUNCTION: log shutdown at last LogClose(). + * + * write log syncpt record. + * update super block to set redone flag to 0. + * + * PARAMETER: log - log inode + * + * RETURN: 0 - success + * + * serialization: single last close thread + */ +int lmLogShutdown(struct jfs_log * log) +{ + int rc; + struct lrd lrd; + int lsn; + struct logsuper *logsuper; + struct lbuf *bpsuper; + struct lbuf *bp; + struct logpage *lp; + + jfs_info("lmLogShutdown: log:0x%p", log); + + jfs_flush_journal(log, 2); + + /* + * write the last SYNCPT record with syncpoint = 0 + * (i.e., log redo up to HERE !) + */ + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = 0; + + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + bp = log->bp; + lp = (struct logpage *) bp->l_ldata; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0); + lbmIOWait(log->bp, lbmFREE); + log->bp = NULL; + + /* + * synchronous update log superblock + * mark log state as shutdown cleanly + * (i.e., Log does not need to be replayed). + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + goto out; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + logsuper->state = cpu_to_le32(LOGREDONE); + logsuper->end = cpu_to_le32(lsn); + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + rc = lbmIOWait(bpsuper, lbmFREE); + + jfs_info("lmLogShutdown: lsn:0x%x page:%d eor:%d", + lsn, log->page, log->eor); + + out: + /* + * shutdown per log i/o + */ + lbmLogShutdown(log); + + if (rc) { + jfs_warn("lmLogShutdown: exit(%d)", rc); + } + return rc; +} + + +/* + * NAME: lmLogFileSystem() + * + * FUNCTION: insert (<activate> = true)/remove (<activate> = false) + * file system into/from log active file system list. + * + * PARAMETE: log - pointer to logs inode. + * fsdev - kdev_t of filesystem. + * serial - pointer to returned log serial number + * activate - insert/remove device from active list. + * + * RETURN: 0 - success + * errors returned by vms_iowait(). + */ +static int lmLogFileSystem(struct jfs_log * log, struct jfs_sb_info *sbi, + int activate) +{ + int rc = 0; + int i; + struct logsuper *logsuper; + struct lbuf *bpsuper; + char *uuid = sbi->uuid; + + /* + * insert/remove file system device to log active file system list. + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + return rc; + + logsuper = (struct logsuper *) bpsuper->l_ldata; + if (activate) { + for (i = 0; i < MAX_ACTIVE; i++) + if (!memcmp(logsuper->active[i].uuid, NULL_UUID, 16)) { + memcpy(logsuper->active[i].uuid, uuid, 16); + sbi->aggregate = i; + break; + } + if (i == MAX_ACTIVE) { + jfs_warn("Too many file systems sharing journal!"); + lbmFree(bpsuper); + return -EMFILE; /* Is there a better rc? */ + } + } else { + for (i = 0; i < MAX_ACTIVE; i++) + if (!memcmp(logsuper->active[i].uuid, uuid, 16)) { + memcpy(logsuper->active[i].uuid, NULL_UUID, 16); + break; + } + if (i == MAX_ACTIVE) { + jfs_warn("Somebody stomped on the journal!"); + lbmFree(bpsuper); + return -EIO; + } + + } + + /* + * synchronous write log superblock: + * + * write sidestream bypassing write queue: + * at file system mount, log super block is updated for + * activation of the file system before any log record + * (MOUNT record) of the file system, and at file system + * unmount, all meta data for the file system has been + * flushed before log super block is updated for deactivation + * of the file system. + */ + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + rc = lbmIOWait(bpsuper, lbmFREE); + + return rc; +} + +/* + * log buffer manager (lbm) + * ------------------------ + * + * special purpose buffer manager supporting log i/o requirements. + * + * per log write queue: + * log pageout occurs in serial order by fifo write queue and + * restricting to a single i/o in pregress at any one time. + * a circular singly-linked list + * (log->wrqueue points to the tail, and buffers are linked via + * bp->wrqueue field), and + * maintains log page in pageout ot waiting for pageout in serial pageout. + */ + +/* + * lbmLogInit() + * + * initialize per log I/O setup at lmLogInit() + */ +static int lbmLogInit(struct jfs_log * log) +{ /* log inode */ + int i; + struct lbuf *lbuf; + + jfs_info("lbmLogInit: log:0x%p", log); + + /* initialize current buffer cursor */ + log->bp = NULL; + + /* initialize log device write queue */ + log->wqueue = NULL; + + /* + * Each log has its own buffer pages allocated to it. These are + * not managed by the page cache. This ensures that a transaction + * writing to the log does not block trying to allocate a page from + * the page cache (for the log). This would be bad, since page + * allocation waits on the kswapd thread that may be committing inodes + * which would cause log activity. Was that clear? I'm trying to + * avoid deadlock here. + */ + init_waitqueue_head(&log->free_wait); + + log->lbuf_free = NULL; + + for (i = 0; i < LOGPAGES;) { + char *buffer; + uint offset; + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); + + if (!page) + goto error; + buffer = page_address(page); + for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) { + lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL); + if (lbuf == NULL) { + if (offset == 0) + __free_page(page); + goto error; + } + if (offset) /* we already have one reference */ + get_page(page); + lbuf->l_offset = offset; + lbuf->l_ldata = buffer + offset; + lbuf->l_page = page; + lbuf->l_log = log; + init_waitqueue_head(&lbuf->l_ioevent); + + lbuf->l_freelist = log->lbuf_free; + log->lbuf_free = lbuf; + i++; + } + } + + return (0); + + error: + lbmLogShutdown(log); + return -ENOMEM; +} + + +/* + * lbmLogShutdown() + * + * finalize per log I/O setup at lmLogShutdown() + */ +static void lbmLogShutdown(struct jfs_log * log) +{ + struct lbuf *lbuf; + + jfs_info("lbmLogShutdown: log:0x%p", log); + + lbuf = log->lbuf_free; + while (lbuf) { + struct lbuf *next = lbuf->l_freelist; + __free_page(lbuf->l_page); + kfree(lbuf); + lbuf = next; + } +} + + +/* + * lbmAllocate() + * + * allocate an empty log buffer + */ +static struct lbuf *lbmAllocate(struct jfs_log * log, int pn) +{ + struct lbuf *bp; + unsigned long flags; + + /* + * recycle from log buffer freelist if any + */ + LCACHE_LOCK(flags); + LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags); + log->lbuf_free = bp->l_freelist; + LCACHE_UNLOCK(flags); + + bp->l_flag = 0; + + bp->l_wqnext = NULL; + bp->l_freelist = NULL; + + bp->l_pn = pn; + bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize)); + bp->l_ceor = 0; + + return bp; +} + + +/* + * lbmFree() + * + * release a log buffer to freelist + */ +static void lbmFree(struct lbuf * bp) +{ + unsigned long flags; + + LCACHE_LOCK(flags); + + lbmfree(bp); + + LCACHE_UNLOCK(flags); +} + +static void lbmfree(struct lbuf * bp) +{ + struct jfs_log *log = bp->l_log; + + assert(bp->l_wqnext == NULL); + + /* + * return the buffer to head of freelist + */ + bp->l_freelist = log->lbuf_free; + log->lbuf_free = bp; + + wake_up(&log->free_wait); + return; +} + + +/* + * NAME: lbmRedrive + * + * FUNCTION: add a log buffer to the log redrive list + * + * PARAMETER: + * bp - log buffer + * + * NOTES: + * Takes log_redrive_lock. + */ +static inline void lbmRedrive(struct lbuf *bp) +{ + unsigned long flags; + + spin_lock_irqsave(&log_redrive_lock, flags); + bp->l_redrive_next = log_redrive_list; + log_redrive_list = bp; + spin_unlock_irqrestore(&log_redrive_lock, flags); + + wake_up_process(jfsIOthread); +} + + +/* + * lbmRead() + */ +static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) +{ + struct bio *bio; + struct lbuf *bp; + + /* + * allocate a log buffer + */ + *bpp = bp = lbmAllocate(log, pn); + jfs_info("lbmRead: bp:0x%p pn:0x%x", bp, pn); + + bp->l_flag |= lbmREAD; + + bio = bio_alloc(GFP_NOFS, 1); + + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio_set_dev(bio, log->bdev); + + bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); + BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); + + bio->bi_end_io = lbmIODone; + bio->bi_private = bp; + bio->bi_opf = REQ_OP_READ; + /*check if journaling to disk has been disabled*/ + if (log->no_integrity) { + bio->bi_iter.bi_size = 0; + lbmIODone(bio); + } else { + submit_bio(bio); + } + + wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); + + return 0; +} + + +/* + * lbmWrite() + * + * buffer at head of pageout queue stays after completion of + * partial-page pageout and redriven by explicit initiation of + * pageout by caller until full-page pageout is completed and + * released. + * + * device driver i/o done redrives pageout of new buffer at + * head of pageout queue when current buffer at head of pageout + * queue is released at the completion of its full-page pageout. + * + * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit(). + * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone() + */ +static void lbmWrite(struct jfs_log * log, struct lbuf * bp, int flag, + int cant_block) +{ + struct lbuf *tail; + unsigned long flags; + + jfs_info("lbmWrite: bp:0x%p flag:0x%x pn:0x%x", bp, flag, bp->l_pn); + + /* map the logical block address to physical block address */ + bp->l_blkno = + log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); + + LCACHE_LOCK(flags); /* disable+lock */ + + /* + * initialize buffer for device driver + */ + bp->l_flag = flag; + + /* + * insert bp at tail of write queue associated with log + * + * (request is either for bp already/currently at head of queue + * or new bp to be inserted at tail) + */ + tail = log->wqueue; + + /* is buffer not already on write queue ? */ + if (bp->l_wqnext == NULL) { + /* insert at tail of wqueue */ + if (tail == NULL) { + log->wqueue = bp; + bp->l_wqnext = bp; + } else { + log->wqueue = bp; + bp->l_wqnext = tail->l_wqnext; + tail->l_wqnext = bp; + } + + tail = bp; + } + + /* is buffer at head of wqueue and for write ? */ + if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) { + LCACHE_UNLOCK(flags); /* unlock+enable */ + return; + } + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + if (cant_block) + lbmRedrive(bp); + else if (flag & lbmSYNC) + lbmStartIO(bp); + else { + LOGGC_UNLOCK(log); + lbmStartIO(bp); + LOGGC_LOCK(log); + } +} + + +/* + * lbmDirectWrite() + * + * initiate pageout bypassing write queue for sidestream + * (e.g., log superblock) write; + */ +static void lbmDirectWrite(struct jfs_log * log, struct lbuf * bp, int flag) +{ + jfs_info("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x", + bp, flag, bp->l_pn); + + /* + * initialize buffer for device driver + */ + bp->l_flag = flag | lbmDIRECT; + + /* map the logical block address to physical block address */ + bp->l_blkno = + log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); + + /* + * initiate pageout of the page + */ + lbmStartIO(bp); +} + + +/* + * NAME: lbmStartIO() + * + * FUNCTION: Interface to DD strategy routine + * + * RETURN: none + * + * serialization: LCACHE_LOCK() is NOT held during log i/o; + */ +static void lbmStartIO(struct lbuf * bp) +{ + struct bio *bio; + struct jfs_log *log = bp->l_log; + + jfs_info("lbmStartIO"); + + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); + bio_set_dev(bio, log->bdev); + + bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); + BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); + + bio->bi_end_io = lbmIODone; + bio->bi_private = bp; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; + + /* check if journaling to disk has been disabled */ + if (log->no_integrity) { + bio->bi_iter.bi_size = 0; + lbmIODone(bio); + } else { + submit_bio(bio); + INCREMENT(lmStat.submitted); + } +} + + +/* + * lbmIOWait() + */ +static int lbmIOWait(struct lbuf * bp, int flag) +{ + unsigned long flags; + int rc = 0; + + jfs_info("lbmIOWait1: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); + + LCACHE_LOCK(flags); /* disable+lock */ + + LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags); + + rc = (bp->l_flag & lbmERROR) ? -EIO : 0; + + if (flag & lbmFREE) + lbmfree(bp); + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + jfs_info("lbmIOWait2: bp:0x%p flag:0x%x:0x%x", bp, bp->l_flag, flag); + return rc; +} + +/* + * lbmIODone() + * + * executed at INTIODONE level + */ +static void lbmIODone(struct bio *bio) +{ + struct lbuf *bp = bio->bi_private; + struct lbuf *nextbp, *tail; + struct jfs_log *log; + unsigned long flags; + + /* + * get back jfs buffer bound to the i/o buffer + */ + jfs_info("lbmIODone: bp:0x%p flag:0x%x", bp, bp->l_flag); + + LCACHE_LOCK(flags); /* disable+lock */ + + bp->l_flag |= lbmDONE; + + if (bio->bi_status) { + bp->l_flag |= lbmERROR; + + jfs_err("lbmIODone: I/O error in JFS log"); + } + + bio_put(bio); + + /* + * pagein completion + */ + if (bp->l_flag & lbmREAD) { + bp->l_flag &= ~lbmREAD; + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + /* wakeup I/O initiator */ + LCACHE_WAKEUP(&bp->l_ioevent); + + return; + } + + /* + * pageout completion + * + * the bp at the head of write queue has completed pageout. + * + * if single-commit/full-page pageout, remove the current buffer + * from head of pageout queue, and redrive pageout with + * the new buffer at head of pageout queue; + * otherwise, the partial-page pageout buffer stays at + * the head of pageout queue to be redriven for pageout + * by lmGroupCommit() until full-page pageout is completed. + */ + bp->l_flag &= ~lbmWRITE; + INCREMENT(lmStat.pagedone); + + /* update committed lsn */ + log = bp->l_log; + log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor; + + if (bp->l_flag & lbmDIRECT) { + LCACHE_WAKEUP(&bp->l_ioevent); + LCACHE_UNLOCK(flags); + return; + } + + tail = log->wqueue; + + /* single element queue */ + if (bp == tail) { + /* remove head buffer of full-page pageout + * from log device write queue + */ + if (bp->l_flag & lbmRELEASE) { + log->wqueue = NULL; + bp->l_wqnext = NULL; + } + } + /* multi element queue */ + else { + /* remove head buffer of full-page pageout + * from log device write queue + */ + if (bp->l_flag & lbmRELEASE) { + nextbp = tail->l_wqnext = bp->l_wqnext; + bp->l_wqnext = NULL; + + /* + * redrive pageout of next page at head of write queue: + * redrive next page without any bound tblk + * (i.e., page w/o any COMMIT records), or + * first page of new group commit which has been + * queued after current page (subsequent pageout + * is performed synchronously, except page without + * any COMMITs) by lmGroupCommit() as indicated + * by lbmWRITE flag; + */ + if (nextbp->l_flag & lbmWRITE) { + /* + * We can't do the I/O at interrupt time. + * The jfsIO thread can do it + */ + lbmRedrive(nextbp); + } + } + } + + /* + * synchronous pageout: + * + * buffer has not necessarily been removed from write queue + * (e.g., synchronous write of partial-page with COMMIT): + * leave buffer for i/o initiator to dispose + */ + if (bp->l_flag & lbmSYNC) { + LCACHE_UNLOCK(flags); /* unlock+enable */ + + /* wakeup I/O initiator */ + LCACHE_WAKEUP(&bp->l_ioevent); + } + + /* + * Group Commit pageout: + */ + else if (bp->l_flag & lbmGC) { + LCACHE_UNLOCK(flags); + lmPostGC(bp); + } + + /* + * asynchronous pageout: + * + * buffer must have been removed from write queue: + * insert buffer at head of freelist where it can be recycled + */ + else { + assert(bp->l_flag & lbmRELEASE); + assert(bp->l_flag & lbmFREE); + lbmfree(bp); + + LCACHE_UNLOCK(flags); /* unlock+enable */ + } +} + +int jfsIOWait(void *arg) +{ + struct lbuf *bp; + + do { + spin_lock_irq(&log_redrive_lock); + while ((bp = log_redrive_list)) { + log_redrive_list = bp->l_redrive_next; + bp->l_redrive_next = NULL; + spin_unlock_irq(&log_redrive_lock); + lbmStartIO(bp); + spin_lock_irq(&log_redrive_lock); + } + + if (freezing(current)) { + spin_unlock_irq(&log_redrive_lock); + try_to_freeze(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&log_redrive_lock); + schedule(); + } + } while (!kthread_should_stop()); + + jfs_info("jfsIOWait being killed!"); + return 0; +} + +/* + * NAME: lmLogFormat()/jfs_logform() + * + * FUNCTION: format file system log + * + * PARAMETERS: + * log - volume log + * logAddress - start address of log space in FS block + * logSize - length of log space in FS block; + * + * RETURN: 0 - success + * -EIO - i/o error + * + * XXX: We're synchronously writing one page at a time. This needs to + * be improved by writing multiple pages at once. + */ +int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize) +{ + int rc = -EIO; + struct jfs_sb_info *sbi; + struct logsuper *logsuper; + struct logpage *lp; + int lspn; /* log sequence page number */ + struct lrd *lrd_ptr; + int npages = 0; + struct lbuf *bp; + + jfs_info("lmLogFormat: logAddress:%Ld logSize:%d", + (long long)logAddress, logSize); + + sbi = list_entry(log->sb_list.next, struct jfs_sb_info, log_list); + + /* allocate a log buffer */ + bp = lbmAllocate(log, 1); + + npages = logSize >> sbi->l2nbperpage; + + /* + * log space: + * + * page 0 - reserved; + * page 1 - log superblock; + * page 2 - log data page: A SYNC log record is written + * into this page at logform time; + * pages 3-N - log data page: set to empty log data pages; + */ + /* + * init log superblock: log page 1 + */ + logsuper = (struct logsuper *) bp->l_ldata; + + logsuper->magic = cpu_to_le32(LOGMAGIC); + logsuper->version = cpu_to_le32(LOGVERSION); + logsuper->state = cpu_to_le32(LOGREDONE); + logsuper->flag = cpu_to_le32(sbi->mntflag); /* ? */ + logsuper->size = cpu_to_le32(npages); + logsuper->bsize = cpu_to_le32(sbi->bsize); + logsuper->l2bsize = cpu_to_le32(sbi->l2bsize); + logsuper->end = cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE); + + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + bp->l_blkno = logAddress + sbi->nbperpage; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + + /* + * init pages 2 to npages-1 as log data pages: + * + * log page sequence number (lpsn) initialization: + * + * pn: 0 1 2 3 n-1 + * +-----+-----+=====+=====+===.....===+=====+ + * lspn: N-1 0 1 N-2 + * <--- N page circular file ----> + * + * the N (= npages-2) data pages of the log is maintained as + * a circular file for the log records; + * lpsn grows by 1 monotonically as each log page is written + * to the circular file of the log; + * and setLogpage() will not reset the page number even if + * the eor is equal to LOGPHDRSIZE. In order for binary search + * still work in find log end process, we have to simulate the + * log wrap situation at the log format time. + * The 1st log page written will have the highest lpsn. Then + * the succeeding log pages will have ascending order of + * the lspn starting from 0, ... (N-2) + */ + lp = (struct logpage *) bp->l_ldata; + /* + * initialize 1st log page to be written: lpsn = N - 1, + * write a SYNCPT log record is written to this page + */ + lp->h.page = lp->t.page = cpu_to_le32(npages - 3); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE); + + lrd_ptr = (struct lrd *) &lp->data; + lrd_ptr->logtid = 0; + lrd_ptr->backchain = 0; + lrd_ptr->type = cpu_to_le16(LOG_SYNCPT); + lrd_ptr->length = 0; + lrd_ptr->log.syncpt.sync = 0; + + bp->l_blkno += sbi->nbperpage; + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + + /* + * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) + */ + for (lspn = 0; lspn < npages - 3; lspn++) { + lp->h.page = lp->t.page = cpu_to_le32(lspn); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); + + bp->l_blkno += sbi->nbperpage; + bp->l_flag = lbmWRITE | lbmSYNC | lbmDIRECT; + lbmStartIO(bp); + if ((rc = lbmIOWait(bp, 0))) + goto exit; + } + + rc = 0; +exit: + /* + * finalize log + */ + /* release the buffer */ + lbmFree(bp); + + return rc; +} + +#ifdef CONFIG_JFS_STATISTICS +int jfs_lmstats_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Logmgr stats\n" + "================\n" + "commits = %d\n" + "writes submitted = %d\n" + "writes completed = %d\n" + "full pages submitted = %d\n" + "partial pages submitted = %d\n", + lmStat.commit, + lmStat.submitted, + lmStat.pagedone, + lmStat.full_page, + lmStat.partial_page); + return 0; +} +#endif /* CONFIG_JFS_STATISTICS */ diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h new file mode 100644 index 000000000..e38c21598 --- /dev/null +++ b/fs/jfs/jfs_logmgr.h @@ -0,0 +1,513 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_LOGMGR +#define _H_JFS_LOGMGR + +#include "jfs_filsys.h" +#include "jfs_lock.h" + +/* + * log manager configuration parameters + */ + +/* log page size */ +#define LOGPSIZE 4096 +#define L2LOGPSIZE 12 + +#define LOGPAGES 16 /* Log pages per mounted file system */ + +/* + * log logical volume + * + * a log is used to make the commit operation on journalled + * files within the same logical volume group atomic. + * a log is implemented with a logical volume. + * there is one log per logical volume group. + * + * block 0 of the log logical volume is not used (ipl etc). + * block 1 contains a log "superblock" and is used by logFormat(), + * lmLogInit(), lmLogShutdown(), and logRedo() to record status + * of the log but is not otherwise used during normal processing. + * blocks 2 - (N-1) are used to contain log records. + * + * when a volume group is varied-on-line, logRedo() must have + * been executed before the file systems (logical volumes) in + * the volume group can be mounted. + */ +/* + * log superblock (block 1 of logical volume) + */ +#define LOGSUPER_B 1 +#define LOGSTART_B 2 + +#define LOGMAGIC 0x87654321 +#define LOGVERSION 1 + +#define MAX_ACTIVE 128 /* Max active file systems sharing log */ + +struct logsuper { + __le32 magic; /* 4: log lv identifier */ + __le32 version; /* 4: version number */ + __le32 serial; /* 4: log open/mount counter */ + __le32 size; /* 4: size in number of LOGPSIZE blocks */ + __le32 bsize; /* 4: logical block size in byte */ + __le32 l2bsize; /* 4: log2 of bsize */ + + __le32 flag; /* 4: option */ + __le32 state; /* 4: state - see below */ + + __le32 end; /* 4: addr of last log record set by logredo */ + char uuid[16]; /* 16: 128-bit journal uuid */ + char label[16]; /* 16: journal label */ + struct { + char uuid[16]; + } active[MAX_ACTIVE]; /* 2048: active file systems list */ +}; + +#define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" + +/* log flag: commit option (see jfs_filsys.h) */ + +/* log state */ +#define LOGMOUNT 0 /* log mounted by lmLogInit() */ +#define LOGREDONE 1 /* log shutdown by lmLogShutdown(). + * log redo completed by logredo(). + */ +#define LOGWRAP 2 /* log wrapped */ +#define LOGREADERR 3 /* log read error detected in logredo() */ + + +/* + * log logical page + * + * (this comment should be rewritten !) + * the header and trailer structures (h,t) will normally have + * the same page and eor value. + * An exception to this occurs when a complete page write is not + * accomplished on a power failure. Since the hardware may "split write" + * sectors in the page, any out of order sequence may occur during powerfail + * and needs to be recognized during log replay. The xor value is + * an "exclusive or" of all log words in the page up to eor. This + * 32 bit eor is stored with the top 16 bits in the header and the + * bottom 16 bits in the trailer. logredo can easily recognize pages + * that were not completed by reconstructing this eor and checking + * the log page. + * + * Previous versions of the operating system did not allow split + * writes and detected partially written records in logredo by + * ordering the updates to the header, trailer, and the move of data + * into the logdata area. The order: (1) data is moved (2) header + * is updated (3) trailer is updated. In logredo, when the header + * differed from the trailer, the header and trailer were reconciled + * as follows: if h.page != t.page they were set to the smaller of + * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) + * h.eor != t.eor they were set to the smaller of their two values. + */ +struct logpage { + struct { /* header */ + __le32 page; /* 4: log sequence page number */ + __le16 rsrvd; /* 2: */ + __le16 eor; /* 2: end-of-log offset of lasrt record write */ + } h; + + __le32 data[LOGPSIZE / 4 - 4]; /* log record area */ + + struct { /* trailer */ + __le32 page; /* 4: normally the same as h.page */ + __le16 rsrvd; /* 2: */ + __le16 eor; /* 2: normally the same as h.eor */ + } t; +}; + +#define LOGPHDRSIZE 8 /* log page header size */ +#define LOGPTLRSIZE 8 /* log page trailer size */ + + +/* + * log record + * + * (this comment should be rewritten !) + * jfs uses only "after" log records (only a single writer is allowed + * in a page, pages are written to temporary paging space if + * if they must be written to disk before commit, and i/o is + * scheduled for modified pages to their home location after + * the log records containing the after values and the commit + * record is written to the log on disk, undo discards the copy + * in main-memory.) + * + * a log record consists of a data area of variable length followed by + * a descriptor of fixed size LOGRDSIZE bytes. + * the data area is rounded up to an integral number of 4-bytes and + * must be no longer than LOGPSIZE. + * the descriptor is of size of multiple of 4-bytes and aligned on a + * 4-byte boundary. + * records are packed one after the other in the data area of log pages. + * (sometimes a DUMMY record is inserted so that at least one record ends + * on every page or the longest record is placed on at most two pages). + * the field eor in page header/trailer points to the byte following + * the last record on a page. + */ + +/* log record types */ +#define LOG_COMMIT 0x8000 +#define LOG_SYNCPT 0x4000 +#define LOG_MOUNT 0x2000 +#define LOG_REDOPAGE 0x0800 +#define LOG_NOREDOPAGE 0x0080 +#define LOG_NOREDOINOEXT 0x0040 +#define LOG_UPDATEMAP 0x0008 +#define LOG_NOREDOFILE 0x0001 + +/* REDOPAGE/NOREDOPAGE log record data type */ +#define LOG_INODE 0x0001 +#define LOG_XTREE 0x0002 +#define LOG_DTREE 0x0004 +#define LOG_BTROOT 0x0010 +#define LOG_EA 0x0020 +#define LOG_ACL 0x0040 +#define LOG_DATA 0x0080 +#define LOG_NEW 0x0100 +#define LOG_EXTEND 0x0200 +#define LOG_RELOCATE 0x0400 +#define LOG_DIR_XTREE 0x0800 /* Xtree is in directory inode */ + +/* UPDATEMAP log record descriptor type */ +#define LOG_ALLOCXADLIST 0x0080 +#define LOG_ALLOCPXDLIST 0x0040 +#define LOG_ALLOCXAD 0x0020 +#define LOG_ALLOCPXD 0x0010 +#define LOG_FREEXADLIST 0x0008 +#define LOG_FREEPXDLIST 0x0004 +#define LOG_FREEXAD 0x0002 +#define LOG_FREEPXD 0x0001 + + +struct lrd { + /* + * type independent area + */ + __le32 logtid; /* 4: log transaction identifier */ + __le32 backchain; /* 4: ptr to prev record of same transaction */ + __le16 type; /* 2: record type */ + __le16 length; /* 2: length of data in record (in byte) */ + __le32 aggregate; /* 4: file system lv/aggregate */ + /* (16) */ + + /* + * type dependent area (20) + */ + union { + + /* + * COMMIT: commit + * + * transaction commit: no type-dependent information; + */ + + /* + * REDOPAGE: after-image + * + * apply after-image; + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: REDOPAGE record type */ + __le16 l2linesize; /* 2: log2 of line size */ + pxd_t pxd; /* 8: on-disk page pxd */ + } redopage; /* (20) */ + + /* + * NOREDOPAGE: the page is freed + * + * do not apply after-image records which precede this record + * in the log with the same page block number to this page. + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: NOREDOPAGE record type */ + __le16 rsrvd; /* 2: reserved */ + pxd_t pxd; /* 8: on-disk page pxd */ + } noredopage; /* (20) */ + + /* + * UPDATEMAP: update block allocation map + * + * either in-line PXD, + * or out-of-line XADLIST; + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le16 type; /* 2: UPDATEMAP record type */ + __le16 nxd; /* 2: number of extents */ + pxd_t pxd; /* 8: pxd */ + } updatemap; /* (20) */ + + /* + * NOREDOINOEXT: the inode extent is freed + * + * do not apply after-image records which precede this + * record in the log with the any of the 4 page block + * numbers in this inode extent. + * + * NOTE: The fileset and pxd fields MUST remain in + * the same fields in the REDOPAGE record format. + * + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 iagnum; /* 4: IAG number */ + __le32 inoext_idx; /* 4: inode extent index */ + pxd_t pxd; /* 8: on-disk page pxd */ + } noredoinoext; /* (20) */ + + /* + * SYNCPT: log sync point + * + * replay log up to syncpt address specified; + */ + struct { + __le32 sync; /* 4: syncpt address (0 = here) */ + } syncpt; + + /* + * MOUNT: file system mount + * + * file system mount: no type-dependent information; + */ + + /* + * ? FREEXTENT: free specified extent(s) + * + * free specified extent(s) from block allocation map + * N.B.: nextents should be length of data/sizeof(xad_t) + */ + struct { + __le32 type; /* 4: FREEXTENT record type */ + __le32 nextent; /* 4: number of extents */ + + /* data: PXD or XAD list */ + } freextent; + + /* + * ? NOREDOFILE: this file is freed + * + * do not apply records which precede this record in the log + * with the same inode number. + * + * NOREDOFILE must be the first to be written at commit + * (last to be read in logredo()) - it prevents + * replay of preceding updates of all preceding generations + * of the inumber esp. the on-disk inode itself. + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + } noredofile; + + /* + * ? NEWPAGE: + * + * metadata type dependent + */ + struct { + __le32 fileset; /* 4: fileset number */ + __le32 inode; /* 4: inode number */ + __le32 type; /* 4: NEWPAGE record type */ + pxd_t pxd; /* 8: on-disk page pxd */ + } newpage; + + /* + * ? DUMMY: filler + * + * no type-dependent information + */ + } log; +}; /* (36) */ + +#define LOGRDSIZE (sizeof(struct lrd)) + +/* + * line vector descriptor + */ +struct lvd { + __le16 offset; + __le16 length; +}; + + +/* + * log logical volume + */ +struct jfs_log { + + struct list_head sb_list;/* This is used to sync metadata + * before writing syncpt. + */ + struct list_head journal_list; /* Global list */ + struct block_device *bdev; /* 4: log lv pointer */ + int serial; /* 4: log mount serial number */ + + s64 base; /* @8: log extent address (inline log ) */ + int size; /* 4: log size in log page (in page) */ + int l2bsize; /* 4: log2 of bsize */ + + unsigned long flag; /* 4: flag */ + + struct lbuf *lbuf_free; /* 4: free lbufs */ + wait_queue_head_t free_wait; /* 4: */ + + /* log write */ + int logtid; /* 4: log tid */ + int page; /* 4: page number of eol page */ + int eor; /* 4: eor of last record in eol page */ + struct lbuf *bp; /* 4: current log page buffer */ + + struct mutex loglock; /* 4: log write serialization lock */ + + /* syncpt */ + int nextsync; /* 4: bytes to write before next syncpt */ + int active; /* 4: */ + wait_queue_head_t syncwait; /* 4: */ + + /* commit */ + uint cflag; /* 4: */ + struct list_head cqueue; /* FIFO commit queue */ + struct tblock *flush_tblk; /* tblk we're waiting on for flush */ + int gcrtc; /* 4: GC_READY transaction count */ + struct tblock *gclrt; /* 4: latest GC_READY transaction */ + spinlock_t gclock; /* 4: group commit lock */ + int logsize; /* 4: log data area size in byte */ + int lsn; /* 4: end-of-log */ + int clsn; /* 4: clsn */ + int syncpt; /* 4: addr of last syncpt record */ + int sync; /* 4: addr from last logsync() */ + struct list_head synclist; /* 8: logsynclist anchor */ + spinlock_t synclock; /* 4: synclist lock */ + struct lbuf *wqueue; /* 4: log pageout queue */ + int count; /* 4: count */ + char uuid[16]; /* 16: 128-bit uuid of log device */ + + int no_integrity; /* 3: flag to disable journaling to disk */ +}; + +/* + * Log flag + */ +#define log_INLINELOG 1 +#define log_SYNCBARRIER 2 +#define log_QUIESCE 3 +#define log_FLUSH 4 + +/* + * group commit flag + */ +/* jfs_log */ +#define logGC_PAGEOUT 0x00000001 + +/* tblock/lbuf */ +#define tblkGC_QUEUE 0x0001 +#define tblkGC_READY 0x0002 +#define tblkGC_COMMIT 0x0004 +#define tblkGC_COMMITTED 0x0008 +#define tblkGC_EOP 0x0010 +#define tblkGC_FREE 0x0020 +#define tblkGC_LEADER 0x0040 +#define tblkGC_ERROR 0x0080 +#define tblkGC_LAZY 0x0100 // D230860 +#define tblkGC_UNLOCKED 0x0200 // D230860 + +/* + * log cache buffer header + */ +struct lbuf { + struct jfs_log *l_log; /* 4: log associated with buffer */ + + /* + * data buffer base area + */ + uint l_flag; /* 4: pageout control flags */ + + struct lbuf *l_wqnext; /* 4: write queue link */ + struct lbuf *l_freelist; /* 4: freelistlink */ + + int l_pn; /* 4: log page number */ + int l_eor; /* 4: log record eor */ + int l_ceor; /* 4: committed log record eor */ + + s64 l_blkno; /* 8: log page block number */ + caddr_t l_ldata; /* 4: data page */ + struct page *l_page; /* The page itself */ + uint l_offset; /* Offset of l_ldata within the page */ + + wait_queue_head_t l_ioevent; /* 4: i/o done event */ +}; + +/* Reuse l_freelist for redrive list */ +#define l_redrive_next l_freelist + +/* + * logsynclist block + * + * common logsyncblk prefix for jbuf_t and tblock + */ +struct logsyncblk { + u16 xflag; /* flags */ + u16 flag; /* only meaninful in tblock */ + lid_t lid; /* lock id */ + s32 lsn; /* log sequence number */ + struct list_head synclist; /* log sync list link */ +}; + +/* + * logsynclist serialization (per log) + */ + +#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock) +#define LOGSYNC_LOCK(log, flags) spin_lock_irqsave(&(log)->synclock, flags) +#define LOGSYNC_UNLOCK(log, flags) \ + spin_unlock_irqrestore(&(log)->synclock, flags) + +/* compute the difference in bytes of lsn from sync point */ +#define logdiff(diff, lsn, log)\ +{\ + diff = (lsn) - (log)->syncpt;\ + if (diff < 0)\ + diff += (log)->logsize;\ +} + +extern int lmLogOpen(struct super_block *sb); +extern int lmLogClose(struct super_block *sb); +extern int lmLogShutdown(struct jfs_log * log); +extern int lmLogInit(struct jfs_log * log); +extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize); +extern int lmGroupCommit(struct jfs_log *, struct tblock *); +extern int jfsIOWait(void *); +extern void jfs_flush_journal(struct jfs_log * log, int wait); +extern void jfs_syncpt(struct jfs_log *log, int hard_sync); + +#endif /* _H_JFS_LOGMGR */ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c new file mode 100644 index 000000000..fa2c6824c --- /dev/null +++ b/fs/jfs/jfs_metapage.c @@ -0,0 +1,831 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/bio.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/buffer_head.h> +#include <linux/mempool.h> +#include <linux/seq_file.h> +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint pagealloc; /* # of page allocations */ + uint pagefree; /* # of page frees */ + uint lockwait; /* # of sleeping lock_metapage() calls */ +} mpStat; +#endif + +#define metapage_locked(mp) test_bit(META_locked, &(mp)->flag) +#define trylock_metapage(mp) test_and_set_bit_lock(META_locked, &(mp)->flag) + +static inline void unlock_metapage(struct metapage *mp) +{ + clear_bit_unlock(META_locked, &mp->flag); + wake_up(&mp->wait); +} + +static inline void __lock_metapage(struct metapage *mp) +{ + DECLARE_WAITQUEUE(wait, current); + INCREMENT(mpStat.lockwait); + add_wait_queue_exclusive(&mp->wait, &wait); + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (metapage_locked(mp)) { + unlock_page(mp->page); + io_schedule(); + lock_page(mp->page); + } + } while (trylock_metapage(mp)); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&mp->wait, &wait); +} + +/* + * Must have mp->page locked + */ +static inline void lock_metapage(struct metapage *mp) +{ + if (trylock_metapage(mp)) + __lock_metapage(mp); +} + +#define METAPOOL_MIN_PAGES 32 +static struct kmem_cache *metapage_cache; +static mempool_t *metapage_mempool; + +#define MPS_PER_PAGE (PAGE_SIZE >> L2PSIZE) + +#if MPS_PER_PAGE > 1 + +struct meta_anchor { + int mp_count; + atomic_t io_count; + struct metapage *mp[MPS_PER_PAGE]; +}; +#define mp_anchor(page) ((struct meta_anchor *)page_private(page)) + +static inline struct metapage *page_to_mp(struct page *page, int offset) +{ + if (!PagePrivate(page)) + return NULL; + return mp_anchor(page)->mp[offset >> L2PSIZE]; +} + +static inline int insert_metapage(struct page *page, struct metapage *mp) +{ + struct meta_anchor *a; + int index; + int l2mp_blocks; /* log2 blocks per metapage */ + + if (PagePrivate(page)) + a = mp_anchor(page); + else { + a = kzalloc(sizeof(struct meta_anchor), GFP_NOFS); + if (!a) + return -ENOMEM; + set_page_private(page, (unsigned long)a); + SetPagePrivate(page); + kmap(page); + } + + if (mp) { + l2mp_blocks = L2PSIZE - page->mapping->host->i_blkbits; + index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1); + a->mp_count++; + a->mp[index] = mp; + } + + return 0; +} + +static inline void remove_metapage(struct page *page, struct metapage *mp) +{ + struct meta_anchor *a = mp_anchor(page); + int l2mp_blocks = L2PSIZE - page->mapping->host->i_blkbits; + int index; + + index = (mp->index >> l2mp_blocks) & (MPS_PER_PAGE - 1); + + BUG_ON(a->mp[index] != mp); + + a->mp[index] = NULL; + if (--a->mp_count == 0) { + kfree(a); + set_page_private(page, 0); + ClearPagePrivate(page); + kunmap(page); + } +} + +static inline void inc_io(struct page *page) +{ + atomic_inc(&mp_anchor(page)->io_count); +} + +static inline void dec_io(struct page *page, void (*handler) (struct page *)) +{ + if (atomic_dec_and_test(&mp_anchor(page)->io_count)) + handler(page); +} + +#else +static inline struct metapage *page_to_mp(struct page *page, int offset) +{ + return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL; +} + +static inline int insert_metapage(struct page *page, struct metapage *mp) +{ + if (mp) { + set_page_private(page, (unsigned long)mp); + SetPagePrivate(page); + kmap(page); + } + return 0; +} + +static inline void remove_metapage(struct page *page, struct metapage *mp) +{ + set_page_private(page, 0); + ClearPagePrivate(page); + kunmap(page); +} + +#define inc_io(page) do {} while(0) +#define dec_io(page, handler) handler(page) + +#endif + +static inline struct metapage *alloc_metapage(gfp_t gfp_mask) +{ + struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask); + + if (mp) { + mp->lid = 0; + mp->lsn = 0; + mp->data = NULL; + mp->clsn = 0; + mp->log = NULL; + init_waitqueue_head(&mp->wait); + } + return mp; +} + +static inline void free_metapage(struct metapage *mp) +{ + mempool_free(mp, metapage_mempool); +} + +int __init metapage_init(void) +{ + /* + * Allocate the metapage structures + */ + metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage), + 0, 0, NULL); + if (metapage_cache == NULL) + return -ENOMEM; + + metapage_mempool = mempool_create_slab_pool(METAPOOL_MIN_PAGES, + metapage_cache); + + if (metapage_mempool == NULL) { + kmem_cache_destroy(metapage_cache); + return -ENOMEM; + } + + return 0; +} + +void metapage_exit(void) +{ + mempool_destroy(metapage_mempool); + kmem_cache_destroy(metapage_cache); +} + +static inline void drop_metapage(struct page *page, struct metapage *mp) +{ + if (mp->count || mp->nohomeok || test_bit(META_dirty, &mp->flag) || + test_bit(META_io, &mp->flag)) + return; + remove_metapage(page, mp); + INCREMENT(mpStat.pagefree); + free_metapage(mp); +} + +/* + * Metapage address space operations + */ + +static sector_t metapage_get_blocks(struct inode *inode, sector_t lblock, + int *len) +{ + int rc = 0; + int xflag; + s64 xaddr; + sector_t file_blocks = (inode->i_size + inode->i_sb->s_blocksize - 1) >> + inode->i_blkbits; + + if (lblock >= file_blocks) + return 0; + if (lblock + *len > file_blocks) + *len = file_blocks - lblock; + + if (inode->i_ino) { + rc = xtLookup(inode, (s64)lblock, *len, &xflag, &xaddr, len, 0); + if ((rc == 0) && *len) + lblock = (sector_t)xaddr; + else + lblock = 0; + } /* else no mapping */ + + return lblock; +} + +static void last_read_complete(struct page *page) +{ + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); +} + +static void metapage_read_end_io(struct bio *bio) +{ + struct page *page = bio->bi_private; + + if (bio->bi_status) { + printk(KERN_ERR "metapage_read_end_io: I/O error\n"); + SetPageError(page); + } + + dec_io(page, last_read_complete); + bio_put(bio); +} + +static void remove_from_logsync(struct metapage *mp) +{ + struct jfs_log *log = mp->log; + unsigned long flags; +/* + * This can race. Recheck that log hasn't been set to null, and after + * acquiring logsync lock, recheck lsn + */ + if (!log) + return; + + LOGSYNC_LOCK(log, flags); + if (mp->lsn) { + mp->log = NULL; + mp->lsn = 0; + mp->clsn = 0; + log->count--; + list_del(&mp->synclist); + } + LOGSYNC_UNLOCK(log, flags); +} + +static void last_write_complete(struct page *page) +{ + struct metapage *mp; + unsigned int offset; + + for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + if (mp && test_bit(META_io, &mp->flag)) { + if (mp->lsn) + remove_from_logsync(mp); + clear_bit(META_io, &mp->flag); + } + /* + * I'd like to call drop_metapage here, but I don't think it's + * safe unless I have the page locked + */ + } + end_page_writeback(page); +} + +static void metapage_write_end_io(struct bio *bio) +{ + struct page *page = bio->bi_private; + + BUG_ON(!PagePrivate(page)); + + if (bio->bi_status) { + printk(KERN_ERR "metapage_write_end_io: I/O error\n"); + SetPageError(page); + } + dec_io(page, last_write_complete); + bio_put(bio); +} + +static int metapage_writepage(struct page *page, struct writeback_control *wbc) +{ + struct bio *bio = NULL; + int block_offset; /* block offset of mp within page */ + struct inode *inode = page->mapping->host; + int blocks_per_mp = JFS_SBI(inode->i_sb)->nbperpage; + int len; + int xlen; + struct metapage *mp; + int redirty = 0; + sector_t lblock; + int nr_underway = 0; + sector_t pblock; + sector_t next_block = 0; + sector_t page_start; + unsigned long bio_bytes = 0; + unsigned long bio_offset = 0; + int offset; + int bad_blocks = 0; + + page_start = (sector_t)page->index << + (PAGE_SHIFT - inode->i_blkbits); + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + + if (!mp || !test_bit(META_dirty, &mp->flag)) + continue; + + if (mp->nohomeok && !test_bit(META_forcewrite, &mp->flag)) { + redirty = 1; + /* + * Make sure this page isn't blocked indefinitely. + * If the journal isn't undergoing I/O, push it + */ + if (mp->log && !(mp->log->cflag & logGC_PAGEOUT)) + jfs_flush_journal(mp->log, 0); + continue; + } + + clear_bit(META_dirty, &mp->flag); + set_bit(META_io, &mp->flag); + block_offset = offset >> inode->i_blkbits; + lblock = page_start + block_offset; + if (bio) { + if (xlen && lblock == next_block) { + /* Contiguous, in memory & on disk */ + len = min(xlen, blocks_per_mp); + xlen -= len; + bio_bytes += len << inode->i_blkbits; + continue; + } + /* Not contiguous */ + if (bio_add_page(bio, page, bio_bytes, bio_offset) < + bio_bytes) + goto add_failed; + /* + * Increment counter before submitting i/o to keep + * count from hitting zero before we're through + */ + inc_io(page); + if (!bio->bi_iter.bi_size) + goto dump_bio; + submit_bio(bio); + nr_underway++; + bio = NULL; + } else + inc_io(page); + xlen = (PAGE_SIZE - offset) >> inode->i_blkbits; + pblock = metapage_get_blocks(inode, lblock, &xlen); + if (!pblock) { + printk(KERN_ERR "JFS: metapage_get_blocks failed\n"); + /* + * We already called inc_io(), but can't cancel it + * with dec_io() until we're done with the page + */ + bad_blocks++; + continue; + } + len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); + + bio = bio_alloc(GFP_NOFS, 1); + bio_set_dev(bio, inode->i_sb->s_bdev); + bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); + bio->bi_end_io = metapage_write_end_io; + bio->bi_private = page; + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + /* Don't call bio_add_page yet, we may add to this vec */ + bio_offset = offset; + bio_bytes = len << inode->i_blkbits; + + xlen -= len; + next_block = lblock + len; + } + if (bio) { + if (bio_add_page(bio, page, bio_bytes, bio_offset) < bio_bytes) + goto add_failed; + if (!bio->bi_iter.bi_size) + goto dump_bio; + + submit_bio(bio); + nr_underway++; + } + if (redirty) + redirty_page_for_writepage(wbc, page); + + unlock_page(page); + + if (bad_blocks) + goto err_out; + + if (nr_underway == 0) + end_page_writeback(page); + + return 0; +add_failed: + /* We should never reach here, since we're only adding one vec */ + printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); + goto skip; +dump_bio: + print_hex_dump(KERN_ERR, "JFS: dump of bio: ", DUMP_PREFIX_ADDRESS, 16, + 4, bio, sizeof(*bio), 0); +skip: + bio_put(bio); + unlock_page(page); + dec_io(page, last_write_complete); +err_out: + while (bad_blocks--) + dec_io(page, last_write_complete); + return -EIO; +} + +static int metapage_readpage(struct file *fp, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct bio *bio = NULL; + int block_offset; + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + sector_t page_start; /* address of page in fs blocks */ + sector_t pblock; + int xlen; + unsigned int len; + int offset; + + BUG_ON(!PageLocked(page)); + page_start = (sector_t)page->index << + (PAGE_SHIFT - inode->i_blkbits); + + block_offset = 0; + while (block_offset < blocks_per_page) { + xlen = blocks_per_page - block_offset; + pblock = metapage_get_blocks(inode, page_start + block_offset, + &xlen); + if (pblock) { + if (!PagePrivate(page)) + insert_metapage(page, NULL); + inc_io(page); + if (bio) + submit_bio(bio); + + bio = bio_alloc(GFP_NOFS, 1); + bio_set_dev(bio, inode->i_sb->s_bdev); + bio->bi_iter.bi_sector = + pblock << (inode->i_blkbits - 9); + bio->bi_end_io = metapage_read_end_io; + bio->bi_private = page; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + len = xlen << inode->i_blkbits; + offset = block_offset << inode->i_blkbits; + if (bio_add_page(bio, page, len, offset) < len) + goto add_failed; + block_offset += xlen; + } else + block_offset++; + } + if (bio) + submit_bio(bio); + else + unlock_page(page); + + return 0; + +add_failed: + printk(KERN_ERR "JFS: bio_add_page failed unexpectedly\n"); + bio_put(bio); + dec_io(page, last_read_complete); + return -EIO; +} + +static int metapage_releasepage(struct page *page, gfp_t gfp_mask) +{ + struct metapage *mp; + int ret = 1; + int offset; + + for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + + if (!mp) + continue; + + jfs_info("metapage_releasepage: mp = 0x%p", mp); + if (mp->count || mp->nohomeok || + test_bit(META_dirty, &mp->flag)) { + jfs_info("count = %ld, nohomeok = %d", mp->count, + mp->nohomeok); + ret = 0; + continue; + } + if (mp->lsn) + remove_from_logsync(mp); + remove_metapage(page, mp); + INCREMENT(mpStat.pagefree); + free_metapage(mp); + } + return ret; +} + +static void metapage_invalidatepage(struct page *page, unsigned int offset, + unsigned int length) +{ + BUG_ON(offset || length < PAGE_SIZE); + + BUG_ON(PageWriteback(page)); + + metapage_releasepage(page, 0); +} + +const struct address_space_operations jfs_metapage_aops = { + .readpage = metapage_readpage, + .writepage = metapage_writepage, + .releasepage = metapage_releasepage, + .invalidatepage = metapage_invalidatepage, + .set_page_dirty = __set_page_dirty_nobuffers, +}; + +struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, + unsigned int size, int absolute, + unsigned long new) +{ + int l2BlocksPerPage; + int l2bsize; + struct address_space *mapping; + struct metapage *mp = NULL; + struct page *page; + unsigned long page_index; + unsigned long page_offset; + + jfs_info("__get_metapage: ino = %ld, lblock = 0x%lx, abs=%d", + inode->i_ino, lblock, absolute); + + l2bsize = inode->i_blkbits; + l2BlocksPerPage = PAGE_SHIFT - l2bsize; + page_index = lblock >> l2BlocksPerPage; + page_offset = (lblock - (page_index << l2BlocksPerPage)) << l2bsize; + if ((page_offset + size) > PAGE_SIZE) { + jfs_err("MetaData crosses page boundary!!"); + jfs_err("lblock = %lx, size = %d", lblock, size); + dump_stack(); + return NULL; + } + if (absolute) + mapping = JFS_SBI(inode->i_sb)->direct_inode->i_mapping; + else { + /* + * If an nfs client tries to read an inode that is larger + * than any existing inodes, we may try to read past the + * end of the inode map + */ + if ((lblock << inode->i_blkbits) >= inode->i_size) + return NULL; + mapping = inode->i_mapping; + } + + if (new && (PSIZE == PAGE_SIZE)) { + page = grab_cache_page(mapping, page_index); + if (!page) { + jfs_err("grab_cache_page failed!"); + return NULL; + } + SetPageUptodate(page); + } else { + page = read_mapping_page(mapping, page_index, NULL); + if (IS_ERR(page) || !PageUptodate(page)) { + jfs_err("read_mapping_page failed!"); + return NULL; + } + lock_page(page); + } + + mp = page_to_mp(page, page_offset); + if (mp) { + if (mp->logical_size != size) { + jfs_error(inode->i_sb, + "get_mp->logical_size != size\n"); + jfs_err("logical_size = %d, size = %d", + mp->logical_size, size); + dump_stack(); + goto unlock; + } + mp->count++; + lock_metapage(mp); + if (test_bit(META_discard, &mp->flag)) { + if (!new) { + jfs_error(inode->i_sb, + "using a discarded metapage\n"); + discard_metapage(mp); + goto unlock; + } + clear_bit(META_discard, &mp->flag); + } + } else { + INCREMENT(mpStat.pagealloc); + mp = alloc_metapage(GFP_NOFS); + if (!mp) + goto unlock; + mp->page = page; + mp->sb = inode->i_sb; + mp->flag = 0; + mp->xflag = COMMIT_PAGE; + mp->count = 1; + mp->nohomeok = 0; + mp->logical_size = size; + mp->data = page_address(page) + page_offset; + mp->index = lblock; + if (unlikely(insert_metapage(page, mp))) { + free_metapage(mp); + goto unlock; + } + lock_metapage(mp); + } + + if (new) { + jfs_info("zeroing mp = 0x%p", mp); + memset(mp->data, 0, PSIZE); + } + + unlock_page(page); + jfs_info("__get_metapage: returning = 0x%p data = 0x%p", mp, mp->data); + return mp; + +unlock: + unlock_page(page); + return NULL; +} + +void grab_metapage(struct metapage * mp) +{ + jfs_info("grab_metapage: mp = 0x%p", mp); + get_page(mp->page); + lock_page(mp->page); + mp->count++; + lock_metapage(mp); + unlock_page(mp->page); +} + +void force_metapage(struct metapage *mp) +{ + struct page *page = mp->page; + jfs_info("force_metapage: mp = 0x%p", mp); + set_bit(META_forcewrite, &mp->flag); + clear_bit(META_sync, &mp->flag); + get_page(page); + lock_page(page); + set_page_dirty(page); + if (write_one_page(page)) + jfs_error(mp->sb, "write_one_page() failed\n"); + clear_bit(META_forcewrite, &mp->flag); + put_page(page); +} + +void hold_metapage(struct metapage *mp) +{ + lock_page(mp->page); +} + +void put_metapage(struct metapage *mp) +{ + if (mp->count || mp->nohomeok) { + /* Someone else will release this */ + unlock_page(mp->page); + return; + } + get_page(mp->page); + mp->count++; + lock_metapage(mp); + unlock_page(mp->page); + release_metapage(mp); +} + +void release_metapage(struct metapage * mp) +{ + struct page *page = mp->page; + jfs_info("release_metapage: mp = 0x%p, flag = 0x%lx", mp, mp->flag); + + BUG_ON(!page); + + lock_page(page); + unlock_metapage(mp); + + assert(mp->count); + if (--mp->count || mp->nohomeok) { + unlock_page(page); + put_page(page); + return; + } + + if (test_bit(META_dirty, &mp->flag)) { + set_page_dirty(page); + if (test_bit(META_sync, &mp->flag)) { + clear_bit(META_sync, &mp->flag); + if (write_one_page(page)) + jfs_error(mp->sb, "write_one_page() failed\n"); + lock_page(page); /* write_one_page unlocks the page */ + } + } else if (mp->lsn) /* discard_metapage doesn't remove it */ + remove_from_logsync(mp); + + /* Try to keep metapages from using up too much memory */ + drop_metapage(page, mp); + + unlock_page(page); + put_page(page); +} + +void __invalidate_metapages(struct inode *ip, s64 addr, int len) +{ + sector_t lblock; + int l2BlocksPerPage = PAGE_SHIFT - ip->i_blkbits; + int BlocksPerPage = 1 << l2BlocksPerPage; + /* All callers are interested in block device's mapping */ + struct address_space *mapping = + JFS_SBI(ip->i_sb)->direct_inode->i_mapping; + struct metapage *mp; + struct page *page; + unsigned int offset; + + /* + * Mark metapages to discard. They will eventually be + * released, but should not be written. + */ + for (lblock = addr & ~(BlocksPerPage - 1); lblock < addr + len; + lblock += BlocksPerPage) { + page = find_lock_page(mapping, lblock >> l2BlocksPerPage); + if (!page) + continue; + for (offset = 0; offset < PAGE_SIZE; offset += PSIZE) { + mp = page_to_mp(page, offset); + if (!mp) + continue; + if (mp->index < addr) + continue; + if (mp->index >= addr + len) + break; + + clear_bit(META_dirty, &mp->flag); + set_bit(META_discard, &mp->flag); + if (mp->lsn) + remove_from_logsync(mp); + } + unlock_page(page); + put_page(page); + } +} + +#ifdef CONFIG_JFS_STATISTICS +int jfs_mpstat_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Metapage statistics\n" + "=======================\n" + "page allocations = %d\n" + "page frees = %d\n" + "lock waits = %d\n", + mpStat.pagealloc, + mpStat.pagefree, + mpStat.lockwait); + return 0; +} +#endif diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h new file mode 100644 index 000000000..8b0ee514e --- /dev/null +++ b/fs/jfs/jfs_metapage.h @@ -0,0 +1,155 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_METAPAGE +#define _H_JFS_METAPAGE + +#include <linux/pagemap.h> + +struct metapage { + /* Common logsyncblk prefix (see jfs_logmgr.h) */ + u16 xflag; + u16 unused; + lid_t lid; + int lsn; + struct list_head synclist; + /* End of logsyncblk prefix */ + + unsigned long flag; /* See Below */ + unsigned long count; /* Reference count */ + void *data; /* Data pointer */ + sector_t index; /* block address of page */ + wait_queue_head_t wait; + + /* implementation */ + struct page *page; + struct super_block *sb; + unsigned int logical_size; + + /* Journal management */ + int clsn; + int nohomeok; + struct jfs_log *log; +}; + +/* metapage flag */ +#define META_locked 0 +#define META_dirty 2 +#define META_sync 3 +#define META_discard 4 +#define META_forcewrite 5 +#define META_io 6 + +#define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag) + +/* function prototypes */ +extern int metapage_init(void); +extern void metapage_exit(void); +extern struct metapage *__get_metapage(struct inode *inode, + unsigned long lblock, unsigned int size, + int absolute, unsigned long new); + +#define read_metapage(inode, lblock, size, absolute)\ + __get_metapage(inode, lblock, size, absolute, false) + +#define get_metapage(inode, lblock, size, absolute)\ + __get_metapage(inode, lblock, size, absolute, true) + +extern void release_metapage(struct metapage *); +extern void grab_metapage(struct metapage *); +extern void force_metapage(struct metapage *); + +/* + * hold_metapage and put_metapage are used in conjunction. The page lock + * is not dropped between the two, so no other threads can get or release + * the metapage + */ +extern void hold_metapage(struct metapage *); +extern void put_metapage(struct metapage *); + +static inline void write_metapage(struct metapage *mp) +{ + set_bit(META_dirty, &mp->flag); + release_metapage(mp); +} + +static inline void flush_metapage(struct metapage *mp) +{ + set_bit(META_sync, &mp->flag); + write_metapage(mp); +} + +static inline void discard_metapage(struct metapage *mp) +{ + clear_bit(META_dirty, &mp->flag); + set_bit(META_discard, &mp->flag); + release_metapage(mp); +} + +static inline void metapage_nohomeok(struct metapage *mp) +{ + struct page *page = mp->page; + lock_page(page); + if (!mp->nohomeok++) { + mark_metapage_dirty(mp); + get_page(page); + wait_on_page_writeback(page); + } + unlock_page(page); +} + +/* + * This serializes access to mp->lsn when metapages are added to logsynclist + * without setting nohomeok. i.e. updating imap & dmap + */ +static inline void metapage_wait_for_io(struct metapage *mp) +{ + if (test_bit(META_io, &mp->flag)) + wait_on_page_writeback(mp->page); +} + +/* + * This is called when already holding the metapage + */ +static inline void _metapage_homeok(struct metapage *mp) +{ + if (!--mp->nohomeok) + put_page(mp->page); +} + +static inline void metapage_homeok(struct metapage *mp) +{ + hold_metapage(mp); + _metapage_homeok(mp); + put_metapage(mp); +} + +extern const struct address_space_operations jfs_metapage_aops; + +/* + * This routines invalidate all pages for an extent. + */ +extern void __invalidate_metapages(struct inode *, s64, int); +#define invalidate_pxd_metapages(ip, pxd) \ + __invalidate_metapages((ip), addressPXD(&(pxd)), lengthPXD(&(pxd))) +#define invalidate_dxd_metapages(ip, dxd) \ + __invalidate_metapages((ip), addressDXD(&(dxd)), lengthDXD(&(dxd))) +#define invalidate_xad_metapages(ip, xad) \ + __invalidate_metapages((ip), addressXAD(&(xad)), lengthXAD(&(xad))) + +#endif /* _H_JFS_METAPAGE */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c new file mode 100644 index 000000000..f1a705d15 --- /dev/null +++ b/fs/jfs/jfs_mount.c @@ -0,0 +1,510 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Module: jfs_mount.c + * + * note: file system in transition to aggregate/fileset: + * + * file system mount is interpreted as the mount of aggregate, + * if not already mounted, and mount of the single/only fileset in + * the aggregate; + * + * a file system/aggregate is represented by an internal inode + * (aka mount inode) initialized with aggregate superblock; + * each vfs represents a fileset, and points to its "fileset inode + * allocation map inode" (aka fileset inode): + * (an aggregate itself is structured recursively as a filset: + * an internal vfs is constructed and points to its "fileset inode + * allocation map inode" (aka aggregate inode) where each inode + * represents a fileset inode) so that inode number is mapped to + * on-disk inode in uniform way at both aggregate and fileset level; + * + * each vnode/inode of a fileset is linked to its vfs (to facilitate + * per fileset inode operations, e.g., unmount of a fileset, etc.); + * each inode points to the mount inode (to facilitate access to + * per aggregate information, e.g., block size, etc.) as well as + * its file set inode. + * + * aggregate + * ipmnt + * mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap; + * fileset vfs -> vp(1) <-> ... <-> vp(n) <->vproot; + */ + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/log2.h> + +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + + +/* + * forward references + */ +static int chkSuper(struct super_block *); +static int logMOUNT(struct super_block *sb); + +/* + * NAME: jfs_mount(sb) + * + * FUNCTION: vfs_mount() + * + * PARAMETER: sb - super block + * + * RETURN: -EBUSY - device already mounted or open for write + * -EBUSY - cvrdvp already mounted; + * -EBUSY - mount table full + * -ENOTDIR- cvrdvp not directory on a device mount + * -ENXIO - device open failure + */ +int jfs_mount(struct super_block *sb) +{ + int rc = 0; /* Return code */ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipaimap = NULL; + struct inode *ipaimap2 = NULL; + struct inode *ipimap = NULL; + struct inode *ipbmap = NULL; + + /* + * read/validate superblock + * (initialize mount inode from the superblock) + */ + if ((rc = chkSuper(sb))) { + goto out; + } + + ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); + if (ipaimap == NULL) { + jfs_err("jfs_mount: Failed to read AGGREGATE_I"); + rc = -EIO; + goto out; + } + sbi->ipaimap = ipaimap; + + jfs_info("jfs_mount: ipaimap:0x%p", ipaimap); + + /* + * initialize aggregate inode allocation map + */ + if ((rc = diMount(ipaimap))) { + jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc); + goto err_ipaimap; + } + + /* + * open aggregate block allocation map + */ + ipbmap = diReadSpecial(sb, BMAP_I, 0); + if (ipbmap == NULL) { + rc = -EIO; + goto err_umount_ipaimap; + } + + jfs_info("jfs_mount: ipbmap:0x%p", ipbmap); + + sbi->ipbmap = ipbmap; + + /* + * initialize aggregate block allocation map + */ + if ((rc = dbMount(ipbmap))) { + jfs_err("jfs_mount: dbMount failed w/rc = %d", rc); + goto err_ipbmap; + } + + /* + * open the secondary aggregate inode allocation map + * + * This is a duplicate of the aggregate inode allocation map. + * + * hand craft a vfs in the same fashion as we did to read ipaimap. + * By adding INOSPEREXT (32) to the inode number, we are telling + * diReadSpecial that we are reading from the secondary aggregate + * inode table. This also creates a unique entry in the inode hash + * table. + */ + if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { + ipaimap2 = diReadSpecial(sb, AGGREGATE_I, 1); + if (!ipaimap2) { + jfs_err("jfs_mount: Failed to read AGGREGATE_I"); + rc = -EIO; + goto err_umount_ipbmap; + } + sbi->ipaimap2 = ipaimap2; + + jfs_info("jfs_mount: ipaimap2:0x%p", ipaimap2); + + /* + * initialize secondary aggregate inode allocation map + */ + if ((rc = diMount(ipaimap2))) { + jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d", + rc); + goto err_ipaimap2; + } + } else + /* Secondary aggregate inode table is not valid */ + sbi->ipaimap2 = NULL; + + /* + * mount (the only/single) fileset + */ + /* + * open fileset inode allocation map (aka fileset inode) + */ + ipimap = diReadSpecial(sb, FILESYSTEM_I, 0); + if (ipimap == NULL) { + jfs_err("jfs_mount: Failed to read FILESYSTEM_I"); + /* open fileset secondary inode allocation map */ + rc = -EIO; + goto err_umount_ipaimap2; + } + jfs_info("jfs_mount: ipimap:0x%p", ipimap); + + /* map further access of per fileset inodes by the fileset inode */ + sbi->ipimap = ipimap; + + /* initialize fileset inode allocation map */ + if ((rc = diMount(ipimap))) { + jfs_err("jfs_mount: diMount failed w/rc = %d", rc); + goto err_ipimap; + } + + return rc; + + /* + * unwind on error + */ +err_ipimap: + /* close fileset inode allocation map inode */ + diFreeSpecial(ipimap); +err_umount_ipaimap2: + /* close secondary aggregate inode allocation map */ + if (ipaimap2) + diUnmount(ipaimap2, 1); +err_ipaimap2: + /* close aggregate inodes */ + if (ipaimap2) + diFreeSpecial(ipaimap2); +err_umount_ipbmap: /* close aggregate block allocation map */ + dbUnmount(ipbmap, 1); +err_ipbmap: /* close aggregate inodes */ + diFreeSpecial(ipbmap); +err_umount_ipaimap: /* close aggregate inode allocation map */ + diUnmount(ipaimap, 1); +err_ipaimap: /* close aggregate inodes */ + diFreeSpecial(ipaimap); +out: + if (rc) + jfs_err("Mount JFS Failure: %d", rc); + + return rc; +} + +/* + * NAME: jfs_mount_rw(sb, remount) + * + * FUNCTION: Completes read-write mount, or remounts read-only volume + * as read-write + */ +int jfs_mount_rw(struct super_block *sb, int remount) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + int rc; + + /* + * If we are re-mounting a previously read-only volume, we want to + * re-read the inode and block maps, since fsck.jfs may have updated + * them. + */ + if (remount) { + if (chkSuper(sb) || (sbi->state != FM_CLEAN)) + return -EINVAL; + + truncate_inode_pages(sbi->ipimap->i_mapping, 0); + truncate_inode_pages(sbi->ipbmap->i_mapping, 0); + diUnmount(sbi->ipimap, 1); + if ((rc = diMount(sbi->ipimap))) { + jfs_err("jfs_mount_rw: diMount failed!"); + return rc; + } + + dbUnmount(sbi->ipbmap, 1); + if ((rc = dbMount(sbi->ipbmap))) { + jfs_err("jfs_mount_rw: dbMount failed!"); + return rc; + } + } + + /* + * open/initialize log + */ + if ((rc = lmLogOpen(sb))) + return rc; + + /* + * update file system superblock; + */ + if ((rc = updateSuper(sb, FM_MOUNT))) { + jfs_err("jfs_mount: updateSuper failed w/rc = %d", rc); + lmLogClose(sb); + return rc; + } + + /* + * write MOUNT log record of the file system + */ + logMOUNT(sb); + + return rc; +} + +/* + * chkSuper() + * + * validate the superblock of the file system to be mounted and + * get the file system parameters. + * + * returns + * 0 with fragsize set if check successful + * error code if not successful + */ +static int chkSuper(struct super_block *sb) +{ + int rc = 0; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_superblock *j_sb; + struct buffer_head *bh; + int AIM_bytesize, AIT_bytesize; + int expected_AIM_bytesize, expected_AIT_bytesize; + s64 AIM_byte_addr, AIT_byte_addr, fsckwsp_addr; + s64 byte_addr_diff0, byte_addr_diff1; + s32 bsize; + + if ((rc = readSuper(sb, &bh))) + return rc; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* + * validate superblock + */ + /* validate fs signature */ + if (strncmp(j_sb->s_magic, JFS_MAGIC, 4) || + le32_to_cpu(j_sb->s_version) > JFS_VERSION) { + rc = -EINVAL; + goto out; + } + + bsize = le32_to_cpu(j_sb->s_bsize); +#ifdef _JFS_4K + if (bsize != PSIZE) { + jfs_err("Currently only 4K block size supported!"); + rc = -EINVAL; + goto out; + } +#endif /* _JFS_4K */ + + jfs_info("superblock: flag:0x%08x state:0x%08x size:0x%Lx", + le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state), + (unsigned long long) le64_to_cpu(j_sb->s_size)); + + /* validate the descriptors for Secondary AIM and AIT */ + if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) != + cpu_to_le32(JFS_BAD_SAIT)) { + expected_AIM_bytesize = 2 * PSIZE; + AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize; + expected_AIT_bytesize = 4 * PSIZE; + AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize; + AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize; + AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize; + byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr; + fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize; + byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr; + if ((AIM_bytesize != expected_AIM_bytesize) || + (AIT_bytesize != expected_AIT_bytesize) || + (byte_addr_diff0 != AIM_bytesize) || + (byte_addr_diff1 <= AIT_bytesize)) + j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); + } + + if ((j_sb->s_flag & cpu_to_le32(JFS_GROUPCOMMIT)) != + cpu_to_le32(JFS_GROUPCOMMIT)) + j_sb->s_flag |= cpu_to_le32(JFS_GROUPCOMMIT); + + /* validate fs state */ + if (j_sb->s_state != cpu_to_le32(FM_CLEAN) && + !sb_rdonly(sb)) { + jfs_err("jfs_mount: Mount Failure: File System Dirty."); + rc = -EINVAL; + goto out; + } + + sbi->state = le32_to_cpu(j_sb->s_state); + sbi->mntflag = le32_to_cpu(j_sb->s_flag); + + /* + * JFS always does I/O by 4K pages. Don't tell the buffer cache + * that we use anything else (leave s_blocksize alone). + */ + sbi->bsize = bsize; + sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize); + + /* check some fields for possible corruption */ + if (sbi->l2bsize != ilog2((u32)bsize) || + j_sb->pad != 0 || + le32_to_cpu(j_sb->s_state) > FM_STATE_MAX) { + rc = -EINVAL; + jfs_err("jfs_mount: Mount Failure: superblock is corrupt!"); + goto out; + } + + /* + * For now, ignore s_pbsize, l2bfactor. All I/O going through buffer + * cache. + */ + sbi->nbperpage = PSIZE >> sbi->l2bsize; + sbi->l2nbperpage = L2PSIZE - sbi->l2bsize; + sbi->l2niperblk = sbi->l2bsize - L2DISIZE; + if (sbi->mntflag & JFS_INLINELOG) + sbi->logpxd = j_sb->s_logpxd; + else { + sbi->logdev = new_decode_dev(le32_to_cpu(j_sb->s_logdev)); + memcpy(sbi->uuid, j_sb->s_uuid, sizeof(sbi->uuid)); + memcpy(sbi->loguuid, j_sb->s_loguuid, sizeof(sbi->uuid)); + } + sbi->fsckpxd = j_sb->s_fsckpxd; + sbi->ait2 = j_sb->s_ait2; + + out: + brelse(bh); + return rc; +} + + +/* + * updateSuper() + * + * update synchronously superblock if it is mounted read-write. + */ +int updateSuper(struct super_block *sb, uint state) +{ + struct jfs_superblock *j_sb; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct buffer_head *bh; + int rc; + + if (sbi->flag & JFS_NOINTEGRITY) { + if (state == FM_DIRTY) { + sbi->p_state = state; + return 0; + } else if (state == FM_MOUNT) { + sbi->p_state = sbi->state; + state = FM_DIRTY; + } else if (state == FM_CLEAN) { + state = sbi->p_state; + } else + jfs_err("updateSuper: bad state"); + } else if (sbi->state == FM_DIRTY) + return 0; + + if ((rc = readSuper(sb, &bh))) + return rc; + + j_sb = (struct jfs_superblock *)bh->b_data; + + j_sb->s_state = cpu_to_le32(state); + sbi->state = state; + + if (state == FM_MOUNT) { + /* record log's dev_t and mount serial number */ + j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev)); + j_sb->s_logserial = cpu_to_le32(sbi->log->serial); + } else if (state == FM_CLEAN) { + /* + * If this volume is shared with OS/2, OS/2 will need to + * recalculate DASD usage, since we don't deal with it. + */ + if (j_sb->s_flag & cpu_to_le32(JFS_DASD_ENABLED)) + j_sb->s_flag |= cpu_to_le32(JFS_DASD_PRIME); + } + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + return 0; +} + + +/* + * readSuper() + * + * read superblock by raw sector address + */ +int readSuper(struct super_block *sb, struct buffer_head **bpp) +{ + /* read in primary superblock */ + *bpp = sb_bread(sb, SUPER1_OFF >> sb->s_blocksize_bits); + if (*bpp) + return 0; + + /* read in secondary/replicated superblock */ + *bpp = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits); + if (*bpp) + return 0; + + return -EIO; +} + + +/* + * logMOUNT() + * + * function: write a MOUNT log record for file system. + * + * MOUNT record keeps logredo() from processing log records + * for this file system past this point in log. + * it is harmless if mount fails. + * + * note: MOUNT record is at aggregate level, not at fileset level, + * since log records of previous mounts of a fileset + * (e.g., AFTER record of extent allocation) have to be processed + * to update block allocation map at aggregate level. + */ +static int logMOUNT(struct super_block *sb) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + struct lrd lrd; + + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_MOUNT); + lrd.length = 0; + lrd.aggregate = cpu_to_le32(new_encode_dev(sb->s_bdev->bd_dev)); + lmLog(log, NULL, &lrd, NULL); + + return 0; +} diff --git a/fs/jfs/jfs_superblock.h b/fs/jfs/jfs_superblock.h new file mode 100644 index 000000000..04847b8d3 --- /dev/null +++ b/fs/jfs/jfs_superblock.h @@ -0,0 +1,122 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2003 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_SUPERBLOCK +#define _H_JFS_SUPERBLOCK + +/* + * make the magic number something a human could read + */ +#define JFS_MAGIC "JFS1" /* Magic word */ + +#define JFS_VERSION 2 /* Version number: Version 2 */ + +#define LV_NAME_SIZE 11 /* MUST BE 11 for OS/2 boot sector */ + +/* + * aggregate superblock + * + * The name superblock is too close to super_block, so the name has been + * changed to jfs_superblock. The utilities are still using the old name. + */ +struct jfs_superblock { + char s_magic[4]; /* 4: magic number */ + __le32 s_version; /* 4: version number */ + + __le64 s_size; /* 8: aggregate size in hardware/LVM blocks; + * VFS: number of blocks + */ + __le32 s_bsize; /* 4: aggregate block size in bytes; + * VFS: fragment size + */ + __le16 s_l2bsize; /* 2: log2 of s_bsize */ + __le16 s_l2bfactor; /* 2: log2(s_bsize/hardware block size) */ + __le32 s_pbsize; /* 4: hardware/LVM block size in bytes */ + __le16 s_l2pbsize; /* 2: log2 of s_pbsize */ + __le16 pad; /* 2: padding necessary for alignment */ + + __le32 s_agsize; /* 4: allocation group size in aggr. blocks */ + + __le32 s_flag; /* 4: aggregate attributes: + * see jfs_filsys.h + */ + __le32 s_state; /* 4: mount/unmount/recovery state: + * see jfs_filsys.h + */ + __le32 s_compress; /* 4: > 0 if data compression */ + + pxd_t s_ait2; /* 8: first extent of secondary + * aggregate inode table + */ + + pxd_t s_aim2; /* 8: first extent of secondary + * aggregate inode map + */ + __le32 s_logdev; /* 4: device address of log */ + __le32 s_logserial; /* 4: log serial number at aggregate mount */ + pxd_t s_logpxd; /* 8: inline log extent */ + + pxd_t s_fsckpxd; /* 8: inline fsck work space extent */ + + struct timestruc_t s_time; /* 8: time last updated */ + + __le32 s_fsckloglen; /* 4: Number of filesystem blocks reserved for + * the fsck service log. + * N.B. These blocks are divided among the + * versions kept. This is not a per + * version size. + * N.B. These blocks are included in the + * length field of s_fsckpxd. + */ + s8 s_fscklog; /* 1: which fsck service log is most recent + * 0 => no service log data yet + * 1 => the first one + * 2 => the 2nd one + */ + char s_fpack[11]; /* 11: file system volume name + * N.B. This must be 11 bytes to + * conform with the OS/2 BootSector + * requirements + * Only used when s_version is 1 + */ + + /* extendfs() parameter under s_state & FM_EXTENDFS */ + __le64 s_xsize; /* 8: extendfs s_size */ + pxd_t s_xfsckpxd; /* 8: extendfs fsckpxd */ + pxd_t s_xlogpxd; /* 8: extendfs logpxd */ + /* - 128 byte boundary - */ + + char s_uuid[16]; /* 16: 128-bit uuid for volume */ + char s_label[16]; /* 16: volume label */ + char s_loguuid[16]; /* 16: 128-bit uuid for log device */ + +}; + +extern int readSuper(struct super_block *, struct buffer_head **); +extern int updateSuper(struct super_block *, uint); +__printf(2, 3) +extern void jfs_error(struct super_block *, const char *, ...); +extern int jfs_mount(struct super_block *); +extern int jfs_mount_rw(struct super_block *, int); +extern int jfs_umount(struct super_block *); +extern int jfs_umount_rw(struct super_block *); +extern int jfs_extendfs(struct super_block *, s64, int); + +extern struct task_struct *jfsIOthread; +extern struct task_struct *jfsSyncThread; + +#endif /*_H_JFS_SUPERBLOCK */ diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c new file mode 100644 index 000000000..78789c5ed --- /dev/null +++ b/fs/jfs/jfs_txnmgr.c @@ -0,0 +1,3062 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_txnmgr.c: transaction manager + * + * notes: + * transaction starts with txBegin() and ends with txCommit() + * or txAbort(). + * + * tlock is acquired at the time of update; + * (obviate scan at commit time for xtree and dtree) + * tlock and mp points to each other; + * (no hashlist for mp -> tlock). + * + * special cases: + * tlock on in-memory inode: + * in-place tlock in the in-memory inode itself; + * converted to page lock by iWrite() at commit time. + * + * tlock during write()/mmap() under anonymous transaction (tid = 0): + * transferred (?) to transaction at commit time. + * + * use the page itself to update allocation maps + * (obviate intermediate replication of allocation/deallocation data) + * hold on to mp+lock thru update of maps + */ + +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/completion.h> +#include <linux/freezer.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/kthread.h> +#include <linux/seq_file.h> +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dinode.h" +#include "jfs_imap.h" +#include "jfs_dmap.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * transaction management structures + */ +static struct { + int freetid; /* index of a free tid structure */ + int freelock; /* index first free lock word */ + wait_queue_head_t freewait; /* eventlist of free tblock */ + wait_queue_head_t freelockwait; /* eventlist of free tlock */ + wait_queue_head_t lowlockwait; /* eventlist of ample tlocks */ + int tlocksInUse; /* Number of tlocks in use */ + spinlock_t LazyLock; /* synchronize sync_queue & unlock_queue */ +/* struct tblock *sync_queue; * Transactions waiting for data sync */ + struct list_head unlock_queue; /* Txns waiting to be released */ + struct list_head anon_list; /* inodes having anonymous txns */ + struct list_head anon_list2; /* inodes having anonymous txns + that couldn't be sync'ed */ +} TxAnchor; + +int jfs_tlocks_low; /* Indicates low number of available tlocks */ + +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint txBegin; + uint txBegin_barrier; + uint txBegin_lockslow; + uint txBegin_freetid; + uint txBeginAnon; + uint txBeginAnon_barrier; + uint txBeginAnon_lockslow; + uint txLockAlloc; + uint txLockAlloc_freelock; +} TxStat; +#endif + +static int nTxBlock = -1; /* number of transaction blocks */ +module_param(nTxBlock, int, 0); +MODULE_PARM_DESC(nTxBlock, + "Number of transaction blocks (max:65536)"); + +static int nTxLock = -1; /* number of transaction locks */ +module_param(nTxLock, int, 0); +MODULE_PARM_DESC(nTxLock, + "Number of transaction locks (max:65536)"); + +struct tblock *TxBlock; /* transaction block table */ +static int TxLockLWM; /* Low water mark for number of txLocks used */ +static int TxLockHWM; /* High water mark for number of txLocks used */ +static int TxLockVHWM; /* Very High water mark */ +struct tlock *TxLock; /* transaction lock table */ + +/* + * transaction management lock + */ +static DEFINE_SPINLOCK(jfsTxnLock); + +#define TXN_LOCK() spin_lock(&jfsTxnLock) +#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) + +#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); +#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) +#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags) + +static DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait); +static int jfs_commit_thread_waking; + +/* + * Retry logic exist outside these macros to protect from spurrious wakeups. + */ +static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(event, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + TXN_UNLOCK(); + io_schedule(); + remove_wait_queue(event, &wait); +} + +#define TXN_SLEEP(event)\ +{\ + TXN_SLEEP_DROP_LOCK(event);\ + TXN_LOCK();\ +} + +#define TXN_WAKEUP(event) wake_up_all(event) + +/* + * statistics + */ +static struct { + tid_t maxtid; /* 4: biggest tid ever used */ + lid_t maxlid; /* 4: biggest lid ever used */ + int ntid; /* 4: # of transactions performed */ + int nlid; /* 4: # of tlocks acquired */ + int waitlock; /* 4: # of tlock wait */ +} stattx; + +/* + * forward references + */ +static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck, struct commit * cd); +static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void txAllocPMap(struct inode *ip, struct maplock * maplock, + struct tblock * tblk); +static void txForce(struct tblock * tblk); +static int txLog(struct jfs_log * log, struct tblock * tblk, + struct commit * cd); +static void txUpdateMap(struct tblock * tblk); +static void txRelease(struct tblock * tblk); +static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck); +static void LogSyncRelease(struct metapage * mp); + +/* + * transaction block/lock management + * --------------------------------- + */ + +/* + * Get a transaction lock from the free list. If the number in use is + * greater than the high water mark, wake up the sync daemon. This should + * free some anonymous transaction locks. (TXN_LOCK must be held.) + */ +static lid_t txLockAlloc(void) +{ + lid_t lid; + + INCREMENT(TxStat.txLockAlloc); + if (!TxAnchor.freelock) { + INCREMENT(TxStat.txLockAlloc_freelock); + } + + while (!(lid = TxAnchor.freelock)) + TXN_SLEEP(&TxAnchor.freelockwait); + TxAnchor.freelock = TxLock[lid].next; + HIGHWATERMARK(stattx.maxlid, lid); + if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) { + jfs_info("txLockAlloc tlocks low"); + jfs_tlocks_low = 1; + wake_up_process(jfsSyncThread); + } + + return lid; +} + +static void txLockFree(lid_t lid) +{ + TxLock[lid].tid = 0; + TxLock[lid].next = TxAnchor.freelock; + TxAnchor.freelock = lid; + TxAnchor.tlocksInUse--; + if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) { + jfs_info("txLockFree jfs_tlocks_low no more"); + jfs_tlocks_low = 0; + TXN_WAKEUP(&TxAnchor.lowlockwait); + } + TXN_WAKEUP(&TxAnchor.freelockwait); +} + +/* + * NAME: txInit() + * + * FUNCTION: initialize transaction management structures + * + * RETURN: + * + * serialization: single thread at jfs_init() + */ +int txInit(void) +{ + int k, size; + struct sysinfo si; + + /* Set defaults for nTxLock and nTxBlock if unset */ + + if (nTxLock == -1) { + if (nTxBlock == -1) { + /* Base default on memory size */ + si_meminfo(&si); + if (si.totalram > (256 * 1024)) /* 1 GB */ + nTxLock = 64 * 1024; + else + nTxLock = si.totalram >> 2; + } else if (nTxBlock > (8 * 1024)) + nTxLock = 64 * 1024; + else + nTxLock = nTxBlock << 3; + } + if (nTxBlock == -1) + nTxBlock = nTxLock >> 3; + + /* Verify tunable parameters */ + if (nTxBlock < 16) + nTxBlock = 16; /* No one should set it this low */ + if (nTxBlock > 65536) + nTxBlock = 65536; + if (nTxLock < 256) + nTxLock = 256; /* No one should set it this low */ + if (nTxLock > 65536) + nTxLock = 65536; + + printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n", + nTxBlock, nTxLock); + /* + * initialize transaction block (tblock) table + * + * transaction id (tid) = tblock index + * tid = 0 is reserved. + */ + TxLockLWM = (nTxLock * 4) / 10; + TxLockHWM = (nTxLock * 7) / 10; + TxLockVHWM = (nTxLock * 8) / 10; + + size = sizeof(struct tblock) * nTxBlock; + TxBlock = vmalloc(size); + if (TxBlock == NULL) + return -ENOMEM; + + for (k = 1; k < nTxBlock - 1; k++) { + TxBlock[k].next = k + 1; + init_waitqueue_head(&TxBlock[k].gcwait); + init_waitqueue_head(&TxBlock[k].waitor); + } + TxBlock[k].next = 0; + init_waitqueue_head(&TxBlock[k].gcwait); + init_waitqueue_head(&TxBlock[k].waitor); + + TxAnchor.freetid = 1; + init_waitqueue_head(&TxAnchor.freewait); + + stattx.maxtid = 1; /* statistics */ + + /* + * initialize transaction lock (tlock) table + * + * transaction lock id = tlock index + * tlock id = 0 is reserved. + */ + size = sizeof(struct tlock) * nTxLock; + TxLock = vmalloc(size); + if (TxLock == NULL) { + vfree(TxBlock); + return -ENOMEM; + } + + /* initialize tlock table */ + for (k = 1; k < nTxLock - 1; k++) + TxLock[k].next = k + 1; + TxLock[k].next = 0; + init_waitqueue_head(&TxAnchor.freelockwait); + init_waitqueue_head(&TxAnchor.lowlockwait); + + TxAnchor.freelock = 1; + TxAnchor.tlocksInUse = 0; + INIT_LIST_HEAD(&TxAnchor.anon_list); + INIT_LIST_HEAD(&TxAnchor.anon_list2); + + LAZY_LOCK_INIT(); + INIT_LIST_HEAD(&TxAnchor.unlock_queue); + + stattx.maxlid = 1; /* statistics */ + + return 0; +} + +/* + * NAME: txExit() + * + * FUNCTION: clean up when module is unloaded + */ +void txExit(void) +{ + vfree(TxLock); + TxLock = NULL; + vfree(TxBlock); + TxBlock = NULL; +} + +/* + * NAME: txBegin() + * + * FUNCTION: start a transaction. + * + * PARAMETER: sb - superblock + * flag - force for nested tx; + * + * RETURN: tid - transaction id + * + * note: flag force allows to start tx for nested tx + * to prevent deadlock on logsync barrier; + */ +tid_t txBegin(struct super_block *sb, int flag) +{ + tid_t t; + struct tblock *tblk; + struct jfs_log *log; + + jfs_info("txBegin: flag = 0x%x", flag); + log = JFS_SBI(sb)->log; + + TXN_LOCK(); + + INCREMENT(TxStat.txBegin); + + retry: + if (!(flag & COMMIT_FORCE)) { + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag) || + test_bit(log_QUIESCE, &log->flag)) { + INCREMENT(TxStat.txBegin_barrier); + TXN_SLEEP(&log->syncwait); + goto retry; + } + } + if (flag == 0) { + /* + * Don't begin transaction if we're getting starved for tlocks + * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately + * free tlocks) + */ + if (TxAnchor.tlocksInUse > TxLockVHWM) { + INCREMENT(TxStat.txBegin_lockslow); + TXN_SLEEP(&TxAnchor.lowlockwait); + goto retry; + } + } + + /* + * allocate transaction id/block + */ + if ((t = TxAnchor.freetid) == 0) { + jfs_info("txBegin: waiting for free tid"); + INCREMENT(TxStat.txBegin_freetid); + TXN_SLEEP(&TxAnchor.freewait); + goto retry; + } + + tblk = tid_to_tblock(t); + + if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) { + /* Don't let a non-forced transaction take the last tblk */ + jfs_info("txBegin: waiting for free tid"); + INCREMENT(TxStat.txBegin_freetid); + TXN_SLEEP(&TxAnchor.freewait); + goto retry; + } + + TxAnchor.freetid = tblk->next; + + /* + * initialize transaction + */ + + /* + * We can't zero the whole thing or we screw up another thread being + * awakened after sleeping on tblk->waitor + * + * memset(tblk, 0, sizeof(struct tblock)); + */ + tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0; + + tblk->sb = sb; + ++log->logtid; + tblk->logtid = log->logtid; + + ++log->active; + + HIGHWATERMARK(stattx.maxtid, t); /* statistics */ + INCREMENT(stattx.ntid); /* statistics */ + + TXN_UNLOCK(); + + jfs_info("txBegin: returning tid = %d", t); + + return t; +} + +/* + * NAME: txBeginAnon() + * + * FUNCTION: start an anonymous transaction. + * Blocks if logsync or available tlocks are low to prevent + * anonymous tlocks from depleting supply. + * + * PARAMETER: sb - superblock + * + * RETURN: none + */ +void txBeginAnon(struct super_block *sb) +{ + struct jfs_log *log; + + log = JFS_SBI(sb)->log; + + TXN_LOCK(); + INCREMENT(TxStat.txBeginAnon); + + retry: + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag) || + test_bit(log_QUIESCE, &log->flag)) { + INCREMENT(TxStat.txBeginAnon_barrier); + TXN_SLEEP(&log->syncwait); + goto retry; + } + + /* + * Don't begin transaction if we're getting starved for tlocks + */ + if (TxAnchor.tlocksInUse > TxLockVHWM) { + INCREMENT(TxStat.txBeginAnon_lockslow); + TXN_SLEEP(&TxAnchor.lowlockwait); + goto retry; + } + TXN_UNLOCK(); +} + +/* + * txEnd() + * + * function: free specified transaction block. + * + * logsync barrier processing: + * + * serialization: + */ +void txEnd(tid_t tid) +{ + struct tblock *tblk = tid_to_tblock(tid); + struct jfs_log *log; + + jfs_info("txEnd: tid = %d", tid); + TXN_LOCK(); + + /* + * wakeup transactions waiting on the page locked + * by the current transaction + */ + TXN_WAKEUP(&tblk->waitor); + + log = JFS_SBI(tblk->sb)->log; + + /* + * Lazy commit thread can't free this guy until we mark it UNLOCKED, + * otherwise, we would be left with a transaction that may have been + * reused. + * + * Lazy commit thread will turn off tblkGC_LAZY before calling this + * routine. + */ + if (tblk->flag & tblkGC_LAZY) { + jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk); + TXN_UNLOCK(); + + spin_lock_irq(&log->gclock); // LOGGC_LOCK + tblk->flag |= tblkGC_UNLOCKED; + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + return; + } + + jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk); + + assert(tblk->next == 0); + + /* + * insert tblock back on freelist + */ + tblk->next = TxAnchor.freetid; + TxAnchor.freetid = tid; + + /* + * mark the tblock not active + */ + if (--log->active == 0) { + clear_bit(log_FLUSH, &log->flag); + + /* + * synchronize with logsync barrier + */ + if (test_bit(log_SYNCBARRIER, &log->flag)) { + TXN_UNLOCK(); + + /* write dirty metadata & forward log syncpt */ + jfs_syncpt(log, 1); + + jfs_info("log barrier off: 0x%x", log->lsn); + + /* enable new transactions start */ + clear_bit(log_SYNCBARRIER, &log->flag); + + /* wakeup all waitors for logsync barrier */ + TXN_WAKEUP(&log->syncwait); + + goto wakeup; + } + } + + TXN_UNLOCK(); +wakeup: + /* + * wakeup all waitors for a free tblock + */ + TXN_WAKEUP(&TxAnchor.freewait); +} + +/* + * txLock() + * + * function: acquire a transaction lock on the specified <mp> + * + * parameter: + * + * return: transaction lock id + * + * serialization: + */ +struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp, + int type) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int dir_xtree = 0; + lid_t lid; + tid_t xtid; + struct tlock *tlck; + struct xtlock *xtlck; + struct linelock *linelock; + xtpage_t *p; + struct tblock *tblk; + + TXN_LOCK(); + + if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) && + !(mp->xflag & COMMIT_PAGE)) { + /* + * Directory inode is special. It can have both an xtree tlock + * and a dtree tlock associated with it. + */ + dir_xtree = 1; + lid = jfs_ip->xtlid; + } else + lid = mp->lid; + + /* is page not locked by a transaction ? */ + if (lid == 0) + goto allocateLock; + + jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid); + + /* is page locked by the requester transaction ? */ + tlck = lid_to_tlock(lid); + if ((xtid = tlck->tid) == tid) { + TXN_UNLOCK(); + goto grantLock; + } + + /* + * is page locked by anonymous transaction/lock ? + * + * (page update without transaction (i.e., file write) is + * locked under anonymous transaction tid = 0: + * anonymous tlocks maintained on anonymous tlock list of + * the inode of the page and available to all anonymous + * transactions until txCommit() time at which point + * they are transferred to the transaction tlock list of + * the committing transaction of the inode) + */ + if (xtid == 0) { + tlck->tid = tid; + TXN_UNLOCK(); + tblk = tid_to_tblock(tid); + /* + * The order of the tlocks in the transaction is important + * (during truncate, child xtree pages must be freed before + * parent's tlocks change the working map). + * Take tlock off anonymous list and add to tail of + * transaction list + * + * Note: We really need to get rid of the tid & lid and + * use list_head's. This code is getting UGLY! + */ + if (jfs_ip->atlhead == lid) { + if (jfs_ip->atltail == lid) { + /* only anonymous txn. + * Remove from anon_list + */ + TXN_LOCK(); + list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); + } + jfs_ip->atlhead = tlck->next; + } else { + lid_t last; + for (last = jfs_ip->atlhead; + lid_to_tlock(last)->next != lid; + last = lid_to_tlock(last)->next) { + assert(last); + } + lid_to_tlock(last)->next = tlck->next; + if (jfs_ip->atltail == lid) + jfs_ip->atltail = last; + } + + /* insert the tlock at tail of transaction tlock list */ + + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + + goto grantLock; + } + + goto waitLock; + + /* + * allocate a tlock + */ + allocateLock: + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + /* + * initialize tlock + */ + tlck->tid = tid; + + TXN_UNLOCK(); + + /* mark tlock for meta-data page */ + if (mp->xflag & COMMIT_PAGE) { + + tlck->flag = tlckPAGELOCK; + + /* mark the page dirty and nohomeok */ + metapage_nohomeok(mp); + + jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p", + mp, mp->nohomeok, tid, tlck); + + /* if anonymous transaction, and buffer is on the group + * commit synclist, mark inode to show this. This will + * prevent the buffer from being marked nohomeok for too + * long a time. + */ + if ((tid == 0) && mp->lsn) + set_cflag(COMMIT_Synclist, ip); + } + /* mark tlock for in-memory inode */ + else + tlck->flag = tlckINODELOCK; + + if (S_ISDIR(ip->i_mode)) + tlck->flag |= tlckDIRECTORY; + + tlck->type = 0; + + /* bind the tlock and the page */ + tlck->ip = ip; + tlck->mp = mp; + if (dir_xtree) + jfs_ip->xtlid = lid; + else + mp->lid = lid; + + /* + * enqueue transaction lock to transaction/inode + */ + /* insert the tlock at tail of transaction tlock list */ + if (tid) { + tblk = tid_to_tblock(tid); + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + } + /* anonymous transaction: + * insert the tlock at head of inode anonymous tlock list + */ + else { + tlck->next = jfs_ip->atlhead; + jfs_ip->atlhead = lid; + if (tlck->next == 0) { + /* This inode's first anonymous transaction */ + jfs_ip->atltail = lid; + TXN_LOCK(); + list_add_tail(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list); + TXN_UNLOCK(); + } + } + + /* initialize type dependent area for linelock */ + linelock = (struct linelock *) & tlck->lock; + linelock->next = 0; + linelock->flag = tlckLINELOCK; + linelock->maxcnt = TLOCKSHORT; + linelock->index = 0; + + switch (type & tlckTYPE) { + case tlckDTREE: + linelock->l2linesize = L2DTSLOTSIZE; + break; + + case tlckXTREE: + linelock->l2linesize = L2XTSLOTSIZE; + + xtlck = (struct xtlock *) linelock; + xtlck->header.offset = 0; + xtlck->header.length = 2; + + if (type & tlckNEW) { + xtlck->lwm.offset = XTENTRYSTART; + } else { + if (mp->xflag & COMMIT_PAGE) + p = (xtpage_t *) mp->data; + else + p = &jfs_ip->i_xtroot; + xtlck->lwm.offset = + le16_to_cpu(p->header.nextindex); + } + xtlck->lwm.length = 0; /* ! */ + xtlck->twm.offset = 0; + xtlck->hwm.offset = 0; + + xtlck->index = 2; + break; + + case tlckINODE: + linelock->l2linesize = L2INODESLOTSIZE; + break; + + case tlckDATA: + linelock->l2linesize = L2DATASLOTSIZE; + break; + + default: + jfs_err("UFO tlock:0x%p", tlck); + } + + /* + * update tlock vector + */ + grantLock: + tlck->type |= type; + + return tlck; + + /* + * page is being locked by another transaction: + */ + waitLock: + /* Only locks on ipimap or ipaimap should reach here */ + /* assert(jfs_ip->fileset == AGGREGATE_I); */ + if (jfs_ip->fileset != AGGREGATE_I) { + printk(KERN_ERR "txLock: trying to lock locked page!"); + print_hex_dump(KERN_ERR, "ip: ", DUMP_PREFIX_ADDRESS, 16, 4, + ip, sizeof(*ip), 0); + print_hex_dump(KERN_ERR, "mp: ", DUMP_PREFIX_ADDRESS, 16, 4, + mp, sizeof(*mp), 0); + print_hex_dump(KERN_ERR, "Locker's tblock: ", + DUMP_PREFIX_ADDRESS, 16, 4, tid_to_tblock(tid), + sizeof(struct tblock), 0); + print_hex_dump(KERN_ERR, "Tlock: ", DUMP_PREFIX_ADDRESS, 16, 4, + tlck, sizeof(*tlck), 0); + BUG(); + } + INCREMENT(stattx.waitlock); /* statistics */ + TXN_UNLOCK(); + release_metapage(mp); + TXN_LOCK(); + xtid = tlck->tid; /* reacquire after dropping TXN_LOCK */ + + jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d", + tid, xtid, lid); + + /* Recheck everything since dropping TXN_LOCK */ + if (xtid && (tlck->mp == mp) && (mp->lid == lid)) + TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor); + else + TXN_UNLOCK(); + jfs_info("txLock: awakened tid = %d, lid = %d", tid, lid); + + return NULL; +} + +/* + * NAME: txRelease() + * + * FUNCTION: Release buffers associated with transaction locks, but don't + * mark homeok yet. The allows other transactions to modify + * buffers, but won't let them go to disk until commit record + * actually gets written. + * + * PARAMETER: + * tblk - + * + * RETURN: Errors from subroutines. + */ +static void txRelease(struct tblock * tblk) +{ + struct metapage *mp; + lid_t lid; + struct tlock *tlck; + + TXN_LOCK(); + + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + mp->lid = 0; + } + } + + /* + * wakeup transactions waiting on a page locked + * by the current transaction + */ + TXN_WAKEUP(&tblk->waitor); + + TXN_UNLOCK(); +} + +/* + * NAME: txUnlock() + * + * FUNCTION: Initiates pageout of pages modified by tid in journalled + * objects and frees their lockwords. + */ +static void txUnlock(struct tblock * tblk) +{ + struct tlock *tlck; + struct linelock *linelock; + lid_t lid, next, llid, k; + struct metapage *mp; + struct jfs_log *log; + int difft, diffp; + unsigned long flags; + + jfs_info("txUnlock: tblk = 0x%p", tblk); + log = JFS_SBI(tblk->sb)->log; + + /* + * mark page under tlock homeok (its log has been written): + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + + jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck); + + /* unbind page from tlock */ + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + + /* hold buffer + */ + hold_metapage(mp); + + assert(mp->nohomeok > 0); + _metapage_homeok(mp); + + /* inherit younger/larger clsn */ + LOGSYNC_LOCK(log, flags); + if (mp->clsn) { + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log, flags); + + assert(!(tlck->flag & tlckFREEPAGE)); + + put_metapage(mp); + } + + /* insert tlock, and linelock(s) of the tlock if any, + * at head of freelist + */ + TXN_LOCK(); + + llid = ((struct linelock *) & tlck->lock)->next; + while (llid) { + linelock = (struct linelock *) lid_to_tlock(llid); + k = linelock->next; + txLockFree(llid); + llid = k; + } + txLockFree(lid); + + TXN_UNLOCK(); + } + tblk->next = tblk->last = 0; + + /* + * remove tblock from logsynclist + * (allocation map pages inherited lsn of tblk and + * has been inserted in logsync list at txUpdateMap()) + */ + if (tblk->lsn) { + LOGSYNC_LOCK(log, flags); + log->count--; + list_del(&tblk->synclist); + LOGSYNC_UNLOCK(log, flags); + } +} + +/* + * txMaplock() + * + * function: allocate a transaction lock for freed page/entry; + * for freed page, maplock is used as xtlock/dtlock type; + */ +struct tlock *txMaplock(tid_t tid, struct inode *ip, int type) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + lid_t lid; + struct tblock *tblk; + struct tlock *tlck; + struct maplock *maplock; + + TXN_LOCK(); + + /* + * allocate a tlock + */ + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + /* + * initialize tlock + */ + tlck->tid = tid; + + /* bind the tlock and the object */ + tlck->flag = tlckINODELOCK; + if (S_ISDIR(ip->i_mode)) + tlck->flag |= tlckDIRECTORY; + tlck->ip = ip; + tlck->mp = NULL; + + tlck->type = type; + + /* + * enqueue transaction lock to transaction/inode + */ + /* insert the tlock at tail of transaction tlock list */ + if (tid) { + tblk = tid_to_tblock(tid); + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + } + /* anonymous transaction: + * insert the tlock at head of inode anonymous tlock list + */ + else { + tlck->next = jfs_ip->atlhead; + jfs_ip->atlhead = lid; + if (tlck->next == 0) { + /* This inode's first anonymous transaction */ + jfs_ip->atltail = lid; + list_add_tail(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list); + } + } + + TXN_UNLOCK(); + + /* initialize type dependent area for maplock */ + maplock = (struct maplock *) & tlck->lock; + maplock->next = 0; + maplock->maxcnt = 0; + maplock->index = 0; + + return tlck; +} + +/* + * txLinelock() + * + * function: allocate a transaction lock for log vector list + */ +struct linelock *txLinelock(struct linelock * tlock) +{ + lid_t lid; + struct tlock *tlck; + struct linelock *linelock; + + TXN_LOCK(); + + /* allocate a TxLock structure */ + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + TXN_UNLOCK(); + + /* initialize linelock */ + linelock = (struct linelock *) tlck; + linelock->next = 0; + linelock->flag = tlckLINELOCK; + linelock->maxcnt = TLOCKLONG; + linelock->index = 0; + if (tlck->flag & tlckDIRECTORY) + linelock->flag |= tlckDIRECTORY; + + /* append linelock after tlock */ + linelock->next = tlock->next; + tlock->next = lid; + + return linelock; +} + +/* + * transaction commit management + * ----------------------------- + */ + +/* + * NAME: txCommit() + * + * FUNCTION: commit the changes to the objects specified in + * clist. For journalled segments only the + * changes of the caller are committed, ie by tid. + * for non-journalled segments the data are flushed to + * disk and then the change to the disk inode and indirect + * blocks committed (so blocks newly allocated to the + * segment will be made a part of the segment atomically). + * + * all of the segments specified in clist must be in + * one file system. no more than 6 segments are needed + * to handle all unix svcs. + * + * if the i_nlink field (i.e. disk inode link count) + * is zero, and the type of inode is a regular file or + * directory, or symbolic link , the inode is truncated + * to zero length. the truncation is committed but the + * VM resources are unaffected until it is closed (see + * iput and iclose). + * + * PARAMETER: + * + * RETURN: + * + * serialization: + * on entry the inode lock on each segment is assumed + * to be held. + * + * i/o error: + */ +int txCommit(tid_t tid, /* transaction identifier */ + int nip, /* number of inodes to commit */ + struct inode **iplist, /* list of inode to commit */ + int flag) +{ + int rc = 0; + struct commit cd; + struct jfs_log *log; + struct tblock *tblk; + struct lrd *lrd; + struct inode *ip; + struct jfs_inode_info *jfs_ip; + int k, n; + ino_t top; + struct super_block *sb; + + jfs_info("txCommit, tid = %d, flag = %d", tid, flag); + /* is read-only file system ? */ + if (isReadOnly(iplist[0])) { + rc = -EROFS; + goto TheEnd; + } + + sb = cd.sb = iplist[0]->i_sb; + cd.tid = tid; + + if (tid == 0) + tid = txBegin(sb, 0); + tblk = tid_to_tblock(tid); + + /* + * initialize commit structure + */ + log = JFS_SBI(sb)->log; + cd.log = log; + + /* initialize log record descriptor in commit */ + lrd = &cd.lrd; + lrd->logtid = cpu_to_le32(tblk->logtid); + lrd->backchain = 0; + + tblk->xflag |= flag; + + if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) + tblk->xflag |= COMMIT_LAZY; + /* + * prepare non-journaled objects for commit + * + * flush data pages of non-journaled file + * to prevent the file getting non-initialized disk blocks + * in case of crash. + * (new blocks - ) + */ + cd.iplist = iplist; + cd.nip = nip; + + /* + * acquire transaction lock on (on-disk) inodes + * + * update on-disk inode from in-memory inode + * acquiring transaction locks for AFTER records + * on the on-disk inode of file object + * + * sort the inodes array by inode number in descending order + * to prevent deadlock when acquiring transaction lock + * of on-disk inodes on multiple on-disk inode pages by + * multiple concurrent transactions + */ + for (k = 0; k < cd.nip; k++) { + top = (cd.iplist[k])->i_ino; + for (n = k + 1; n < cd.nip; n++) { + ip = cd.iplist[n]; + if (ip->i_ino > top) { + top = ip->i_ino; + cd.iplist[n] = cd.iplist[k]; + cd.iplist[k] = ip; + } + } + + ip = cd.iplist[k]; + jfs_ip = JFS_IP(ip); + + /* + * BUGBUG - This code has temporarily been removed. The + * intent is to ensure that any file data is written before + * the metadata is committed to the journal. This prevents + * uninitialized data from appearing in a file after the + * journal has been replayed. (The uninitialized data + * could be sensitive data removed by another user.) + * + * The problem now is that we are holding the IWRITELOCK + * on the inode, and calling filemap_fdatawrite on an + * unmapped page will cause a deadlock in jfs_get_block. + * + * The long term solution is to pare down the use of + * IWRITELOCK. We are currently holding it too long. + * We could also be smarter about which data pages need + * to be written before the transaction is committed and + * when we don't need to worry about it at all. + * + * if ((!S_ISDIR(ip->i_mode)) + * && (tblk->flag & COMMIT_DELETE) == 0) + * filemap_write_and_wait(ip->i_mapping); + */ + + /* + * Mark inode as not dirty. It will still be on the dirty + * inode list, but we'll know not to commit it again unless + * it gets marked dirty again + */ + clear_cflag(COMMIT_Dirty, ip); + + /* inherit anonymous tlock(s) of inode */ + if (jfs_ip->atlhead) { + lid_to_tlock(jfs_ip->atltail)->next = tblk->next; + tblk->next = jfs_ip->atlhead; + if (!tblk->last) + tblk->last = jfs_ip->atltail; + jfs_ip->atlhead = jfs_ip->atltail = 0; + TXN_LOCK(); + list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); + } + + /* + * acquire transaction lock on on-disk inode page + * (become first tlock of the tblk's tlock list) + */ + if (((rc = diWrite(tid, ip)))) + goto out; + } + + /* + * write log records from transaction locks + * + * txUpdateMap() resets XAD_NEW in XAD. + */ + if ((rc = txLog(log, tblk, &cd))) + goto TheEnd; + + /* + * Ensure that inode isn't reused before + * lazy commit thread finishes processing + */ + if (tblk->xflag & COMMIT_DELETE) { + ihold(tblk->u.ip); + /* + * Avoid a rare deadlock + * + * If the inode is locked, we may be blocked in + * jfs_commit_inode. If so, we don't want the + * lazy_commit thread doing the last iput() on the inode + * since that may block on the locked inode. Instead, + * commit the transaction synchronously, so the last iput + * will be done by the calling thread (or later) + */ + /* + * I believe this code is no longer needed. Splitting I_LOCK + * into two bits, I_NEW and I_SYNC should prevent this + * deadlock as well. But since I don't have a JFS testload + * to verify this, only a trivial s/I_LOCK/I_SYNC/ was done. + * Joern + */ + if (tblk->u.ip->i_state & I_SYNC) + tblk->xflag &= ~COMMIT_LAZY; + } + + ASSERT((!(tblk->xflag & COMMIT_DELETE)) || + ((tblk->u.ip->i_nlink == 0) && + !test_cflag(COMMIT_Nolink, tblk->u.ip))); + + /* + * write COMMIT log record + */ + lrd->type = cpu_to_le16(LOG_COMMIT); + lrd->length = 0; + lmLog(log, tblk, lrd, NULL); + + lmGroupCommit(log, tblk); + + /* + * - transaction is now committed - + */ + + /* + * force pages in careful update + * (imap addressing structure update) + */ + if (flag & COMMIT_FORCE) + txForce(tblk); + + /* + * update allocation map. + * + * update inode allocation map and inode: + * free pager lock on memory object of inode if any. + * update block allocation map. + * + * txUpdateMap() resets XAD_NEW in XAD. + */ + if (tblk->xflag & COMMIT_FORCE) + txUpdateMap(tblk); + + /* + * free transaction locks and pageout/free pages + */ + txRelease(tblk); + + if ((tblk->flag & tblkGC_LAZY) == 0) + txUnlock(tblk); + + + /* + * reset in-memory object state + */ + for (k = 0; k < cd.nip; k++) { + ip = cd.iplist[k]; + jfs_ip = JFS_IP(ip); + + /* + * reset in-memory inode state + */ + jfs_ip->bxflag = 0; + jfs_ip->blid = 0; + } + + out: + if (rc != 0) + txAbort(tid, 1); + + TheEnd: + jfs_info("txCommit: tid = %d, returning %d", tid, rc); + return rc; +} + +/* + * NAME: txLog() + * + * FUNCTION: Writes AFTER log records for all lines modified + * by tid for segments specified by inodes in comdata. + * Code assumes only WRITELOCKS are recorded in lockwords. + * + * PARAMETERS: + * + * RETURN : + */ +static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd) +{ + int rc = 0; + struct inode *ip; + lid_t lid; + struct tlock *tlck; + struct lrd *lrd = &cd->lrd; + + /* + * write log record(s) for each tlock of transaction, + */ + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + + tlck->flag |= tlckLOG; + + /* initialize lrd common */ + ip = tlck->ip; + lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate); + lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset); + lrd->log.redopage.inode = cpu_to_le32(ip->i_ino); + + /* write log record of page from the tlock */ + switch (tlck->type & tlckTYPE) { + case tlckXTREE: + xtLog(log, tblk, lrd, tlck); + break; + + case tlckDTREE: + dtLog(log, tblk, lrd, tlck); + break; + + case tlckINODE: + diLog(log, tblk, lrd, tlck, cd); + break; + + case tlckMAP: + mapLog(log, tblk, lrd, tlck); + break; + + case tlckDATA: + dataLog(log, tblk, lrd, tlck); + break; + + default: + jfs_err("UFO tlock:0x%p", tlck); + } + } + + return rc; +} + +/* + * diLog() + * + * function: log inode tlock and format maplock to update bmap; + */ +static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck, struct commit * cd) +{ + int rc = 0; + struct metapage *mp; + pxd_t *pxd; + struct pxd_lock *pxdlock; + + mp = tlck->mp; + + /* initialize as REDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_INODE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + /* + * inode after image + */ + if (tlck->type & tlckENTRY) { + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } else if (tlck->type & tlckFREE) { + /* + * free inode extent + * + * (pages of the freed inode extent have been invalidated and + * a maplock for free of the extent has been formatted at + * txLock() time); + * + * the tlock had been acquired on the inode allocation map page + * (iag) that specifies the freed extent, even though the map + * page is not itself logged, to prevent pageout of the map + * page before the log; + */ + + /* log LOG_NOREDOINOEXT of the freed inode extent for + * logredo() to start NoRedoPage filters, and to update + * imap and bmap for free of the extent; + */ + lrd->type = cpu_to_le16(LOG_NOREDOINOEXT); + /* + * For the LOG_NOREDOINOEXT record, we need + * to pass the IAG number and inode extent + * index (within that IAG) from which the + * the extent being released. These have been + * passed to us in the iplist[1] and iplist[2]. + */ + lrd->log.noredoinoext.iagnum = + cpu_to_le32((u32) (size_t) cd->iplist[1]); + lrd->log.noredoinoext.inoext_idx = + cpu_to_le32((u32) (size_t) cd->iplist[2]); + + pxdlock = (struct pxd_lock *) & tlck->lock; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } else + jfs_err("diLog: UFO type tlck:0x%p", tlck); +#ifdef _JFS_WIP + /* + * alloc/free external EA extent + * + * a maplock for txUpdateMap() to update bPWMAP for alloc/free + * of the extent has been formatted at txLock() time; + */ + else { + assert(tlck->type & tlckEA); + + /* log LOG_UPDATEMAP for logredo() to update bmap for + * alloc of new (and free of old) external EA extent; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + pxdlock = (struct pxd_lock *) & tlck->lock; + nlock = pxdlock->index; + for (i = 0; i < nlock; i++, pxdlock++) { + if (pxdlock->flag & mlckALLOCPXD) + lrd->log.updatemap.type = + cpu_to_le16(LOG_ALLOCPXD); + else + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + } + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + } +#endif /* _JFS_WIP */ + + return rc; +} + +/* + * dataLog() + * + * function: log data tlock + */ +static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct metapage *mp; + pxd_t *pxd; + + mp = tlck->mp; + + /* initialize as REDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_DATA); + lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + + if (jfs_dirtable_inline(tlck->ip)) { + /* + * The table has been truncated, we've must have deleted + * the last entry, so don't bother logging this + */ + mp->lid = 0; + grab_metapage(mp); + metapage_homeok(mp); + discard_metapage(mp); + tlck->mp = NULL; + return 0; + } + + PXDaddress(pxd, mp->index); + PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); + + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + + return 0; +} + +/* + * dtLog() + * + * function: log dtree tlock and format maplock to update bmap; + */ +static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct metapage *mp; + struct pxd_lock *pxdlock; + pxd_t *pxd; + + mp = tlck->mp; + + /* initialize as REDOPAGE/NOREDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_DTREE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + if (tlck->type & tlckBTROOT) + lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); + + /* + * page extension via relocation: entry insertion; + * page extension in-place: entry insertion; + * new right page from page split, reinitialized in-line + * root from root page split: entry insertion; + */ + if (tlck->type & (tlckNEW | tlckEXTEND)) { + /* log after-image of the new page for logredo(): + * mark log (LOG_NEW) for logredo() to initialize + * freelist and update bmap for alloc of the new page; + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + if (tlck->type & tlckEXTEND) + lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND); + else + lrd->log.redopage.type |= cpu_to_le16(LOG_NEW); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bPMAP for + * alloc of the new page; + */ + if (tlck->type & tlckBTROOT) + return; + tlck->flag |= tlckUPDATEMAP; + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckALLOCPXD; + pxdlock->pxd = *pxd; + + pxdlock->index = 1; + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + return; + } + + /* + * entry insertion/deletion, + * sibling page link update (old right page before split); + */ + if (tlck->type & (tlckENTRY | tlckRELINK)) { + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + return; + } + + /* + * page deletion: page has been invalidated + * page relocation: source extent + * + * a maplock for free of the page has been formatted + * at txLock() time); + */ + if (tlck->type & (tlckFREE | tlckRELOCATE)) { + /* log LOG_NOREDOPAGE of the deleted page for logredo() + * to start NoRedoPage filter and to update bmap for free + * of the deletd page + */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + pxdlock = (struct pxd_lock *) & tlck->lock; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time; + */ + tlck->flag |= tlckUPDATEMAP; + } + return; +} + +/* + * xtLog() + * + * function: log xtree tlock and format maplock to update bmap; + */ +static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct inode *ip; + struct metapage *mp; + xtpage_t *p; + struct xtlock *xtlck; + struct maplock *maplock; + struct xdlistlock *xadlock; + struct pxd_lock *pxdlock; + pxd_t *page_pxd; + int next, lwm, hwm; + + ip = tlck->ip; + mp = tlck->mp; + + /* initialize as REDOPAGE/NOREDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_XTREE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE); + + page_pxd = &lrd->log.redopage.pxd; + + if (tlck->type & tlckBTROOT) { + lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); + p = &JFS_IP(ip)->i_xtroot; + if (S_ISDIR(ip->i_mode)) + lrd->log.redopage.type |= + cpu_to_le16(LOG_DIR_XTREE); + } else + p = (xtpage_t *) mp->data; + next = le16_to_cpu(p->header.nextindex); + + xtlck = (struct xtlock *) & tlck->lock; + + maplock = (struct maplock *) & tlck->lock; + xadlock = (struct xdlistlock *) maplock; + + /* + * entry insertion/extension; + * sibling page link update (old right page before split); + */ + if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { + /* log after-image for logredo(): + * logredo() will update bmap for alloc of new/extended + * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from + * after-image of XADlist; + * logredo() resets (XAD_NEW|XAD_EXTEND) flag when + * applying the after-image to the meta-data page. + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bPMAP + * for alloc of new/extended extents of XAD[lwm:next) + * from the page itself; + * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. + */ + lwm = xtlck->lwm.offset; + if (lwm == 0) + lwm = XTPAGEMAXSLOT; + + if (lwm == next) + goto out; + if (lwm > next) { + jfs_err("xtLog: lwm > next"); + goto out; + } + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckALLOCXADLIST; + xadlock->count = next - lwm; + if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { + int i; + pxd_t *pxd; + /* + * Lazy commit may allow xtree to be modified before + * txUpdateMap runs. Copy xad into linelock to + * preserve correct data. + * + * We can fit twice as may pxd's as xads in the lock + */ + xadlock->flag = mlckALLOCPXDLIST; + pxd = xadlock->xdlist = &xtlck->pxdlock; + for (i = 0; i < xadlock->count; i++) { + PXDaddress(pxd, addressXAD(&p->xad[lwm + i])); + PXDlength(pxd, lengthXAD(&p->xad[lwm + i])); + p->xad[lwm + i].flag &= + ~(XAD_NEW | XAD_EXTENDED); + pxd++; + } + } else { + /* + * xdlist will point to into inode's xtree, ensure + * that transaction is not committed lazily. + */ + xadlock->flag = mlckALLOCXADLIST; + xadlock->xdlist = &p->xad[lwm]; + tblk->xflag &= ~COMMIT_LAZY; + } + jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d count:%d", + tlck->ip, mp, tlck, lwm, xadlock->count); + + maplock->index = 1; + + out: + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + + return; + } + + /* + * page deletion: file deletion/truncation (ref. xtTruncate()) + * + * (page will be invalidated after log is written and bmap + * is updated from the page); + */ + if (tlck->type & tlckFREE) { + /* LOG_NOREDOPAGE log for NoRedoPage filter: + * if page free from file delete, NoRedoFile filter from + * inode image of zero link count will subsume NoRedoPage + * filters for each page; + * if page free from file truncattion, write NoRedoPage + * filter; + * + * upadte of block allocation map for the page itself: + * if page free from deletion and truncation, LOG_UPDATEMAP + * log for the page itself is generated from processing + * its parent page xad entries; + */ + /* if page free from file truncation, log LOG_NOREDOPAGE + * of the deleted page for logredo() to start NoRedoPage + * filter for the page; + */ + if (tblk->xflag & COMMIT_TRUNCATE) { + /* write NOREDOPAGE for the page */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb-> + s_blocksize_bits); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + if (tlck->type & tlckBTROOT) { + /* Empty xtree must be logged */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + } + + /* init LOG_UPDATEMAP of the freed extents + * XAD[XTENTRYSTART:hwm) from the deleted page itself + * for logredo() to update bmap; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST); + xtlck = (struct xtlock *) & tlck->lock; + hwm = xtlck->hwm.offset; + lrd->log.updatemap.nxd = + cpu_to_le16(hwm - XTENTRYSTART + 1); + /* reformat linelock for lmLog() */ + xtlck->header.offset = XTENTRYSTART; + xtlck->header.length = hwm - XTENTRYSTART + 1; + xtlck->index = 1; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bmap + * to free extents of XAD[XTENTRYSTART:hwm) from the + * deleted page itself; + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->count = hwm - XTENTRYSTART + 1; + if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) { + int i; + pxd_t *pxd; + /* + * Lazy commit may allow xtree to be modified before + * txUpdateMap runs. Copy xad into linelock to + * preserve correct data. + * + * We can fit twice as may pxd's as xads in the lock + */ + xadlock->flag = mlckFREEPXDLIST; + pxd = xadlock->xdlist = &xtlck->pxdlock; + for (i = 0; i < xadlock->count; i++) { + PXDaddress(pxd, + addressXAD(&p->xad[XTENTRYSTART + i])); + PXDlength(pxd, + lengthXAD(&p->xad[XTENTRYSTART + i])); + pxd++; + } + } else { + /* + * xdlist will point to into inode's xtree, ensure + * that transaction is not committed lazily. + */ + xadlock->flag = mlckFREEXADLIST; + xadlock->xdlist = &p->xad[XTENTRYSTART]; + tblk->xflag &= ~COMMIT_LAZY; + } + jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2", + tlck->ip, mp, xadlock->count); + + maplock->index = 1; + + /* mark page as invalid */ + if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode)) + && !(tlck->type & tlckBTROOT)) + tlck->flag |= tlckFREEPAGE; + /* + else (tblk->xflag & COMMIT_PMAP) + ? release the page; + */ + return; + } + + /* + * page/entry truncation: file truncation (ref. xtTruncate()) + * + * |----------+------+------+---------------| + * | | | + * | | hwm - hwm before truncation + * | next - truncation point + * lwm - lwm before truncation + * header ? + */ + if (tlck->type & tlckTRUNCATE) { + pxd_t pxd; /* truncated extent of xad */ + int twm; + + /* + * For truncation the entire linelock may be used, so it would + * be difficult to store xad list in linelock itself. + * Therefore, we'll just force transaction to be committed + * synchronously, so that xtree pages won't be changed before + * txUpdateMap runs. + */ + tblk->xflag &= ~COMMIT_LAZY; + lwm = xtlck->lwm.offset; + if (lwm == 0) + lwm = XTPAGEMAXSLOT; + hwm = xtlck->hwm.offset; + twm = xtlck->twm.offset; + + /* + * write log records + */ + /* log after-image for logredo(): + * + * logredo() will update bmap for alloc of new/extended + * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from + * after-image of XADlist; + * logredo() resets (XAD_NEW|XAD_EXTEND) flag when + * applying the after-image to the meta-data page. + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(page_pxd, mp->index); + PXDlength(page_pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* + * truncate entry XAD[twm == next - 1]: + */ + if (twm == next - 1) { + /* init LOG_UPDATEMAP for logredo() to update bmap for + * free of truncated delta extent of the truncated + * entry XAD[next - 1]: + * (xtlck->pxdlock = truncated delta extent); + */ + pxdlock = (struct pxd_lock *) & xtlck->pxdlock; + /* assert(pxdlock->type & tlckTRUNCATE); */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + pxd = pxdlock->pxd; /* save to format maplock */ + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + } + + /* + * free entries XAD[next:hwm]: + */ + if (hwm >= next) { + /* init LOG_UPDATEMAP of the freed extents + * XAD[next:hwm] from the deleted page itself + * for logredo() to update bmap; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEXADLIST); + xtlck = (struct xtlock *) & tlck->lock; + hwm = xtlck->hwm.offset; + lrd->log.updatemap.nxd = + cpu_to_le16(hwm - next + 1); + /* reformat linelock for lmLog() */ + xtlck->header.offset = next; + xtlck->header.length = hwm - next + 1; + xtlck->index = 1; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + + /* + * format maplock(s) for txUpdateMap() to update bmap + */ + maplock->index = 0; + + /* + * allocate entries XAD[lwm:next): + */ + if (lwm < next) { + /* format a maplock for txUpdateMap() to update bPMAP + * for alloc of new/extended extents of XAD[lwm:next) + * from the page itself; + * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckALLOCXADLIST; + xadlock->count = next - lwm; + xadlock->xdlist = &p->xad[lwm]; + + jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d lwm:%d next:%d", + tlck->ip, mp, xadlock->count, lwm, next); + maplock->index++; + xadlock++; + } + + /* + * truncate entry XAD[twm == next - 1]: + */ + if (twm == next - 1) { + /* format a maplock for txUpdateMap() to update bmap + * to free truncated delta extent of the truncated + * entry XAD[next - 1]; + * (xtlck->pxdlock = truncated delta extent); + */ + tlck->flag |= tlckUPDATEMAP; + pxdlock = (struct pxd_lock *) xadlock; + pxdlock->flag = mlckFREEPXD; + pxdlock->count = 1; + pxdlock->pxd = pxd; + + jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d hwm:%d", + ip, mp, pxdlock->count, hwm); + maplock->index++; + xadlock++; + } + + /* + * free entries XAD[next:hwm]: + */ + if (hwm >= next) { + /* format a maplock for txUpdateMap() to update bmap + * to free extents of XAD[next:hwm] from thedeleted + * page itself; + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckFREEXADLIST; + xadlock->count = hwm - next + 1; + xadlock->xdlist = &p->xad[next]; + + jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d next:%d hwm:%d", + tlck->ip, mp, xadlock->count, next, hwm); + maplock->index++; + } + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } + return; +} + +/* + * mapLog() + * + * function: log from maplock of freed data extents; + */ +static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd, + struct tlock * tlck) +{ + struct pxd_lock *pxdlock; + int i, nlock; + pxd_t *pxd; + + /* + * page relocation: free the source page extent + * + * a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time saving the src + * relocated page address; + */ + if (tlck->type & tlckRELOCATE) { + /* log LOG_NOREDOPAGE of the old relocated page + * for logredo() to start NoRedoPage filter; + */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + pxdlock = (struct pxd_lock *) & tlck->lock; + pxd = &lrd->log.redopage.pxd; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* (N.B. currently, logredo() does NOT update bmap + * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE); + * if page free from relocation, LOG_UPDATEMAP log is + * specifically generated now for logredo() + * to update bmap for free of src relocated page; + * (new flag LOG_RELOCATE may be introduced which will + * inform logredo() to start NORedoPage filter and also + * update block allocation map at the same time, thus + * avoiding an extra log write); + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time; + */ + tlck->flag |= tlckUPDATEMAP; + return; + } + /* + + * Otherwise it's not a relocate request + * + */ + else { + /* log LOG_UPDATEMAP for logredo() to update bmap for + * free of truncated/relocated delta extent of the data; + * e.g.: external EA extent, relocated/truncated extent + * from xtTailgate(); + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + pxdlock = (struct pxd_lock *) & tlck->lock; + nlock = pxdlock->index; + for (i = 0; i < nlock; i++, pxdlock++) { + if (pxdlock->flag & mlckALLOCPXD) + lrd->log.updatemap.type = + cpu_to_le16(LOG_ALLOCPXD); + else + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + jfs_info("mapLog: xaddr:0x%lx xlen:0x%x", + (ulong) addressPXD(&pxdlock->pxd), + lengthPXD(&pxdlock->pxd)); + } + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + } +} + +/* + * txEA() + * + * function: acquire maplock for EA/ACL extents or + * set COMMIT_INLINE flag; + */ +void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) +{ + struct tlock *tlck = NULL; + struct pxd_lock *maplock = NULL, *pxdlock = NULL; + + /* + * format maplock for alloc of new EA extent + */ + if (newea) { + /* Since the newea could be a completely zeroed entry we need to + * check for the two flags which indicate we should actually + * commit new EA data + */ + if (newea->flag & DXD_EXTENT) { + tlck = txMaplock(tid, ip, tlckMAP); + maplock = (struct pxd_lock *) & tlck->lock; + pxdlock = (struct pxd_lock *) maplock; + pxdlock->flag = mlckALLOCPXD; + PXDaddress(&pxdlock->pxd, addressDXD(newea)); + PXDlength(&pxdlock->pxd, lengthDXD(newea)); + pxdlock++; + maplock->index = 1; + } else if (newea->flag & DXD_INLINE) { + tlck = NULL; + + set_cflag(COMMIT_Inlineea, ip); + } + } + + /* + * format maplock for free of old EA extent + */ + if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) { + if (tlck == NULL) { + tlck = txMaplock(tid, ip, tlckMAP); + maplock = (struct pxd_lock *) & tlck->lock; + pxdlock = (struct pxd_lock *) maplock; + maplock->index = 0; + } + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressDXD(oldea)); + PXDlength(&pxdlock->pxd, lengthDXD(oldea)); + maplock->index++; + } +} + +/* + * txForce() + * + * function: synchronously write pages locked by transaction + * after txLog() but before txUpdateMap(); + */ +static void txForce(struct tblock * tblk) +{ + struct tlock *tlck; + lid_t lid, next; + struct metapage *mp; + + /* + * reverse the order of transaction tlocks in + * careful update order of address index pages + * (right to left, bottom up) + */ + tlck = lid_to_tlock(tblk->next); + lid = tlck->next; + tlck->next = 0; + while (lid) { + tlck = lid_to_tlock(lid); + next = tlck->next; + tlck->next = tblk->next; + tblk->next = lid; + lid = next; + } + + /* + * synchronously write the page, and + * hold the page for txUpdateMap(); + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + + if (tlck->flag & tlckWRITEPAGE) { + tlck->flag &= ~tlckWRITEPAGE; + + /* do not release page to freelist */ + force_metapage(mp); +#if 0 + /* + * The "right" thing to do here is to + * synchronously write the metadata. + * With the current implementation this + * is hard since write_metapage requires + * us to kunmap & remap the page. If we + * have tlocks pointing into the metadata + * pages, we don't want to do this. I think + * we can get by with synchronously writing + * the pages when they are released. + */ + assert(mp->nohomeok); + set_bit(META_dirty, &mp->flag); + set_bit(META_sync, &mp->flag); +#endif + } + } + } +} + +/* + * txUpdateMap() + * + * function: update persistent allocation map (and working map + * if appropriate); + * + * parameter: + */ +static void txUpdateMap(struct tblock * tblk) +{ + struct inode *ip; + struct inode *ipimap; + lid_t lid; + struct tlock *tlck; + struct maplock *maplock; + struct pxd_lock pxdlock; + int maptype; + int k, nlock; + struct metapage *mp = NULL; + + ipimap = JFS_SBI(tblk->sb)->ipimap; + + maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP; + + + /* + * update block allocation map + * + * update allocation state in pmap (and wmap) and + * update lsn of the pmap page; + */ + /* + * scan each tlock/page of transaction for block allocation/free: + * + * for each tlock/page of transaction, update map. + * ? are there tlock for pmap and pwmap at the same time ? + */ + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + + if ((tlck->flag & tlckUPDATEMAP) == 0) + continue; + + if (tlck->flag & tlckFREEPAGE) { + /* + * Another thread may attempt to reuse freed space + * immediately, so we want to get rid of the metapage + * before anyone else has a chance to get it. + * Lock metapage, update maps, then invalidate + * the metapage. + */ + mp = tlck->mp; + ASSERT(mp->xflag & COMMIT_PAGE); + grab_metapage(mp); + } + + /* + * extent list: + * . in-line PXD list: + * . out-of-line XAD list: + */ + maplock = (struct maplock *) & tlck->lock; + nlock = maplock->index; + + for (k = 0; k < nlock; k++, maplock++) { + /* + * allocate blocks in persistent map: + * + * blocks have been allocated from wmap at alloc time; + */ + if (maplock->flag & mlckALLOC) { + txAllocPMap(ipimap, maplock, tblk); + } + /* + * free blocks in persistent and working map: + * blocks will be freed in pmap and then in wmap; + * + * ? tblock specifies the PMAP/PWMAP based upon + * transaction + * + * free blocks in persistent map: + * blocks will be freed from wmap at last reference + * release of the object for regular files; + * + * Alway free blocks from both persistent & working + * maps for directories + */ + else { /* (maplock->flag & mlckFREE) */ + + if (tlck->flag & tlckDIRECTORY) + txFreeMap(ipimap, maplock, + tblk, COMMIT_PWMAP); + else + txFreeMap(ipimap, maplock, + tblk, maptype); + } + } + if (tlck->flag & tlckFREEPAGE) { + if (!(tblk->flag & tblkGC_LAZY)) { + /* This is equivalent to txRelease */ + ASSERT(mp->lid == lid); + tlck->mp->lid = 0; + } + assert(mp->nohomeok == 1); + metapage_homeok(mp); + discard_metapage(mp); + tlck->mp = NULL; + } + } + /* + * update inode allocation map + * + * update allocation state in pmap and + * update lsn of the pmap page; + * update in-memory inode flag/state + * + * unlock mapper/write lock + */ + if (tblk->xflag & COMMIT_CREATE) { + diUpdatePMap(ipimap, tblk->ino, false, tblk); + /* update persistent block allocation map + * for the allocation of inode extent; + */ + pxdlock.flag = mlckALLOCPXD; + pxdlock.pxd = tblk->u.ixpxd; + pxdlock.index = 1; + txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk); + } else if (tblk->xflag & COMMIT_DELETE) { + ip = tblk->u.ip; + diUpdatePMap(ipimap, ip->i_ino, true, tblk); + iput(ip); + } +} + +/* + * txAllocPMap() + * + * function: allocate from persistent map; + * + * parameter: + * ipbmap - + * malock - + * xad list: + * pxd: + * + * maptype - + * allocate from persistent map; + * free from persistent map; + * (e.g., tmp file - free from working map at releae + * of last reference); + * free from persistent and working map; + * + * lsn - log sequence number; + */ +static void txAllocPMap(struct inode *ip, struct maplock * maplock, + struct tblock * tblk) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct xdlistlock *xadlistlock; + xad_t *xad; + s64 xaddr; + int xlen; + struct pxd_lock *pxdlock; + struct xdlistlock *pxdlistlock; + pxd_t *pxd; + int n; + + /* + * allocate from persistent map; + */ + if (maplock->flag & mlckALLOCXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbUpdatePMap(ipbmap, false, xaddr, + (s64) xlen, tblk); + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } else if (maplock->flag & mlckALLOCPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, tblk); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckALLOCPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbUpdatePMap(ipbmap, false, xaddr, (s64) xlen, + tblk); + jfs_info("allocPMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } +} + +/* + * txFreeMap() + * + * function: free from persistent and/or working map; + * + * todo: optimization + */ +void txFreeMap(struct inode *ip, + struct maplock * maplock, struct tblock * tblk, int maptype) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + struct xdlistlock *xadlistlock; + xad_t *xad; + s64 xaddr; + int xlen; + struct pxd_lock *pxdlock; + struct xdlistlock *pxdlistlock; + pxd_t *pxd; + int n; + + jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x", + tblk, maplock, maptype); + + /* + * free from persistent map; + */ + if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) { + if (maplock->flag & mlckFREEXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + if (!(xad->flag & XAD_NEW)) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbUpdatePMap(ipbmap, true, xaddr, + (s64) xlen, tblk); + jfs_info("freePMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } else if (maplock->flag & mlckFREEPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbUpdatePMap(ipbmap, true, xaddr, (s64) xlen, + tblk); + jfs_info("freePMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckALLOCPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbUpdatePMap(ipbmap, true, xaddr, + (s64) xlen, tblk); + jfs_info("freePMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } + + /* + * free from working map; + */ + if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) { + if (maplock->flag & mlckFREEXADLIST) { + xadlistlock = (struct xdlistlock *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbFree(ip, xaddr, (s64) xlen); + xad->flag = 0; + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } else if (maplock->flag & mlckFREEPXD) { + pxdlock = (struct pxd_lock *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbFree(ip, xaddr, (s64) xlen); + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } else { /* (maplock->flag & mlckFREEPXDLIST) */ + + pxdlistlock = (struct xdlistlock *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbFree(ip, xaddr, (s64) xlen); + jfs_info("freeWMap: xaddr:0x%lx xlen:%d", + (ulong) xaddr, xlen); + } + } + } +} + +/* + * txFreelock() + * + * function: remove tlock from inode anonymous locklist + */ +void txFreelock(struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + struct tlock *xtlck, *tlck; + lid_t xlid = 0, lid; + + if (!jfs_ip->atlhead) + return; + + TXN_LOCK(); + xtlck = (struct tlock *) &jfs_ip->atlhead; + + while ((lid = xtlck->next) != 0) { + tlck = lid_to_tlock(lid); + if (tlck->flag & tlckFREELOCK) { + xtlck->next = tlck->next; + txLockFree(lid); + } else { + xtlck = tlck; + xlid = lid; + } + } + + if (jfs_ip->atlhead) + jfs_ip->atltail = xlid; + else { + jfs_ip->atltail = 0; + /* + * If inode was on anon_list, remove it + */ + list_del_init(&jfs_ip->anon_inode_list); + } + TXN_UNLOCK(); +} + +/* + * txAbort() + * + * function: abort tx before commit; + * + * frees line-locks and segment locks for all + * segments in comdata structure. + * Optionally sets state of file-system to FM_DIRTY in super-block. + * log age of page-frames in memory for which caller has + * are reset to 0 (to avoid logwarap). + */ +void txAbort(tid_t tid, int dirty) +{ + lid_t lid, next; + struct metapage *mp; + struct tblock *tblk = tid_to_tblock(tid); + struct tlock *tlck; + + /* + * free tlocks of the transaction + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + mp = tlck->mp; + JFS_IP(tlck->ip)->xtlid = 0; + + if (mp) { + mp->lid = 0; + + /* + * reset lsn of page to avoid logwarap: + * + * (page may have been previously committed by another + * transaction(s) but has not been paged, i.e., + * it may be on logsync list even though it has not + * been logged for the current tx.) + */ + if (mp->xflag & COMMIT_PAGE && mp->lsn) + LogSyncRelease(mp); + } + /* insert tlock at head of freelist */ + TXN_LOCK(); + txLockFree(lid); + TXN_UNLOCK(); + } + + /* caller will free the transaction block */ + + tblk->next = tblk->last = 0; + + /* + * mark filesystem dirty + */ + if (dirty) + jfs_error(tblk->sb, "\n"); + + return; +} + +/* + * txLazyCommit(void) + * + * All transactions except those changing ipimap (COMMIT_FORCE) are + * processed by this routine. This insures that the inode and block + * allocation maps are updated in order. For synchronous transactions, + * let the user thread finish processing after txUpdateMap() is called. + */ +static void txLazyCommit(struct tblock * tblk) +{ + struct jfs_log *log; + + while (((tblk->flag & tblkGC_READY) == 0) && + ((tblk->flag & tblkGC_UNLOCKED) == 0)) { + /* We must have gotten ahead of the user thread + */ + jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk); + yield(); + } + + jfs_info("txLazyCommit: processing tblk 0x%p", tblk); + + txUpdateMap(tblk); + + log = (struct jfs_log *) JFS_SBI(tblk->sb)->log; + + spin_lock_irq(&log->gclock); // LOGGC_LOCK + + tblk->flag |= tblkGC_COMMITTED; + + if (tblk->flag & tblkGC_READY) + log->gcrtc--; + + wake_up_all(&tblk->gcwait); // LOGGC_WAKEUP + + /* + * Can't release log->gclock until we've tested tblk->flag + */ + if (tblk->flag & tblkGC_LAZY) { + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + txUnlock(tblk); + tblk->flag &= ~tblkGC_LAZY; + txEnd(tblk - TxBlock); /* Convert back to tid */ + } else + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + + jfs_info("txLazyCommit: done: tblk = 0x%p", tblk); +} + +/* + * jfs_lazycommit(void) + * + * To be run as a kernel daemon. If lbmIODone is called in an interrupt + * context, or where blocking is not wanted, this routine will process + * committed transactions from the unlock queue. + */ +int jfs_lazycommit(void *arg) +{ + int WorkDone; + struct tblock *tblk; + unsigned long flags; + struct jfs_sb_info *sbi; + + do { + LAZY_LOCK(flags); + jfs_commit_thread_waking = 0; /* OK to wake another thread */ + while (!list_empty(&TxAnchor.unlock_queue)) { + WorkDone = 0; + list_for_each_entry(tblk, &TxAnchor.unlock_queue, + cqueue) { + + sbi = JFS_SBI(tblk->sb); + /* + * For each volume, the transactions must be + * handled in order. If another commit thread + * is handling a tblk for this superblock, + * skip it + */ + if (sbi->commit_state & IN_LAZYCOMMIT) + continue; + + sbi->commit_state |= IN_LAZYCOMMIT; + WorkDone = 1; + + /* + * Remove transaction from queue + */ + list_del(&tblk->cqueue); + + LAZY_UNLOCK(flags); + txLazyCommit(tblk); + LAZY_LOCK(flags); + + sbi->commit_state &= ~IN_LAZYCOMMIT; + /* + * Don't continue in the for loop. (We can't + * anyway, it's unsafe!) We want to go back to + * the beginning of the list. + */ + break; + } + + /* If there was nothing to do, don't continue */ + if (!WorkDone) + break; + } + /* In case a wakeup came while all threads were active */ + jfs_commit_thread_waking = 0; + + if (freezing(current)) { + LAZY_UNLOCK(flags); + try_to_freeze(); + } else { + DECLARE_WAITQUEUE(wq, current); + + add_wait_queue(&jfs_commit_thread_wait, &wq); + set_current_state(TASK_INTERRUPTIBLE); + LAZY_UNLOCK(flags); + schedule(); + remove_wait_queue(&jfs_commit_thread_wait, &wq); + } + } while (!kthread_should_stop()); + + if (!list_empty(&TxAnchor.unlock_queue)) + jfs_err("jfs_lazycommit being killed w/pending transactions!"); + else + jfs_info("jfs_lazycommit being killed"); + return 0; +} + +void txLazyUnlock(struct tblock * tblk) +{ + unsigned long flags; + + LAZY_LOCK(flags); + + list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue); + /* + * Don't wake up a commit thread if there is already one servicing + * this superblock, or if the last one we woke up hasn't started yet. + */ + if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) && + !jfs_commit_thread_waking) { + jfs_commit_thread_waking = 1; + wake_up(&jfs_commit_thread_wait); + } + LAZY_UNLOCK(flags); +} + +static void LogSyncRelease(struct metapage * mp) +{ + struct jfs_log *log = mp->log; + + assert(mp->nohomeok); + assert(log); + metapage_homeok(mp); +} + +/* + * txQuiesce + * + * Block all new transactions and push anonymous transactions to + * completion + * + * This does almost the same thing as jfs_sync below. We don't + * worry about deadlocking when jfs_tlocks_low is set, since we would + * expect jfs_sync to get us out of that jam. + */ +void txQuiesce(struct super_block *sb) +{ + struct inode *ip; + struct jfs_inode_info *jfs_ip; + struct jfs_log *log = JFS_SBI(sb)->log; + tid_t tid; + + set_bit(log_QUIESCE, &log->flag); + + TXN_LOCK(); +restart: + while (!list_empty(&TxAnchor.anon_list)) { + jfs_ip = list_entry(TxAnchor.anon_list.next, + struct jfs_inode_info, + anon_inode_list); + ip = &jfs_ip->vfs_inode; + + /* + * inode will be removed from anonymous list + * when it is committed + */ + TXN_UNLOCK(); + tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE); + mutex_lock(&jfs_ip->commit_mutex); + txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&jfs_ip->commit_mutex); + /* + * Just to be safe. I don't know how + * long we can run without blocking + */ + cond_resched(); + TXN_LOCK(); + } + + /* + * If jfs_sync is running in parallel, there could be some inodes + * on anon_list2. Let's check. + */ + if (!list_empty(&TxAnchor.anon_list2)) { + list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); + goto restart; + } + TXN_UNLOCK(); + + /* + * We may need to kick off the group commit + */ + jfs_flush_journal(log, 0); +} + +/* + * txResume() + * + * Allows transactions to start again following txQuiesce + */ +void txResume(struct super_block *sb) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + + clear_bit(log_QUIESCE, &log->flag); + TXN_WAKEUP(&log->syncwait); +} + +/* + * jfs_sync(void) + * + * To be run as a kernel daemon. This is awakened when tlocks run low. + * We write any inodes that have anonymous tlocks so they will become + * available. + */ +int jfs_sync(void *arg) +{ + struct inode *ip; + struct jfs_inode_info *jfs_ip; + tid_t tid; + + do { + /* + * write each inode on the anonymous inode list + */ + TXN_LOCK(); + while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) { + jfs_ip = list_entry(TxAnchor.anon_list.next, + struct jfs_inode_info, + anon_inode_list); + ip = &jfs_ip->vfs_inode; + + if (! igrab(ip)) { + /* + * Inode is being freed + */ + list_del_init(&jfs_ip->anon_inode_list); + } else if (mutex_trylock(&jfs_ip->commit_mutex)) { + /* + * inode will be removed from anonymous list + * when it is committed + */ + TXN_UNLOCK(); + tid = txBegin(ip->i_sb, COMMIT_INODE); + txCommit(tid, 1, &ip, 0); + txEnd(tid); + mutex_unlock(&jfs_ip->commit_mutex); + + iput(ip); + /* + * Just to be safe. I don't know how + * long we can run without blocking + */ + cond_resched(); + TXN_LOCK(); + } else { + /* We can't get the commit mutex. It may + * be held by a thread waiting for tlock's + * so let's not block here. Save it to + * put back on the anon_list. + */ + + /* Move from anon_list to anon_list2 */ + list_move(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list2); + + TXN_UNLOCK(); + iput(ip); + TXN_LOCK(); + } + } + /* Add anon_list2 back to anon_list */ + list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list); + + if (freezing(current)) { + TXN_UNLOCK(); + try_to_freeze(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + TXN_UNLOCK(); + schedule(); + } + } while (!kthread_should_stop()); + + jfs_info("jfs_sync being killed"); + return 0; +} + +#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG) +int jfs_txanchor_proc_show(struct seq_file *m, void *v) +{ + char *freewait; + char *freelockwait; + char *lowlockwait; + + freewait = + waitqueue_active(&TxAnchor.freewait) ? "active" : "empty"; + freelockwait = + waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty"; + lowlockwait = + waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; + + seq_printf(m, + "JFS TxAnchor\n" + "============\n" + "freetid = %d\n" + "freewait = %s\n" + "freelock = %d\n" + "freelockwait = %s\n" + "lowlockwait = %s\n" + "tlocksInUse = %d\n" + "jfs_tlocks_low = %d\n" + "unlock_queue is %sempty\n", + TxAnchor.freetid, + freewait, + TxAnchor.freelock, + freelockwait, + lowlockwait, + TxAnchor.tlocksInUse, + jfs_tlocks_low, + list_empty(&TxAnchor.unlock_queue) ? "" : "not "); + return 0; +} +#endif + +#if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS) +int jfs_txstats_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS TxStats\n" + "===========\n" + "calls to txBegin = %d\n" + "txBegin blocked by sync barrier = %d\n" + "txBegin blocked by tlocks low = %d\n" + "txBegin blocked by no free tid = %d\n" + "calls to txBeginAnon = %d\n" + "txBeginAnon blocked by sync barrier = %d\n" + "txBeginAnon blocked by tlocks low = %d\n" + "calls to txLockAlloc = %d\n" + "tLockAlloc blocked by no free lock = %d\n", + TxStat.txBegin, + TxStat.txBegin_barrier, + TxStat.txBegin_lockslow, + TxStat.txBegin_freetid, + TxStat.txBeginAnon, + TxStat.txBeginAnon_barrier, + TxStat.txBeginAnon_lockslow, + TxStat.txLockAlloc, + TxStat.txLockAlloc_freelock); + return 0; +} +#endif diff --git a/fs/jfs/jfs_txnmgr.h b/fs/jfs/jfs_txnmgr.h new file mode 100644 index 000000000..ab7288937 --- /dev/null +++ b/fs/jfs/jfs_txnmgr.h @@ -0,0 +1,311 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_TXNMGR +#define _H_JFS_TXNMGR + +#include "jfs_logmgr.h" + +/* + * Hide implementation of TxBlock and TxLock + */ +#define tid_to_tblock(tid) (&TxBlock[tid]) + +#define lid_to_tlock(lid) (&TxLock[lid]) + +/* + * transaction block + */ +struct tblock { + /* + * tblock and jbuf_t common area: struct logsyncblk + * + * the following 5 fields are the same as struct logsyncblk + * which is common to tblock and jbuf to form logsynclist + */ + u16 xflag; /* tx commit type */ + u16 flag; /* tx commit state */ + lid_t dummy; /* Must keep structures common */ + s32 lsn; /* recovery lsn */ + struct list_head synclist; /* logsynclist link */ + + /* lock management */ + struct super_block *sb; /* super block */ + lid_t next; /* index of first tlock of tid */ + lid_t last; /* index of last tlock of tid */ + wait_queue_head_t waitor; /* tids waiting on this tid */ + + /* log management */ + u32 logtid; /* log transaction id */ + + /* commit management */ + struct list_head cqueue; /* commit queue list */ + s32 clsn; /* commit lsn */ + struct lbuf *bp; + s32 pn; /* commit record log page number */ + s32 eor; /* commit record eor */ + wait_queue_head_t gcwait; /* group commit event list: + * ready transactions wait on this + * event for group commit completion. + */ + union { + struct inode *ip; /* inode being deleted */ + pxd_t ixpxd; /* pxd of inode extent for created inode */ + } u; + u32 ino; /* inode number being created */ +}; + +extern struct tblock *TxBlock; /* transaction block table */ + +/* commit flags: tblk->xflag */ +#define COMMIT_SYNC 0x0001 /* synchronous commit */ +#define COMMIT_FORCE 0x0002 /* force pageout at end of commit */ +#define COMMIT_FLUSH 0x0004 /* init flush at end of commit */ +#define COMMIT_MAP 0x00f0 +#define COMMIT_PMAP 0x0010 /* update pmap */ +#define COMMIT_WMAP 0x0020 /* update wmap */ +#define COMMIT_PWMAP 0x0040 /* update pwmap */ +#define COMMIT_FREE 0x0f00 +#define COMMIT_DELETE 0x0100 /* inode delete */ +#define COMMIT_TRUNCATE 0x0200 /* file truncation */ +#define COMMIT_CREATE 0x0400 /* inode create */ +#define COMMIT_LAZY 0x0800 /* lazy commit */ +#define COMMIT_PAGE 0x1000 /* Identifies element as metapage */ +#define COMMIT_INODE 0x2000 /* Identifies element as inode */ + +/* group commit flags tblk->flag: see jfs_logmgr.h */ + +/* + * transaction lock + */ +struct tlock { + lid_t next; /* 2: index next lockword on tid locklist + * next lockword on freelist + */ + tid_t tid; /* 2: transaction id holding lock */ + + u16 flag; /* 2: lock control */ + u16 type; /* 2: log type */ + + struct metapage *mp; /* 4/8: object page buffer locked */ + struct inode *ip; /* 4/8: object */ + /* (16) */ + + s16 lock[24]; /* 48: overlay area */ +}; /* (64) */ + +extern struct tlock *TxLock; /* transaction lock table */ + +/* + * tlock flag + */ +/* txLock state */ +#define tlckPAGELOCK 0x8000 +#define tlckINODELOCK 0x4000 +#define tlckLINELOCK 0x2000 +#define tlckINLINELOCK 0x1000 +/* lmLog state */ +#define tlckLOG 0x0800 +/* updateMap state */ +#define tlckUPDATEMAP 0x0080 +#define tlckDIRECTORY 0x0040 +/* freeLock state */ +#define tlckFREELOCK 0x0008 +#define tlckWRITEPAGE 0x0004 +#define tlckFREEPAGE 0x0002 + +/* + * tlock type + */ +#define tlckTYPE 0xfe00 +#define tlckINODE 0x8000 +#define tlckXTREE 0x4000 +#define tlckDTREE 0x2000 +#define tlckMAP 0x1000 +#define tlckEA 0x0800 +#define tlckACL 0x0400 +#define tlckDATA 0x0200 +#define tlckBTROOT 0x0100 + +#define tlckOPERATION 0x00ff +#define tlckGROW 0x0001 /* file grow */ +#define tlckREMOVE 0x0002 /* file delete */ +#define tlckTRUNCATE 0x0004 /* file truncate */ +#define tlckRELOCATE 0x0008 /* file/directory relocate */ +#define tlckENTRY 0x0001 /* directory insert/delete */ +#define tlckEXTEND 0x0002 /* directory extend in-line */ +#define tlckSPLIT 0x0010 /* splited page */ +#define tlckNEW 0x0020 /* new page from split */ +#define tlckFREE 0x0040 /* free page */ +#define tlckRELINK 0x0080 /* update sibling pointer */ + +/* + * linelock for lmLog() + * + * note: linelock and its variations are overlaid + * at tlock.lock: watch for alignment; + */ +struct lv { + u8 offset; /* 1: */ + u8 length; /* 1: */ +}; /* (2) */ + +#define TLOCKSHORT 20 +#define TLOCKLONG 28 + +struct linelock { + lid_t next; /* 2: next linelock */ + + s8 maxcnt; /* 1: */ + s8 index; /* 1: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 l2linesize; /* 1: log2 of linesize */ + /* (8) */ + + struct lv lv[20]; /* 40: */ +}; /* (48) */ + +#define dt_lock linelock + +struct xtlock { + lid_t next; /* 2: */ + + s8 maxcnt; /* 1: */ + s8 index; /* 1: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 l2linesize; /* 1: log2 of linesize */ + /* (8) */ + + struct lv header; /* 2: */ + struct lv lwm; /* 2: low water mark */ + struct lv hwm; /* 2: high water mark */ + struct lv twm; /* 2: */ + /* (16) */ + + s32 pxdlock[8]; /* 32: */ +}; /* (48) */ + + +/* + * maplock for txUpdateMap() + * + * note: maplock and its variations are overlaid + * at tlock.lock/linelock: watch for alignment; + * N.B. next field may be set by linelock, and should not + * be modified by maplock; + * N.B. index of the first pxdlock specifies index of next + * free maplock (i.e., number of maplock) in the tlock; + */ +struct maplock { + lid_t next; /* 2: */ + + u8 maxcnt; /* 2: */ + u8 index; /* 2: next free maplock index */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 count; /* 1: number of pxd/xad */ + /* (8) */ + + pxd_t pxd; /* 8: */ +}; /* (16): */ + +/* maplock flag */ +#define mlckALLOC 0x00f0 +#define mlckALLOCXADLIST 0x0080 +#define mlckALLOCPXDLIST 0x0040 +#define mlckALLOCXAD 0x0020 +#define mlckALLOCPXD 0x0010 +#define mlckFREE 0x000f +#define mlckFREEXADLIST 0x0008 +#define mlckFREEPXDLIST 0x0004 +#define mlckFREEXAD 0x0002 +#define mlckFREEPXD 0x0001 + +#define pxd_lock maplock + +struct xdlistlock { + lid_t next; /* 2: */ + + u8 maxcnt; /* 2: */ + u8 index; /* 2: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 count; /* 1: number of pxd/xad */ + /* (8) */ + + /* + * We need xdlist to be 64 bits (8 bytes), regardless of + * whether void * is 32 or 64 bits + */ + union { + void *_xdlist; /* pxd/xad list */ + s64 pad; /* 8: Force 64-bit xdlist size */ + } union64; +}; /* (16): */ + +#define xdlist union64._xdlist + +/* + * commit + * + * parameter to the commit manager routines + */ +struct commit { + tid_t tid; /* tid = index of tblock */ + int flag; /* flags */ + struct jfs_log *log; /* log */ + struct super_block *sb; /* superblock */ + + int nip; /* number of entries in iplist */ + struct inode **iplist; /* list of pointers to inodes */ + + /* log record descriptor on 64-bit boundary */ + struct lrd lrd; /* : log record descriptor */ +}; + +/* + * external declarations + */ +extern int jfs_tlocks_low; + +extern int txInit(void); +extern void txExit(void); +extern struct tlock *txLock(tid_t, struct inode *, struct metapage *, int); +extern struct tlock *txMaplock(tid_t, struct inode *, int); +extern int txCommit(tid_t, int, struct inode **, int); +extern tid_t txBegin(struct super_block *, int); +extern void txBeginAnon(struct super_block *); +extern void txEnd(tid_t); +extern void txAbort(tid_t, int); +extern struct linelock *txLinelock(struct linelock *); +extern void txFreeMap(struct inode *, struct maplock *, struct tblock *, int); +extern void txEA(tid_t, struct inode *, dxd_t *, dxd_t *); +extern void txFreelock(struct inode *); +extern int lmLog(struct jfs_log *, struct tblock *, struct lrd *, + struct tlock *); +extern void txQuiesce(struct super_block *); +extern void txResume(struct super_block *); +extern void txLazyUnlock(struct tblock *); +extern int jfs_lazycommit(void *); +extern int jfs_sync(void *); +#endif /* _H_JFS_TXNMGR */ diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h new file mode 100644 index 000000000..8f602dcb5 --- /dev/null +++ b/fs/jfs/jfs_types.h @@ -0,0 +1,170 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_TYPES +#define _H_JFS_TYPES + +/* + * jfs_types.h: + * + * basic type/utility definitions + * + * note: this header file must be the 1st include file + * of JFS include list in all JFS .c file. + */ + +#include <linux/types.h> +#include <linux/nls.h> + +/* + * transaction and lock id's + * + * Don't change these without carefully considering the impact on the + * size and alignment of all of the linelock variants + */ +typedef u16 tid_t; +typedef u16 lid_t; + +/* + * Almost identical to Linux's timespec, but not quite + */ +struct timestruc_t { + __le32 tv_sec; + __le32 tv_nsec; +}; + +/* + * handy + */ + +#define LEFTMOSTONE 0x80000000 +#define HIGHORDER 0x80000000u /* high order bit on */ +#define ONES 0xffffffffu /* all bit on */ + +/* + * physical xd (pxd) + * + * The leftmost 24 bits of len_addr are the extent length. + * The rightmost 8 bits of len_addr are the most signficant bits of + * the extent address + */ +typedef struct { + __le32 len_addr; + __le32 addr2; +} pxd_t; + +/* xd_t field construction */ + +static inline void PXDlength(pxd_t *pxd, __u32 len) +{ + pxd->len_addr = (pxd->len_addr & cpu_to_le32(~0xffffff)) | + cpu_to_le32(len & 0xffffff); +} + +static inline void PXDaddress(pxd_t *pxd, __u64 addr) +{ + pxd->len_addr = (pxd->len_addr & cpu_to_le32(0xffffff)) | + cpu_to_le32((addr >> 32)<<24); + pxd->addr2 = cpu_to_le32(addr & 0xffffffff); +} + +/* xd_t field extraction */ +static inline __u32 lengthPXD(pxd_t *pxd) +{ + return le32_to_cpu((pxd)->len_addr) & 0xffffff; +} + +static inline __u64 addressPXD(pxd_t *pxd) +{ + __u64 n = le32_to_cpu(pxd->len_addr) & ~0xffffff; + return (n << 8) + le32_to_cpu(pxd->addr2); +} + +#define MAXTREEHEIGHT 8 +/* pxd list */ +struct pxdlist { + s16 maxnpxd; + s16 npxd; + pxd_t pxd[MAXTREEHEIGHT]; +}; + + +/* + * data extent descriptor (dxd) + */ +typedef struct { + __u8 flag; /* 1: flags */ + __u8 rsrvd[3]; + __le32 size; /* 4: size in byte */ + pxd_t loc; /* 8: address and length in unit of fsblksize */ +} dxd_t; /* - 16 - */ + +/* dxd_t flags */ +#define DXD_INDEX 0x80 /* B+-tree index */ +#define DXD_INLINE 0x40 /* in-line data extent */ +#define DXD_EXTENT 0x20 /* out-of-line single extent */ +#define DXD_FILE 0x10 /* out-of-line file (inode) */ +#define DXD_CORRUPT 0x08 /* Inconsistency detected */ + +/* dxd_t field construction + */ +#define DXDlength(dxd, len) PXDlength(&(dxd)->loc, len) +#define DXDaddress(dxd, addr) PXDaddress(&(dxd)->loc, addr) +#define lengthDXD(dxd) lengthPXD(&(dxd)->loc) +#define addressDXD(dxd) addressPXD(&(dxd)->loc) +#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32)) +#define sizeDXD(dxd) le32_to_cpu((dxd)->size) + +/* + * directory entry argument + */ +struct component_name { + int namlen; + wchar_t *name; +}; + + +/* + * DASD limit information - stored in directory inode + */ +struct dasd { + u8 thresh; /* Alert Threshold (in percent) */ + u8 delta; /* Alert Threshold delta (in percent) */ + u8 rsrvd1; + u8 limit_hi; /* DASD limit (in logical blocks) */ + __le32 limit_lo; /* DASD limit (in logical blocks) */ + u8 rsrvd2[3]; + u8 used_hi; /* DASD usage (in logical blocks) */ + __le32 used_lo; /* DASD usage (in logical blocks) */ +}; + +#define DASDLIMIT(dasdp) \ + (((u64)((dasdp)->limit_hi) << 32) + __le32_to_cpu((dasdp)->limit_lo)) +#define setDASDLIMIT(dasdp, limit)\ +{\ + (dasdp)->limit_hi = ((u64)limit) >> 32;\ + (dasdp)->limit_lo = __cpu_to_le32(limit);\ +} +#define DASDUSED(dasdp) \ + (((u64)((dasdp)->used_hi) << 32) + __le32_to_cpu((dasdp)->used_lo)) +#define setDASDUSED(dasdp, used)\ +{\ + (dasdp)->used_hi = ((u64)used) >> 32;\ + (dasdp)->used_lo = __cpu_to_le32(used);\ +} + +#endif /* !_H_JFS_TYPES */ diff --git a/fs/jfs/jfs_umount.c b/fs/jfs/jfs_umount.c new file mode 100644 index 000000000..7971f3753 --- /dev/null +++ b/fs/jfs/jfs_umount.c @@ -0,0 +1,168 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_umount.c + * + * note: file system in transition to aggregate/fileset: + * (ref. jfs_mount.c) + * + * file system unmount is interpreted as mount of the single/only + * fileset in the aggregate and, if unmount of the last fileset, + * as unmount of the aggerate; + */ + +#include <linux/fs.h> +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + +/* + * NAME: jfs_umount(vfsp, flags, crp) + * + * FUNCTION: vfs_umount() + * + * PARAMETERS: vfsp - virtual file system pointer + * flags - unmount for shutdown + * crp - credential + * + * RETURN : EBUSY - device has open files + */ +int jfs_umount(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipbmap = sbi->ipbmap; + struct inode *ipimap = sbi->ipimap; + struct inode *ipaimap = sbi->ipaimap; + struct inode *ipaimap2 = sbi->ipaimap2; + struct jfs_log *log; + int rc = 0; + + jfs_info("UnMount JFS: sb:0x%p", sb); + + /* + * update superblock and close log + * + * if mounted read-write and log based recovery was enabled + */ + if ((log = sbi->log)) + /* + * Wait for outstanding transactions to be written to log: + */ + jfs_flush_journal(log, 2); + + /* + * close fileset inode allocation map (aka fileset inode) + */ + diUnmount(ipimap, 0); + + diFreeSpecial(ipimap); + sbi->ipimap = NULL; + + /* + * close secondary aggregate inode allocation map + */ + ipaimap2 = sbi->ipaimap2; + if (ipaimap2) { + diUnmount(ipaimap2, 0); + diFreeSpecial(ipaimap2); + sbi->ipaimap2 = NULL; + } + + /* + * close aggregate inode allocation map + */ + ipaimap = sbi->ipaimap; + diUnmount(ipaimap, 0); + diFreeSpecial(ipaimap); + sbi->ipaimap = NULL; + + /* + * close aggregate block allocation map + */ + dbUnmount(ipbmap, 0); + + diFreeSpecial(ipbmap); + sbi->ipimap = NULL; + + /* + * Make sure all metadata makes it to disk before we mark + * the superblock as clean + */ + filemap_write_and_wait(sbi->direct_inode->i_mapping); + + /* + * ensure all file system file pages are propagated to their + * home blocks on disk (and their in-memory buffer pages are + * invalidated) BEFORE updating file system superblock state + * (to signify file system is unmounted cleanly, and thus in + * consistent state) and log superblock active file system + * list (to signify skip logredo()). + */ + if (log) { /* log = NULL if read-only mount */ + updateSuper(sb, FM_CLEAN); + + /* + * close log: + * + * remove file system from log active file system list. + */ + rc = lmLogClose(sb); + } + jfs_info("UnMount JFS Complete: rc = %d", rc); + return rc; +} + + +int jfs_umount_rw(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + + if (!log) + return 0; + + /* + * close log: + * + * remove file system from log active file system list. + */ + jfs_flush_journal(log, 2); + + /* + * Make sure all metadata makes it to disk + */ + dbSync(sbi->ipbmap); + diSync(sbi->ipimap); + + /* + * Note that we have to do this even if sync_blockdev() will + * do exactly the same a few instructions later: We can't + * mark the superblock clean before everything is flushed to + * disk. + */ + filemap_write_and_wait(sbi->direct_inode->i_mapping); + + updateSuper(sb, FM_CLEAN); + + return lmLogClose(sb); +} diff --git a/fs/jfs/jfs_unicode.c b/fs/jfs/jfs_unicode.c new file mode 100644 index 000000000..0148e2e4d --- /dev/null +++ b/fs/jfs/jfs_unicode.c @@ -0,0 +1,138 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* + * NAME: jfs_strfromUCS() + * + * FUNCTION: Convert little-endian unicode string to character string + * + */ +int jfs_strfromUCS_le(char *to, const __le16 * from, + int len, struct nls_table *codepage) +{ + int i; + int outlen = 0; + static int warn_again = 5; /* Only warn up to 5 times total */ + int warn = !!warn_again; /* once per string */ + + if (codepage) { + for (i = 0; (i < len) && from[i]; i++) { + int charlen; + charlen = + codepage->uni2char(le16_to_cpu(from[i]), + &to[outlen], + NLS_MAX_CHARSET_SIZE); + if (charlen > 0) + outlen += charlen; + else + to[outlen++] = '?'; + } + } else { + for (i = 0; (i < len) && from[i]; i++) { + if (unlikely(le16_to_cpu(from[i]) & 0xff00)) { + to[i] = '?'; + if (unlikely(warn)) { + warn--; + warn_again--; + printk(KERN_ERR + "non-latin1 character 0x%x found in JFS file name\n", + le16_to_cpu(from[i])); + printk(KERN_ERR + "mount with iocharset=utf8 to access\n"); + } + + } + else + to[i] = (char) (le16_to_cpu(from[i])); + } + outlen = i; + } + to[outlen] = 0; + return outlen; +} + +/* + * NAME: jfs_strtoUCS() + * + * FUNCTION: Convert character string to unicode string + * + */ +static int jfs_strtoUCS(wchar_t * to, const unsigned char *from, int len, + struct nls_table *codepage) +{ + int charlen; + int i; + + if (codepage) { + for (i = 0; len && *from; i++, from += charlen, len -= charlen) + { + charlen = codepage->char2uni(from, len, &to[i]); + if (charlen < 1) { + jfs_err("jfs_strtoUCS: char2uni returned %d.", + charlen); + jfs_err("charset = %s, char = 0x%x", + codepage->charset, *from); + return charlen; + } + } + } else { + for (i = 0; (i < len) && from[i]; i++) + to[i] = (wchar_t) from[i]; + } + + to[i] = 0; + return i; +} + +/* + * NAME: get_UCSname() + * + * FUNCTION: Allocate and translate to unicode string + * + */ +int get_UCSname(struct component_name * uniName, struct dentry *dentry) +{ + struct nls_table *nls_tab = JFS_SBI(dentry->d_sb)->nls_tab; + int length = dentry->d_name.len; + + if (length > JFS_NAME_MAX) + return -ENAMETOOLONG; + + uniName->name = + kmalloc_array(length + 1, sizeof(wchar_t), GFP_NOFS); + + if (uniName->name == NULL) + return -ENOMEM; + + uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name, + length, nls_tab); + + if (uniName->namlen < 0) { + kfree(uniName->name); + return uniName->namlen; + } + + return 0; +} diff --git a/fs/jfs/jfs_unicode.h b/fs/jfs/jfs_unicode.h new file mode 100644 index 000000000..8f0f02cb6 --- /dev/null +++ b/fs/jfs/jfs_unicode.h @@ -0,0 +1,156 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_UNICODE +#define _H_JFS_UNICODE + +#include <linux/slab.h> +#include <asm/byteorder.h> +#include "jfs_types.h" + +typedef struct { + wchar_t start; + wchar_t end; + signed char *table; +} UNICASERANGE; + +extern signed char UniUpperTable[512]; +extern UNICASERANGE UniUpperRange[]; +extern int get_UCSname(struct component_name *, struct dentry *); +extern int jfs_strfromUCS_le(char *, const __le16 *, int, struct nls_table *); + +#define free_UCSname(COMP) kfree((COMP)->name) + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t *UniStrcpy(wchar_t * ucs1, const wchar_t * ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)); + return anchor; +} + + + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline __le16 *UniStrncpy_le(__le16 * ucs1, const __le16 * ucs2, + size_t n) +{ + __le16 *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int UniStrncmp_le(const wchar_t * ucs1, const __le16 * ucs2, + size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy_to_le: Copy length limited string with pad to little-endian + */ +static inline __le16 *UniStrncpy_to_le(__le16 * ucs1, const wchar_t * ucs2, + size_t n) +{ + __le16 *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = cpu_to_le16(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncpy_from_le: Copy length limited string with pad from little-endian + */ +static inline wchar_t *UniStrncpy_from_le(wchar_t * ucs1, const __le16 * ucs2, + size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t UniToupper(wchar_t uc) +{ + UNICASERANGE *rp; + + if (uc < sizeof(UniUpperTable)) { /* Latin characters */ + return uc + UniUpperTable[uc]; /* Use base tables */ + } else { + rp = UniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + + +/* + * UniStrupr: Upper case a unicode string + */ +static inline wchar_t *UniStrupr(wchar_t * upin) +{ + wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniToupper(*up); + up++; + } + return upin; /* Return input pointer */ +} + +#endif /* !_H_JFS_UNICODE */ diff --git a/fs/jfs/jfs_uniupr.c b/fs/jfs/jfs_uniupr.c new file mode 100644 index 000000000..cfe50666d --- /dev/null +++ b/fs/jfs/jfs_uniupr.c @@ -0,0 +1,134 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include "jfs_unicode.h" + +/* + * Latin upper case + */ +signed char UniUpperTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */ + -32,-32,-32,-32,-32,-32,-32, 0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ + 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ + 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ + 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ + -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,-79, 0, -1, /* 1d0-1df */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ + 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ +}; + +/* Upper case range - Greek */ +static signed char UniCaseRangeU03a0[47] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-38,-37,-37,-37, /* 3a0-3af */ + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */ + -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63, +}; + +/* Upper case range - Cyrillic */ +static signed char UniCaseRangeU0430[48] = { + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */ + 0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80, 0,-80,-80, /* 450-45f */ +}; + +/* Upper case range - Extended cyrillic */ +static signed char UniCaseRangeU0490[61] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ + 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, +}; + +/* Upper case range - Extended latin and greek */ +static signed char UniCaseRangeU1e00[509] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ + 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0,-59, 0, -1, 0, -1, /* 1e90-1e9f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ + 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ + 74, 74, 86, 86, 86, 86,100,100, 0, 0,112,112,126,126, 0, 0, /* 1f70-1f7f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ + 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ + 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ + 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Upper case range - Wide latin */ +static signed char UniCaseRangeUff40[27] = { + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, +}; + +/* + * Upper Case Range + */ +UNICASERANGE UniUpperRange[] = { + { 0x03a0, 0x03ce, UniCaseRangeU03a0 }, + { 0x0430, 0x045f, UniCaseRangeU0430 }, + { 0x0490, 0x04cc, UniCaseRangeU0490 }, + { 0x1e00, 0x1ffc, UniCaseRangeU1e00 }, + { 0xff40, 0xff5a, UniCaseRangeUff40 }, + { 0 } +}; diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h new file mode 100644 index 000000000..561f6af46 --- /dev/null +++ b/fs/jfs/jfs_xattr.h @@ -0,0 +1,75 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef H_JFS_XATTR +#define H_JFS_XATTR + +#include <linux/xattr.h> + +/* + * jfs_ea_list describe the on-disk format of the extended attributes. + * I know the null-terminator is redundant since namelen is stored, but + * I am maintaining compatibility with OS/2 where possible. + */ +struct jfs_ea { + u8 flag; /* Unused? */ + u8 namelen; /* Length of name */ + __le16 valuelen; /* Length of value */ + char name[0]; /* Attribute name (includes null-terminator) */ +}; /* Value immediately follows name */ + +struct jfs_ea_list { + __le32 size; /* overall size */ + struct jfs_ea ea[0]; /* Variable length list */ +}; + +/* Macros for defining maxiumum number of bytes supported for EAs */ +#define MAXEASIZE 65535 +#define MAXEALISTSIZE MAXEASIZE + +/* + * some macros for dealing with variable length EA lists. + */ +#define EA_SIZE(ea) \ + (sizeof (struct jfs_ea) + (ea)->namelen + 1 + \ + le16_to_cpu((ea)->valuelen)) +#define NEXT_EA(ea) ((struct jfs_ea *) (((char *) (ea)) + (EA_SIZE (ea)))) +#define FIRST_EA(ealist) ((ealist)->ea) +#define EALIST_SIZE(ealist) le32_to_cpu((ealist)->size) +#define END_EALIST(ealist) \ + ((struct jfs_ea *) (((char *) (ealist)) + EALIST_SIZE(ealist))) + +extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *, + size_t, int); +extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); +extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); + +extern const struct xattr_handler *jfs_xattr_handlers[]; + +#ifdef CONFIG_JFS_SECURITY +extern int jfs_init_security(tid_t, struct inode *, struct inode *, + const struct qstr *); +#else +static inline int jfs_init_security(tid_t tid, struct inode *inode, + struct inode *dir, const struct qstr *qstr) +{ + return 0; +} +#endif + +#endif /* H_JFS_XATTR */ diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c new file mode 100644 index 000000000..2c200b525 --- /dev/null +++ b/fs/jfs/jfs_xtree.c @@ -0,0 +1,3890 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2005 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * jfs_xtree.c: extent allocation descriptor B+-tree manager + */ + +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/quotaops.h> +#include <linux/seq_file.h> +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_dinode.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * xtree local flag + */ +#define XT_INSERT 0x00000001 + +/* + * xtree key/entry comparison: extent offset + * + * return: + * -1: k < start of extent + * 0: start_of_extent <= k <= end_of_extent + * 1: k > end_of_extent + */ +#define XT_CMP(CMP, K, X, OFFSET64)\ +{\ + OFFSET64 = offsetXAD(X);\ + (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ + ((K) < OFFSET64) ? -1 : 0;\ +} + +/* write a xad entry */ +#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ +{\ + (XAD)->flag = (FLAG);\ + XADoffset((XAD), (OFF));\ + XADlength((XAD), (LEN));\ + XADaddress((XAD), (ADDR));\ +} + +#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) + +/* get page buffer for specified block address */ +/* ToDo: Replace this ugly macro with a function */ +#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ +do { \ + BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \ + if (!(RC)) { \ + if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \ + (le16_to_cpu((P)->header.nextindex) > \ + le16_to_cpu((P)->header.maxentry)) || \ + (le16_to_cpu((P)->header.maxentry) > \ + (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \ + jfs_error((IP)->i_sb, \ + "XT_GETPAGE: xtree page corrupt\n"); \ + BT_PUTPAGE(MP); \ + MP = NULL; \ + RC = -EIO; \ + } \ + } \ +} while (0) + +/* for consistency */ +#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) +/* xtree entry parameter descriptor */ +struct xtsplit { + struct metapage *mp; + s16 index; + u8 flag; + s64 off; + s64 addr; + int len; + struct pxdlist *pxdlist; +}; + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint search; + uint fastSearch; + uint split; +} xtStat; +#endif + + +/* + * forward references + */ +static int xtSearch(struct inode *ip, s64 xoff, s64 *next, int *cmpp, + struct btstack * btstack, int flag); + +static int xtSplitUp(tid_t tid, + struct inode *ip, + struct xtsplit * split, struct btstack * btstack); + +static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, + struct metapage ** rmpp, s64 * rbnp); + +static int xtSplitRoot(tid_t tid, struct inode *ip, + struct xtsplit * split, struct metapage ** rmpp); + +#ifdef _STILL_TO_PORT +static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, + xtpage_t * fp, struct btstack * btstack); + +static int xtSearchNode(struct inode *ip, + xad_t * xad, + int *cmpp, struct btstack * btstack, int flag); + +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); +#endif /* _STILL_TO_PORT */ + +/* + * xtLookup() + * + * function: map a single page into a physical extent; + */ +int xtLookup(struct inode *ip, s64 lstart, + s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index; + xad_t *xad; + s64 next, size, xoff, xend; + int xlen; + s64 xaddr; + + *paddr = 0; + *plen = llen; + + if (!no_check) { + /* is lookup offset beyond eof ? */ + size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + if (lstart >= size) + return 0; + } + + /* + * search for the xad entry covering the logical extent + */ +//search: + if ((rc = xtSearch(ip, lstart, &next, &cmp, &btstack, 0))) { + jfs_err("xtLookup: xtSearch returned %d", rc); + return rc; + } + + /* + * compute the physical extent covering logical extent + * + * N.B. search may have failed (e.g., hole in sparse file), + * and returned the index of the next entry. + */ + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* is xad found covering start of logical extent ? + * lstart is a page start address, + * i.e., lstart cannot start in a hole; + */ + if (cmp) { + if (next) + *plen = min(next - lstart, llen); + goto out; + } + + /* + * lxd covered by xad + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xend = xoff + xlen; + xaddr = addressXAD(xad); + + /* initialize new pxd */ + *pflag = xad->flag; + *paddr = xaddr + (lstart - xoff); + /* a page must be fully covered by an xad */ + *plen = min(xend - lstart, llen); + + out: + XT_PUTPAGE(mp); + + return rc; +} + +/* + * xtSearch() + * + * function: search for the xad entry covering specified offset. + * + * parameters: + * ip - file object; + * xoff - extent offset; + * nextp - address of next extent (if any) for search miss + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag (XT_INSERT); + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. + */ +static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, + int *cmpp, struct btstack * btstack, int flag) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + struct metapage *mp; /* page buffer */ + xtpage_t *p; /* page */ + xad_t *xad; + int base, index, lim, btindex; + struct btframe *btsp; + int nsplit = 0; /* number of pages to split */ + s64 t64; + s64 next = 0; + + INCREMENT(xtStat.search); + + BT_CLR(btstack); + + btstack->nsplit = 0; + + /* + * search down tree from root: + * + * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* try sequential access heuristics with the previous + * access entry in target leaf page: + * once search narrowed down into the target leaf, + * key must either match an entry in the leaf or + * key entry does not exist in the tree; + */ +//fastSearch: + if ((jfs_ip->btorder & BT_SEQUENTIAL) && + (p->header.flag & BT_LEAF) && + (index = jfs_ip->btindex) < + le16_to_cpu(p->header.nextindex)) { + xad = &p->xad[index]; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* stop sequential access heuristics */ + goto binarySearch; + } else { /* (t64 + lengthXAD(xad)) <= xoff */ + + /* try next sequential entry */ + index++; + if (index < + le16_to_cpu(p->header.nextindex)) { + xad++; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* miss: key falls between + * previous and this entry + */ + *cmpp = 1; + next = t64; + goto out; + } + + /* (xoff >= t64 + lengthXAD(xad)); + * matching entry may be further out: + * stop heuristic search + */ + /* stop sequential access heuristics */ + goto binarySearch; + } + + /* (index == p->header.nextindex); + * miss: key entry does not exist in + * the target leaf/tree + */ + *cmpp = 1; + goto out; + } + + /* + * if hit, return index of the entry found, and + * if miss, where new entry with search key is + * to be inserted; + */ + out: + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == /* little-endian */ + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* update sequential access heuristics */ + jfs_ip->btindex = index; + + if (nextp) + *nextp = next; + + INCREMENT(xtStat.fastSearch); + return 0; + } + + /* well, ... full search now */ + binarySearch: + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (index == btindex || + index == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = index; + + return 0; + } + /* search hit - internal page: + * descend/search its child page + */ + if (index < le16_to_cpu(p->header.nextindex)-1) + next = offsetXAD(&p->xad[index + 1]); + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. + */ + if (base < le16_to_cpu(p->header.nextindex)) + next = offsetXAD(&p->xad[base]); + /* + * search miss - leaf page: + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (base == btindex || base == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = base; + + if (nextp) + *nextp = next; + + return 0; + } + + /* + * search miss - non-leaf page: + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* update number of pages to split */ + if (p->header.nextindex == p->header.maxentry) + nsplit++; + else + nsplit = 0; + + /* push (bn, index) of the parent page/entry */ + if (BT_STACK_FULL(btstack)) { + jfs_error(ip->i_sb, "stack overrun!\n"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + +/* + * xtInsert() + * + * function: + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag (XAD_NOTRECORDED): + * xoff - extent offset; + * xlen - extent length; + * xaddrp - extent address pointer (in/out): + * if (*xaddrp) + * caller allocated data extent at *xaddrp; + * else + * allocate data extent and return its xaddr; + * flag - + * + * return: + */ +int xtInsert(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp, + int flag) +{ + int rc = 0; + s64 xaddr, hint; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + int cmp; + s64 next; + struct tlock *tlck; + struct xtlock *xtlck; + + jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. + */ + if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* This test must follow XT_GETSEARCH since mp must be valid if + * we branch to out: */ + if ((cmp == 0) || (next && (xlen > next - xoff))) { + rc = -EEXIST; + goto out; + } + + /* + * allocate data extent requested + * + * allocation hint: last xad + */ + if ((xaddr = *xaddrp) == 0) { + if (index > XTENTRYSTART) { + xad = &p->xad[index - 1]; + hint = addressXAD(xad) + lengthXAD(xad) - 1; + } else + hint = 0; + if ((rc = dquot_alloc_block(ip, xlen))) + goto out; + if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { + dquot_free_block(ip, xlen); + goto out; + } + } + + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex == le16_to_cpu(p->header.maxentry)) { + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + if (*xaddrp == 0) { + dbFree(ip, xaddr, (s64) xlen); + dquot_free_block(ip, xlen); + } + return rc; + } + + *xaddrp = xaddr; + return 0; + } + + /* + * insert the new entry into the leaf page + */ + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + + /* if insert into middle, shift right remaining entries. */ + if (index < nextindex) + memmove(&p->xad[index + 1], &p->xad[index], + (nextindex - index) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + *xaddrp = xaddr; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtSplitUp() + * + * function: + * split full pages as propagating insertion up the tree + * + * parameter: + * tid - transaction id; + * ip - file object; + * split - entry parameter descriptor; + * btstack - traverse stack from xtSearch() + * + * return: + */ +static int +xtSplitUp(tid_t tid, + struct inode *ip, struct xtsplit * split, struct btstack * btstack) +{ + int rc = 0; + struct metapage *smp; + xtpage_t *sp; /* split page */ + struct metapage *rmp; + s64 rbn; /* new right page block number */ + struct metapage *rcmp; + xtpage_t *rcp; /* right child page */ + s64 rcbn; /* right child page block number */ + int skip; /* index of entry of insertion */ + int nextindex; /* next available entry index of p */ + struct btframe *parent; /* parent page entry on traverse stack */ + xad_t *xad; + s64 xaddr; + int xlen; + int nsplit; /* number of pages split */ + struct pxdlist pxdlist; + pxd_t *pxd; + struct tlock *tlck; + struct xtlock *xtlck; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + /* is inode xtree root extension/inline EA area free ? */ + if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) && + (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) && + (JFS_IP(ip)->mode2 & INLINEEA)) { + sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT); + JFS_IP(ip)->mode2 &= ~INLINEEA; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + + /* if insert into middle, shift right remaining entries. */ + skip = split->index; + nextindex = le16_to_cpu(sp->header.nextindex); + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* advance next available entry index */ + le16_add_cpu(&sp->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + return 0; + } + + /* + * allocate new index blocks to cover index page split(s) + * + * allocation hint: ? + */ + if (split->pxdlist == NULL) { + nsplit = btstack->nsplit; + split->pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + xlen = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++) { + if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr)) + == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + XT_PUTPAGE(smp); + return rc; + } + } + + /* + * Split leaf page <sp> into <sp> and a new right page <rp>. + * + * The split routines insert the new entry into the leaf page, + * and acquire txLock as appropriate. + * return <rp> pinned and its block number <rpbn>. + */ + rc = (sp->header.flag & BT_ROOT) ? + xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + + XT_PUTPAGE(smp); + + if (rc) + return -EIO; + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. + * + * There are a maximum of 3 pages pinned at any time: + * right child, left parent and right parent (when the parent splits) + * to keep the child page pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame <parent> */ + + /* keep current child pages <rcp> pinned */ + rcmp = rmp; + rcbn = rbn; + rcp = XT_PAGE(ip, rcmp); + + /* + * insert router entry in parent for new right child page <rp> + */ + /* get/pin the parent page <sp> */ + XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + XT_PUTPAGE(rcmp); + return rc; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * split or shift right remaining entries of the parent page + */ + nextindex = le16_to_cpu(sp->header.nextindex); + /* + * parent page is full - split the parent page + */ + if (nextindex == le16_to_cpu(sp->header.maxentry)) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->flag = XAD_NEW; + split->off = offsetXAD(&rcp->xad[XTENTRYSTART]); + split->len = JFS_SBI(ip->i_sb)->nbperpage; + split->addr = rcbn; + + /* unpin previous right child page */ + XT_PUTPAGE(rcmp); + + /* The split routines insert the new entry, + * and acquire txLock as appropriate. + * return <rp> pinned and its block number <rpbn>. + */ + rc = (sp->header.flag & BT_ROOT) ? + xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + if (rc) { + XT_PUTPAGE(smp); + return rc; + } + + XT_PUTPAGE(smp); + /* keep new child page <rp> pinned */ + } + /* + * parent page is not full - insert in parent page + */ + else { + /* + * insert router entry in parent for the right child + * page from the first entry of the right child page: + */ + /* + * acquire a transaction lock on the parent page; + * + * action: router xad insertion; + */ + BT_MARK_DIRTY(smp, ip); + + /* + * if insert into middle, shift right remaining entries + */ + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - + skip) << L2XTSLOTSIZE); + + /* insert the router entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, XAD_NEW, + offsetXAD(&rcp->xad[XTENTRYSTART]), + JFS_SBI(ip->i_sb)->nbperpage, rcbn); + + /* advance next available entry index. */ + le16_add_cpu(&sp->header.nextindex, 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, + tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin parent page */ + XT_PUTPAGE(smp); + + /* exit propagate up */ + break; + } + } + + /* unpin current right page */ + XT_PUTPAGE(rmp); + + return 0; +} + + +/* + * xtSplitPage() + * + * function: + * split a full non-root page into + * original/split/left page and new right page + * i.e., the original/split page remains as left page. + * + * parameter: + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp, + * u64 *rbnp, + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitPage(tid_t tid, struct inode *ip, + struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp) +{ + int rc = 0; + struct metapage *smp; + xtpage_t *sp; + struct metapage *rmp; + xtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + struct metapage *mp; + xtpage_t *p; + s64 nextbn; + int skip, maxentry, middle, righthalf, n; + xad_t *xad; + struct pxdlist *pxdlist; + pxd_t *pxd; + struct tlock *tlck; + struct xtlock *sxtlck = NULL, *rxtlck = NULL; + int quota_allocation = 0; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + INCREMENT(xtStat.split); + + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) + goto clean_up; + + quota_allocation += lengthPXD(pxd); + + /* + * allocate the new right page for the split + */ + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) { + rc = -EIO; + goto clean_up; + } + + jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); + + BT_MARK_DIRTY(rmp, ip); + /* + * action: new page; + */ + + rp = (xtpage_t *) rmp->data; + rp->header.self = *pxd; + rp->header.flag = sp->header.flag & BT_TYPE; + rp->header.maxentry = sp->header.maxentry; /* little-endian */ + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + BT_MARK_DIRTY(smp, ip); + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + /* + * acquire a transaction lock on the new right page; + */ + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + rxtlck = (struct xtlock *) & tlck->lock; + rxtlck->lwm.offset = XTENTRYSTART; + /* + * acquire a transaction lock on the split page + */ + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + sxtlck = (struct xtlock *) & tlck->lock; + } + + /* + * initialize/update sibling pointers of <sp> and <rp> + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + skip = split->index; + + /* + * sequential append at tail (after last entry of last page) + * + * if splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * if we're wrong it's no big deal - we will do the split the right + * way next time. + * (it may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, but it's not. + * Be my guest.) + */ + if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) { + /* + * acquire a transaction lock on the new/right page; + * + * action: xad insertion; + */ + /* insert entry at the first entry of the new right page */ + xad = &rp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = 1; + } + + *rmpp = rmp; + *rbnp = rbn; + + jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); + return 0; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update previous pointer of old next/right page of <sp> + */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(rmp); + goto clean_up; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page; + * + * action:sibling pointer update; + */ + if (!test_cflag(COMMIT_Nolink, ip)) + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + p->header.prev = cpu_to_le64(rbn); + + /* sibling page may have been updated previously, or + * it may be updated later; + */ + + XT_PUTPAGE(mp); + } + + /* + * split the data between the split and new/right pages + */ + maxentry = le16_to_cpu(sp->header.maxentry); + middle = maxentry >> 1; + righthalf = maxentry - middle; + + /* + * skip index in old split/left page - insert into left page: + */ + if (skip <= middle) { + /* move right half of split page to the new right page */ + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + righthalf << L2XTSLOTSIZE); + + /* shift right tail of left half to make room for new entry */ + if (skip < middle) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (middle - skip) << L2XTSLOTSIZE); + + /* insert new entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle + 1); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(skip, (int)sxtlck->lwm.offset) : skip; + } + + rp->header.nextindex = + cpu_to_le16(XTENTRYSTART + righthalf); + } + /* + * skip index in new right page - insert into right page: + */ + else { + /* move left head of right half to right page */ + n = skip - middle; + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + n << L2XTSLOTSIZE); + + /* insert new entry */ + n += XTENTRYSTART; + xad = &rp->xad[n]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* move right tail of right half to right page */ + if (skip < maxentry) + memmove(&rp->xad[n + 1], &sp->xad[skip], + (maxentry - skip) << L2XTSLOTSIZE); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(middle, (int)sxtlck->lwm.offset) : middle; + } + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + + righthalf + 1); + } + + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) - + sxtlck->lwm.offset; + + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + *rmpp = rmp; + *rbnp = rbn; + + jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); + return rc; + + clean_up: + + /* Rollback quota allocation. */ + if (quota_allocation) + dquot_free_block(ip, quota_allocation); + + return (rc); +} + + +/* + * xtSplitRoot() + * + * function: + * split the full root page into original/root/split page and new + * right page + * i.e., root remains fixed in tree anchor (inode) and the root is + * copied to a single new right child page since root page << + * non-root page, and the split root page contains a single entry + * for the new right child page. + * + * parameter: + * int tid, + * struct inode *ip, + * struct xtsplit *split, + * struct metapage **rmpp) + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitRoot(tid_t tid, + struct inode *ip, struct xtsplit * split, struct metapage ** rmpp) +{ + xtpage_t *sp; + struct metapage *rmp; + xtpage_t *rp; + s64 rbn; + int skip, nextindex; + xad_t *xad; + pxd_t *pxd; + struct pxdlist *pxdlist; + struct tlock *tlck; + struct xtlock *xtlck; + int rc; + + sp = &JFS_IP(ip)->i_xtroot; + + INCREMENT(xtStat.split); + + /* + * allocate a single (right) child page + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return -EIO; + + /* Allocate blocks to quota. */ + rc = dquot_alloc_block(ip, lengthPXD(pxd)); + if (rc) { + release_metapage(rmp); + return rc; + } + + jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); + + /* + * acquire a transaction lock on the new right page; + * + * action: new page; + */ + BT_MARK_DIRTY(rmp, ip); + + rp = (xtpage_t *) rmp->data; + rp->header.flag = + (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE); + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * copy the in-line root page into new right page extent + */ + nextindex = le16_to_cpu(sp->header.maxentry); + memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART], + (nextindex - XTENTRYSTART) << L2XTSLOTSIZE); + + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + skip = split->index; + /* if insert into middle, shift right remaining entries */ + if (skip != nextindex) + memmove(&rp->xad[skip + 1], &rp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + xad = &rp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); + + /* update page header */ + rp->header.nextindex = cpu_to_le16(nextindex + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + /* + * reset the root + * + * init root with the single entry for the new right page + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. + */ + /* + * acquire a transaction lock on the root page (in-memory inode); + * + * action: root split; + */ + BT_MARK_DIRTY(split->mp, ip); + + xad = &sp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn); + + /* update page header of root */ + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + + sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = 1; + } + + *rmpp = rmp; + + jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp); + return 0; +} + + +/* + * xtExtend() + * + * function: extend in-place; + * + * note: existing extent may or may not have been committed. + * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: alloc whole extended extent; + */ +int xtExtend(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* delta extent offset */ + s32 xlen, /* delta extent length */ + int flag) +{ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, len; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + s64 xaddr; + struct tlock *tlck; + struct xtlock *xtlck = NULL; + + jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); + + /* there must exist extent to be extended */ + if ((rc = xtSearch(ip, xoff - 1, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtSearch did not find extent\n"); + return -EIO; + } + + /* extension must be contiguous */ + xad = &p->xad[index]; + if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "extension is not contiguous\n"); + return -EIO; + } + + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + /* extend will overflow extent ? */ + xlen = lengthXAD(xad) + xlen; + if ((len = xlen - MAXXLEN) <= 0) + goto extendOld; + + /* + * extent overflow: insert entry for new extent + */ +//insertNew: + xoff = offsetXAD(xad) + MAXXLEN; + xaddr = addressXAD(xad) + MAXXLEN; + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = len; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + } + + /* get back old entry */ + xad = &p->xad[index]; + xlen = MAXXLEN; + + /* + * extend old extent + */ + extendOld: + XADlength(xad, xlen); + if (!(xad->flag & XAD_NEW)) + xad->flag |= XAD_EXTENDED; + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + +#ifdef _NOTYET +/* + * xtTailgate() + * + * function: split existing 'tail' extent + * (split offset >= start offset of tail extent), and + * relocate and extend the split tail half; + * + * note: existing extent may or may not have been committed. + * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: free old split tail extent, alloc new extent; + */ +int xtTailgate(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* split/new extent offset */ + s32 xlen, /* new extent length */ + s64 xaddr, /* new extent address */ + int flag) +{ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, llen, rlen; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + struct tlock *tlck; + struct xtlock *xtlck = 0; + struct tlock *mtlck; + struct maplock *pxdlock; + +/* +printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", + (ulong)xoff, xlen, (ulong)xaddr); +*/ + + /* there must exist extent to be tailgated */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "couldn't find extent\n"); + return -EIO; + } + + /* entry found must be last entry */ + nextindex = le16_to_cpu(p->header.nextindex); + if (index != nextindex - 1) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "the entry found is not the last entry\n"); + return -EIO; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + /* completely replace extent ? */ + xad = &p->xad[index]; +/* +printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", + (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); +*/ + if ((llen = xoff - offsetXAD(xad)) == 0) + goto updateOld; + + /* + * partially replace extent: insert entry for new extent + */ +//insertNew: + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + } + + /* get back old XAD */ + xad = &p->xad[index]; + + /* + * truncate/relocate old extent at split offset + */ + updateOld: + /* update dmap for old/committed/truncated extent */ + rlen = lengthXAD(xad) - llen; + if (!(xad->flag & XAD_NEW)) { + /* free from PWMAP at commit */ + if (!test_cflag(COMMIT_Nolink, ip)) { + mtlck = txMaplock(tid, ip, tlckMAP); + pxdlock = (struct maplock *) & mtlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen); + PXDlength(&pxdlock->pxd, rlen); + pxdlock->index = 1; + } + } else + /* free from WMAP */ + dbFree(ip, addressXAD(xad) + llen, (s64) rlen); + + if (llen) + /* truncate */ + XADlength(xad, llen); + else + /* replace */ + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} +#endif /* _NOTYET */ + +/* + * xtUpdate() + * + * function: update XAD; + * + * update extent for allocated_but_not_recorded or + * compressed extent; + * + * parameter: + * nxad - new XAD; + * logical extent of the specified XAD must be completely + * contained by an existing XAD; + */ +int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) +{ /* new XAD */ + int rc = 0; + int cmp; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index0, index, newindex, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad, *lxad, *rxad; + int xflag; + s64 nxoff, xoff; + int nxlen, xlen, lxlen, rxlen; + s64 nxaddr, xaddr; + struct tlock *tlck; + struct xtlock *xtlck = NULL; + int newpage = 0; + + /* there must exist extent to be tailgated */ + nxoff = offsetXAD(nxad); + nxlen = lengthXAD(nxad); + nxaddr = addressXAD(nxad); + + if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "Could not find extent\n"); + return -EIO; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + xad = &p->xad[index0]; + xflag = xad->flag; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* nXAD must be completely contained within XAD */ + if ((xoff > nxoff) || + (nxoff + nxlen > xoff + xlen)) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "nXAD in not completely contained within XAD\n"); + return -EIO; + } + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + +#ifdef _JFS_WIP_NOCOALESCE + if (xoff < nxoff) + goto updateRight; + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto out; + } else /* (nxlen < xlen) */ + goto updateLeft; +#endif /* _JFS_WIP_NOCOALESCE */ + +/* #ifdef _JFS_WIP_COALESCE */ + if (xoff < nxoff) + goto coalesceRight; + + /* + * coalesce with left XAD + */ +//coalesceLeft: /* (xoff == nxoff) */ + /* is XAD first entry of page ? */ + if (index == XTENTRYSTART) + goto replace; + + /* is nXAD logically and physically contiguous with lXAD ? */ + lxad = &p->xad[index - 1]; + lxlen = lengthXAD(lxad); + if (!(lxad->flag & XAD_NOTRECORDED) && + (nxoff == offsetXAD(lxad) + lxlen) && + (nxaddr == addressXAD(lxad) + lxlen) && + (lxlen + nxlen < MAXXLEN)) { + /* extend right lXAD */ + index0 = index - 1; + XADlength(lxad, lxlen + nxlen); + + /* If we just merged two extents together, need to make sure the + * right extent gets logged. If the left one is marked XAD_NEW, + * then we know it will be logged. Otherwise, mark as + * XAD_EXTENDED + */ + if (!(lxad->flag & XAD_NEW)) + lxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) { + /* truncate XAD */ + XADoffset(xad, xoff + nxlen); + XADlength(xad, xlen - nxlen); + XADaddress(xad, xaddr + nxlen); + goto out; + } else { /* (xlen == nxlen) */ + + /* remove XAD */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xoff = nxoff = offsetXAD(lxad); + xlen = nxlen = lxlen + nxlen; + xaddr = nxaddr = addressXAD(lxad); + goto coalesceRight; + } + } + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto coalesceRight; + } else /* (nxlen < xlen) */ + goto updateLeft; + + /* + * coalesce with right XAD + */ + coalesceRight: /* (xoff <= nxoff) */ + /* is XAD last entry of page ? */ + if (newindex == nextindex) { + if (xoff == nxoff) + goto out; + goto updateRight; + } + + /* is nXAD logically and physically contiguous with rXAD ? */ + rxad = &p->xad[index + 1]; + rxlen = lengthXAD(rxad); + if (!(rxad->flag & XAD_NOTRECORDED) && + (nxoff + nxlen == offsetXAD(rxad)) && + (nxaddr + nxlen == addressXAD(rxad)) && + (rxlen + nxlen < MAXXLEN)) { + /* extend left rXAD */ + XADoffset(rxad, nxoff); + XADlength(rxad, rxlen + nxlen); + XADaddress(rxad, nxaddr); + + /* If we just merged two extents together, need to make sure + * the left extent gets logged. If the right one is marked + * XAD_NEW, then we know it will be logged. Otherwise, mark as + * XAD_EXTENDED + */ + if (!(rxad->flag & XAD_NEW)) + rxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) + /* truncate XAD */ + XADlength(xad, xlen - nxlen); + else { /* (xlen == nxlen) */ + + /* remove XAD */ + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + } + + goto out; + } else if (xoff == nxoff) + goto out; + + if (xoff >= nxoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xoff >= nxoff\n"); + return -EIO; + } +/* #endif _JFS_WIP_COALESCE */ + + /* + * split XAD into (lXAD, nXAD): + * + * |---nXAD---> + * --|----------XAD----------|-- + * |-lXAD-| + */ + updateRight: /* (xoff < nxoff) */ + /* truncate old XAD as lXAD:not_recorded */ + xad = &p->xad[index]; + XADlength(xad, nxoff - xoff); + + /* insert nXAD:recorded */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag & ~XAD_NOTRECORDED; + split.off = nxoff; + split.len = nxlen; + split.addr = nxaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } else { + /* is nXAD on new page ? */ + if (newindex > + (le16_to_cpu(p->header.maxentry) >> 1)) { + newindex = + newindex - + le16_to_cpu(p->header.nextindex) + + XTENTRYSTART; + newpage = 1; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + /* + * does nXAD force 3-way split ? + * + * |---nXAD--->| + * --|----------XAD-------------|-- + * |-lXAD-| |-rXAD -| + */ + if (nxoff + nxlen == xoff + xlen) + goto out; + + /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */ + if (newpage) { + /* close out old page */ + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + bn = le64_to_cpu(p->header.next); + XT_PUTPAGE(mp); + + /* get new right page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + + index0 = index = newindex; + } else + index++; + + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xlen = xlen - (nxoff - xoff); + xoff = nxoff; + xaddr = nxaddr; + + /* recompute split pages */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + XT_PUTPAGE(mp); + + if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtSearch failed\n"); + return -EIO; + } + + if (index0 != index) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "unexpected value of index\n"); + return -EIO; + } + } + + /* + * split XAD into (nXAD, rXAD) + * + * ---nXAD---| + * --|----------XAD----------|-- + * |-rXAD-| + */ + updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */ + /* update old XAD with nXAD:recorded */ + xad = &p->xad[index]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* insert rXAD:not_recorded */ + xoff = xoff + nxlen; + xlen = xlen - nxlen; + xaddr = xaddr + nxlen; + if (nextindex == le16_to_cpu(p->header.maxentry)) { +/* +printf("xtUpdate.updateLeft.split p:0x%p\n", p); +*/ + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (p->header.flag & BT_INTERNAL) { + ASSERT(p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)); + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + XT_PUTPAGE(mp); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + out: + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtAppend() + * + * function: grow in append mode from contiguous region specified ; + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag: + * xoff - extent offset; + * maxblocks - max extent length; + * xlen - extent length (in/out); + * xaddrp - extent address pointer (in/out): + * flag - + * + * return: + */ +int xtAppend(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 maxblocks, + s32 * xlenp, /* (in/out) */ + s64 * xaddrp, /* (in/out) */ + int flag) +{ + int rc = 0; + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn, xaddr; + int index, nextindex; + struct btstack btstack; /* traverse stack */ + struct xtsplit split; /* split information */ + xad_t *xad; + int cmp; + struct tlock *tlck; + struct xtlock *xtlck; + int nsplit, nblocks, xlen; + struct pxdlist pxdlist; + pxd_t *pxd; + s64 next; + + xaddr = *xaddrp; + xlen = *xlenp; + jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx", + (ulong) xoff, maxblocks, xlen, (ulong) xaddr); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. + */ + if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp == 0) { + rc = -EEXIST; + goto out; + } + + if (next) + xlen = min(xlen, (int)(next - xoff)); +//insert: + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex < le16_to_cpu(p->header.maxentry)) + goto insertLeaf; + + /* + * allocate new index blocks to cover index page split(s) + */ + nsplit = btstack.nsplit; + split.pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + nblocks = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, nblocks); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + goto out; + } + + xlen = min(xlen, maxblocks); + + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + dbFree(ip, *xaddrp, (s64) * xlenp); + + return rc; + } + + *xaddrp = xaddr; + *xlenp = xlen; + return 0; + + /* + * insert the new entry into the leaf page + */ + insertLeaf: + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + le16_add_cpu(&p->header.nextindex, 1); + + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + + *xaddrp = xaddr; + *xlenp = xlen; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} +#ifdef _STILL_TO_PORT + +/* - TBD for defragmentaion/reorganization - + * + * xtDelete() + * + * function: + * delete the entry with the specified key. + * + * N.B.: whole extent of the entry is assumed to be deleted. + * + * parameter: + * + * return: + * ENOENT: if the entry is not found. + * + * exception: + */ +int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) +{ + int rc = 0; + struct btstack btstack; + int cmp; + s64 bn; + struct metapage *mp; + xtpage_t *p; + int index, nextindex; + struct tlock *tlck; + struct xtlock *xtlck; + + /* + * find the matching entry; xtSearch() pins the page + */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + if (cmp) { + /* unpin the leaf page */ + XT_PUTPAGE(mp); + return -ENOENT; + } + + /* + * delete the entry from the leaf page + */ + nextindex = le16_to_cpu(p->header.nextindex); + le16_add_cpu(&p->header.nextindex, -1); + + /* + * if the leaf page bocome empty, free the page + */ + if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) + return (xtDeleteUp(tid, ip, mp, p, &btstack)); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index; + + /* if delete from middle, shift left/compact the remaining entries */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) * sizeof(xad_t)); + + XT_PUTPAGE(mp); + + return 0; +} + + +/* - TBD for defragmentaion/reorganization - + * + * xtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int +xtDeleteUp(tid_t tid, struct inode *ip, + struct metapage * fmp, xtpage_t * fp, struct btstack * btstack) +{ + int rc = 0; + struct metapage *mp; + xtpage_t *p; + int index, nextindex; + s64 xaddr; + int xlen; + struct btframe *parent; + struct tlock *tlck; + struct xtlock *xtlck; + + /* + * keep root leaf page which has become empty + */ + if (fp->header.flag & BT_ROOT) { + /* keep the root page */ + fp->header.flag &= ~BT_INTERNAL; + fp->header.flag |= BT_LEAF; + fp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(fmp); */ + + return 0; + } + + /* + * free non-root leaf page + */ + if ((rc = xtRelink(tid, ip, fp))) { + XT_PUTPAGE(fmp); + return rc; + } + + xaddr = addressPXD(&fp->header.self); + xlen = lengthPXD(&fp->header.self); + /* free the page extent */ + dbFree(ip, xaddr, (s64) xlen); + + /* free the buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the index tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* get/pin the parent page <sp> */ + XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* delete the entry for the freed child page from parent. + */ + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * the parent has the single entry being deleted: + * free the parent page which has become empty. + */ + if (nextindex == 1) { + if (p->header.flag & BT_ROOT) { + /* keep the root page */ + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = + cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(mp); */ + + break; + } else { + /* free the parent page */ + if ((rc = xtRelink(tid, ip, p))) + return rc; + + xaddr = addressPXD(&p->header.self); + /* free the page extent */ + dbFree(ip, xaddr, + (s64) JFS_SBI(ip->i_sb)->nbperpage); + + /* unpin/free the buffer page */ + discard_metapage(mp); + + /* propagate up */ + continue; + } + } + /* + * the parent has other entries remaining: + * delete the router entry from the parent page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + xtlck->lwm. + offset) : index; + + /* if delete from middle, + * shift left/compact the remaining entries in the page + */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + le16_add_cpu(&p->header.nextindex, -1); + jfs_info("xtDeleteUp(entry): 0x%lx[%d]", + (ulong) parent->bn, index); + } + + /* unpin the parent page */ + XT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + return 0; +} + + +/* + * NAME: xtRelocate() + * + * FUNCTION: relocate xtpage or data extent of regular file; + * This function is mainly used by defragfs utility. + * + * NOTE: This routine does not have the logic to handle + * uncommitted allocated extent. The caller should call + * txCommit() to commit all the allocation before call + * this routine. + */ +int +xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ + s64 nxaddr, /* new xaddr */ + int xtype) +{ /* extent type: XTPAGE or DATAEXT */ + int rc = 0; + struct tblock *tblk; + struct tlock *tlck; + struct xtlock *xtlck; + struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ + xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ + xad_t *xad; + pxd_t *pxd; + s64 xoff, xsize; + int xlen; + s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; + cbuf_t *cp; + s64 offset, nbytes, nbrd, pno; + int nb, npages, nblks; + s64 bn; + int cmp; + int index; + struct pxd_lock *pxdlock; + struct btstack btstack; /* traverse stack */ + + xtype = xtype & EXTENT_TYPE; + + xoff = offsetXAD(oxad); + oxaddr = addressXAD(oxad); + xlen = lengthXAD(oxad); + + /* validate extent offset */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + if (offset >= ip->i_size) + return -ESTALE; /* stale extent */ + + jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx", + xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); + + /* + * 1. get and validate the parent xtpage/xad entry + * covering the source extent to be relocated; + */ + if (xtype == DATAEXT) { + /* search in leaf entry */ + rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); + if (rc) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + if (cmp) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + + /* validate for exact match with a single entry */ + xad = &pp->xad[index]; + if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + } else { /* (xtype == XTPAGE) */ + + /* search in internal entry */ + rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); + if (rc) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + if (cmp) { + XT_PUTPAGE(pmp); + return -ESTALE; + } + + /* xtSearchNode() validated for exact match with a single entry + */ + xad = &pp->xad[index]; + } + jfs_info("xtRelocate: parent xad entry validated."); + + /* + * 2. relocate the extent + */ + if (xtype == DATAEXT) { + /* if the extent is allocated-but-not-recorded + * there is no real data to be moved in this extent, + */ + if (xad->flag & XAD_NOTRECORDED) + goto out; + else + /* release xtpage for cmRead()/xtLookup() */ + XT_PUTPAGE(pmp); + + /* + * cmRelocate() + * + * copy target data pages to be relocated; + * + * data extent must start at page boundary and + * multiple of page size (except the last data extent); + * read in each page of the source data extent into cbuf, + * update the cbuf extent descriptor of the page to be + * homeward bound to new dst data extent + * copy the data from the old extent to new extent. + * copy is essential for compressed files to avoid problems + * that can arise if there was a change in compression + * algorithms. + * it is a good strategy because it may disrupt cache + * policy to keep the pages in memory afterwards. + */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + assert((offset & CM_OFFSET) == 0); + nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; + pno = offset >> CM_L2BSIZE; + npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; +/* + npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - + (offset >> CM_L2BSIZE) + 1; +*/ + sxaddr = oxaddr; + dxaddr = nxaddr; + + /* process the request one cache buffer at a time */ + for (nbrd = 0; nbrd < nbytes; nbrd += nb, + offset += nb, pno++, npages--) { + /* compute page size */ + nb = min(nbytes - nbrd, CM_BSIZE); + + /* get the cache buffer of the page */ + if (rc = cmRead(ip, offset, npages, &cp)) + break; + + assert(addressPXD(&cp->cm_pxd) == sxaddr); + assert(!cp->cm_modified); + + /* bind buffer with the new extent address */ + nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; + cmSetXD(ip, cp, pno, dxaddr, nblks); + + /* release the cbuf, mark it as modified */ + cmPut(cp, true); + + dxaddr += nblks; + sxaddr += nblks; + } + + /* get back parent page */ + if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jfs_info("xtRelocate: target data extent relocated."); + } else { /* (xtype == XTPAGE) */ + + /* + * read in the target xtpage from the source extent; + */ + XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + if (rmp) + XT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling xtpages if any; + */ + if (lmp) { + BT_MARK_DIRTY(lmp, ip); + tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); + lp->header.next = cpu_to_le64(nxaddr); + XT_PUTPAGE(lmp); + } + + if (rmp) { + BT_MARK_DIRTY(rmp, ip); + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); + rp->header.prev = cpu_to_le64(nxaddr); + XT_PUTPAGE(rmp); + } + + /* + * update the target xtpage to be relocated + * + * update the self address of the target page + * and write to destination extent; + * redo image covers the whole xtpage since it is new page + * to the destination extent; + * update of bmap for the free of source extent + * of the target xtpage itself: + * update of bmap for the allocation of destination extent + * of the target xtpage itself: + * update of bmap for the extents covered by xad entries in + * the target xtpage is not necessary since they are not + * updated; + * if not committed before this relocation, + * target page may contain XAD_NEW entries which must + * be scanned for bmap update (logredo() always + * scan xtpage REDOPAGE image for bmap update); + * if committed before this relocation (tlckRELOCATE), + * scan may be skipped by commit() and logredo(); + */ + BT_MARK_DIRTY(mp, ip); + /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); + xtlck = (struct xtlock *) & tlck->lock; + + /* update the self address in the xtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* linelock for the after image of the whole page */ + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + + /* update the buffer extent descriptor of target xtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; + bmSetXD(mp, nxaddr, xsize); + + /* unpin the target page to new homeward bound */ + XT_PUTPAGE(mp); + jfs_info("xtRelocate: target xtpage relocated."); + } + + /* + * 3. acquire maplock for the source extent to be freed; + * + * acquire a maplock saving the src relocated extent address; + * to free of the extent at commit time; + */ + out: + /* if DATAEXT relocation, write a LOG_UPDATEMAP record for + * free PXD of the source data extent (logredo() will update + * bmap for free of source data extent), and update bmap for + * free of the source data extent; + */ + if (xtype == DATAEXT) + tlck = txMaplock(tid, ip, tlckMAP); + /* if XTPAGE relocation, write a LOG_NOREDOPAGE record + * for the source xtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * xtpage), and update bmap for free of the source xtpage; + * N.B. We use tlckMAP instead of tlkcXTREE because there + * is no buffer associated with this lock since the buffer + * has been redirected to the target location. + */ + else /* (xtype == XTPAGE) */ + tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); + + pxdlock = (struct pxd_lock *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. update the parent xad entry for relocation; + * + * acquire tlck for the parent entry with XAD_NEW as entry + * update which will write LOG_REDOPAGE and update bmap for + * allocation of XAD_NEW destination extent; + */ + jfs_info("xtRelocate: update parent xad entry."); + BT_MARK_DIRTY(pmp, ip); + tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); + xtlck = (struct xtlock *) & tlck->lock; + + /* update the XAD with the new destination extent; */ + xad = &pp->xad[index]; + xad->flag |= XAD_NEW; + XADaddress(xad, nxaddr); + + xtlck->lwm.offset = min(index, xtlck->lwm.offset); + xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - + xtlck->lwm.offset; + + /* unpin the parent xtpage */ + XT_PUTPAGE(pmp); + + return rc; +} + + +/* + * xtSearchNode() + * + * function: search for the internal xad entry covering specified extent. + * This function is mainly used by defragfs utility. + * + * parameters: + * ip - file object; + * xad - extent to find; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag; + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. + */ +static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ + int *cmpp, struct btstack * btstack, int flag) +{ + int rc = 0; + s64 xoff, xaddr; + int xlen; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + struct metapage *mp; /* meta-page buffer */ + xtpage_t *p; /* page */ + int base, index, lim; + struct btframe *btsp; + s64 t64; + + BT_CLR(btstack); + + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * search down tree from root: + * + * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + if (p->header.flag & BT_LEAF) { + XT_PUTPAGE(mp); + return -ESTALE; + } + + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + * + * verify for exact match; + */ + if (xaddr == addressXAD(&p->xad[index]) && + xoff == offsetXAD(&p->xad[index])) { + *cmpp = cmp; + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + return 0; + } + + /* descend/search its child page */ + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss - non-leaf page: + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + + +/* + * xtRelink() + * + * function: + * link around a freed page. + * + * Parameter: + * int tid, + * struct inode *ip, + * xtpage_t *p) + * + * returns: + */ +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) +{ + int rc = 0; + struct metapage *mp; + s64 nextbn, prevbn; + struct tlock *tlck; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update prev pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.prev = cpu_to_le64(prevbn); + + XT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update next pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.next = le64_to_cpu(nextbn); + + XT_PUTPAGE(mp); + } + + return 0; +} +#endif /* _STILL_TO_PORT */ + + +/* + * xtInitRoot() + * + * initialize file root (inline in inode) + */ +void xtInitRoot(tid_t tid, struct inode *ip) +{ + xtpage_t *p; + + /* + * acquire a transaction lock on the root + * + * action: + */ + txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag, + tlckXTREE | tlckNEW); + p = &JFS_IP(ip)->i_xtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + if (S_ISDIR(ip->i_mode)) + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR); + else { + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT); + ip->i_size = 0; + } + + + return; +} + + +/* + * We can run into a deadlock truncating a file with a large number of + * xtree pages (large fragmented file). A robust fix would entail a + * reservation system where we would reserve a number of metadata pages + * and tlocks which we would be guaranteed without a deadlock. Without + * this, a partial fix is to limit number of metadata pages we will lock + * in a single transaction. Currently we will truncate the file so that + * no more than 50 leaf pages will be locked. The caller of xtTruncate + * will be responsible for ensuring that the current transaction gets + * committed, and that subsequent transactions are created to truncate + * the file further if needed. + */ +#define MAX_TRUNCATE_LEAVES 50 + +/* + * xtTruncate() + * + * function: + * traverse for truncation logging backward bottom up; + * terminate at the last extent entry at the current subtree + * root page covering new down size. + * truncation may occur within the last extent entry. + * + * parameter: + * int tid, + * struct inode *ip, + * s64 newsize, + * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} + * + * return: + * + * note: + * PWMAP: + * 1. truncate (non-COMMIT_NOLINK file) + * by jfs_truncate() or jfs_open(O_TRUNC): + * xtree is updated; + * 2. truncate index table of directory when last entry removed + * map update via tlock at commit time; + * PMAP: + * Call xtTruncate_pmap instead + * WMAP: + * 1. remove (free zero link count) on last reference release + * (pmap has been freed at commit zero link count); + * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): + * xtree is updated; + * map update directly at truncation time; + * + * if (DELETE) + * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); + * else if (TRUNCATE) + * must write LOG_NOREDOPAGE for deleted index page; + * + * pages may already have been tlocked by anonymous transactions + * during file growth (i.e., write) before truncation; + * + * except last truncated entry, deleted entries remains as is + * in the page (nextindex is updated) for other use + * (e.g., log/update allocation map): this avoid copying the page + * info but delay free of pages; + * + */ +s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) +{ + int rc = 0; + s64 teof; + struct metapage *mp; + xtpage_t *p; + s64 bn; + int index, nextindex; + xad_t *xad; + s64 xoff, xaddr; + int xlen, len, freexlen; + struct btstack btstack; + struct btframe *parent; + struct tblock *tblk = NULL; + struct tlock *tlck = NULL; + struct xtlock *xtlck = NULL; + struct xdlistlock xadlock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + s64 nfreed; + int freed, log; + int locked_leaves = 0; + + /* save object truncation type */ + if (tid) { + tblk = tid_to_tblock(tid); + tblk->xflag |= flag; + } + + nfreed = 0; + + flag &= COMMIT_MAP; + assert(flag != COMMIT_PMAP); + + if (flag == COMMIT_PWMAP) + log = 1; + else { + log = 0; + xadlock.flag = mlckFREEXADLIST; + xadlock.index = 1; + } + + /* + * if the newsize is not an integral number of pages, + * the file between newsize and next page boundary will + * be cleared. + * if truncating into a file hole, it will cause + * a full block to be allocated for the logical block. + */ + + /* + * release page blocks of truncated region <teof, eof> + * + * free the data blocks from the leaf index blocks. + * delete the parent index entries corresponding to + * the freed child data/index blocks. + * free the index blocks themselves which aren't needed + * in new sized file. + * + * index blocks are updated only if the blocks are to be + * retained in the new sized file. + * if type is PMAP, the data and index pages are NOT + * freed, and the data and index blocks are NOT freed + * from working map. + * (this will allow continued access of data/index of + * temporary file (zerolink count file truncated to zero-length)). + */ + teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + + /* clear stack */ + BT_CLR(&btstack); + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + + /* Since this is the rightmost page at this level, and we may have + * already freed a page that was formerly to the right, let's make + * sure that the next pointer is zero. + */ + if (p->header.next) { + if (log) + /* + * Make sure this change to the header is logged. + * If we really truncate this leaf, the flag + * will be changed to tlckTRUNCATE + */ + tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); + BT_MARK_DIRTY(mp, ip); + p->header.next = 0; + } + + if (p->header.flag & BT_INTERNAL) + goto getChild; + + /* + * leaf page + */ + freed = 0; + + /* does region covered by leaf page precede Teof ? */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + if (teof >= xoff + xlen) { + XT_PUTPAGE(mp); + goto getParent; + } + + /* (re)acquire tlock of the leaf page */ + if (log) { + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + XT_PUTPAGE(mp); + newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + goto getParent; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckTRUNCATE; + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + } + BT_MARK_DIRTY(mp, ip); + + /* + * scan backward leaf page entries + */ + for (; index >= XTENTRYSTART; index--) { + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * The "data" for a directory is indexed by the block + * device's address space. This metadata must be invalidated + * here + */ + if (S_ISDIR(ip->i_mode) && (teof == 0)) + invalidate_xad_metapages(ip, *xad); + /* + * entry beyond eof: continue scan of current page + * xad + * ---|---=======-------> + * eof + */ + if (teof < xoff) { + nfreed += xlen; + continue; + } + + /* + * (xoff <= teof): last entry to be deleted from page; + * If other entries remain in page: keep and update the page. + */ + + /* + * eof == entry_start: delete the entry + * xad + * -------|=======-------> + * eof + * + */ + if (teof == xoff) { + nfreed += xlen; + + if (index == XTENTRYSTART) + break; + + nextindex = index; + } + /* + * eof within the entry: truncate the entry. + * xad + * -------===|===-------> + * eof + */ + else if (teof < xoff + xlen) { + /* update truncated entry */ + len = teof - xoff; + freexlen = xlen - len; + XADlength(xad, len); + + /* save pxd of truncated extent in tlck */ + xaddr += len; + if (log) { /* COMMIT_PWMAP */ + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = index + 1 - + xtlck->lwm.offset; + xtlck->twm.offset = index; + pxdlock = (struct pxd_lock *) & xtlck->pxdlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + } + /* free truncated extent */ + else { /* COMMIT_WMAP */ + + pxdlock = (struct pxd_lock *) & xadlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + + /* reset map lock */ + xadlock.flag = mlckFREEXADLIST; + } + + /* current entry is new last entry; */ + nextindex = index + 1; + + nfreed += freexlen; + } + /* + * eof beyond the entry: + * xad + * -------=======---|---> + * eof + */ + else { /* (xoff + xlen < teof) */ + + nextindex = index + 1; + } + + if (nextindex < le16_to_cpu(p->header.nextindex)) { + if (!log) { /* COMMIT_WAMP */ + xadlock.xdlist = &p->xad[nextindex]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + nextindex; + txFreeMap(ip, (struct maplock *) & xadlock, + NULL, COMMIT_WMAP); + } + p->header.nextindex = cpu_to_le16(nextindex); + } + + XT_PUTPAGE(mp); + + /* assert(freed == 0); */ + goto getParent; + } /* end scan of leaf page entries */ + + freed = 1; + + /* + * leaf page become empty: free the page if type != PMAP + */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free data extents covered by leaf [XTENTRYSTART:hwm); + * invalidate leaf if COMMIT_PWMAP; + * if (TRUNCATE), will write LOG_NOREDOPAGE; + */ + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WAMP */ + + /* free data extents covered by leaf */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); + } + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= tlckFREELOCK; + + /* invalidate empty leaf page */ + discard_metapage(mp); + } + } + + /* + * the leaf page become empty: delete the parent entry + * for the leaf page if the parent page is to be kept + * in the new sized file. + */ + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* + * child page was not empty: + */ + if (freed == 0) { + /* has any entry deleted from parent ? */ + if (index < le16_to_cpu(p->header.nextindex) - 1) { + /* (re)acquire tlock on the parent page */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckTRUNCATE: + * free child extents covered by parent [); + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + if (!(tlck->type & tlckTRUNCATE)) { + xtlck->hwm.offset = + le16_to_cpu(p->header. + nextindex) - 1; + tlck->type = + tlckXTREE | tlckTRUNCATE; + } + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[index + 1]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + index - 1; + txFreeMap(ip, (struct maplock *) & xadlock, + NULL, COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + p->header.nextindex = cpu_to_le16(index + 1); + } + XT_PUTPAGE(mp); + goto getParent; + } + + /* + * child page was empty: + */ + nfreed += lengthXAD(&p->xad[index]); + + /* + * During working map update, child page's tlock must be handled + * before parent's. This is because the parent's tlock will cause + * the child's disk space to be marked available in the wmap, so + * it's important that the child page be released by that time. + * + * ToDo: tlocks should be on doubly-linked list, so we can + * quickly remove it and add it to the end. + */ + + /* + * Move parent page's tlock to the end of the tid's tlock list + */ + if (log && mp->lid && (tblk->last != mp->lid) && + lid_to_tlock(mp->lid)->tid) { + lid_t lid = mp->lid; + struct tlock *prev; + + tlck = lid_to_tlock(lid); + + if (tblk->next == lid) + tblk->next = tlck->next; + else { + for (prev = lid_to_tlock(tblk->next); + prev->next != lid; + prev = lid_to_tlock(prev->next)) { + assert(prev->next); + } + prev->next = tlck->next; + } + lid_to_tlock(tblk->last)->next = lid; + tlck->next = 0; + tblk->last = lid; + } + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = + le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + XTENTRYSTART; + txFreeMap(ip, (struct maplock *) & xadlock, NULL, + COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) { + /* + * Shrink root down to allow inline + * EA (otherwise fsck complains) + */ + p->header.maxentry = + cpu_to_le16(XTROOTINITSLOT); + JFS_IP(ip)->mode2 |= INLINEEA; + } + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= + tlckFREELOCK; + + /* invalidate parent page */ + discard_metapage(mp); + } + + /* parent has become empty and freed: + * go back up to its parent page + */ + /* freed = 1; */ + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else { + /* try truncate region covered by preceding entry + * (process backward) + */ + index--; + + /* go back down to the child page corresponding + * to the entry + */ + goto getChild; + } + + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun!\n"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + /* + * update file resource stat + */ + /* set size + */ + if (S_ISDIR(ip->i_mode) && !newsize) + ip->i_size = 1; /* fsck hates zero-length directories */ + else + ip->i_size = newsize; + + /* update quota allocation to reflect freed blocks */ + dquot_free_block(ip, nfreed); + + /* + * free tlock of invalidated pages + */ + if (flag == COMMIT_WMAP) + txFreelock(ip); + + return newsize; +} + + +/* + * xtTruncate_pmap() + * + * function: + * Perform truncate to zero length for deleted file, leaving the + * the xtree and working map untouched. This allows the file to + * be accessed via open file handles, while the delete of the file + * is committed to disk. + * + * parameter: + * tid_t tid, + * struct inode *ip, + * s64 committed_size) + * + * return: new committed size + * + * note: + * + * To avoid deadlock by holding too many transaction locks, the + * truncation may be broken up into multiple transactions. + * The committed_size keeps track of part of the file has been + * freed from the pmaps. + */ +s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) +{ + s64 bn; + struct btstack btstack; + int cmp; + int index; + int locked_leaves = 0; + struct metapage *mp; + xtpage_t *p; + struct btframe *parent; + int rc; + struct tblock *tblk; + struct tlock *tlck = NULL; + xad_t *xad; + int xlen; + s64 xoff; + struct xtlock *xtlck = NULL; + + /* save object truncation type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* clear stack */ + BT_CLR(&btstack); + + if (committed_size) { + xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1; + rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); + if (rc) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp != 0) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "did not find extent\n"); + return -EIO; + } + } else { + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_INTERNAL) + goto getChild; + } + + /* + * leaf page + */ + + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + XT_PUTPAGE(mp); + return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckFREE; + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = index; + + + XT_PUTPAGE(mp); + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (struct xtlock *) & tlck->lock; + xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + + XT_PUTPAGE(mp); + + if (p->header.flag & BT_ROOT) { + + goto out; + } else { + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else + index--; + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + if (BT_STACK_FULL(&btstack)) { + jfs_error(ip->i_sb, "stack overrun!\n"); + XT_PUTPAGE(mp); + return -EIO; + } + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + + return 0; +} + +#ifdef CONFIG_JFS_STATISTICS +int jfs_xtstat_proc_show(struct seq_file *m, void *v) +{ + seq_printf(m, + "JFS Xtree statistics\n" + "====================\n" + "searches = %d\n" + "fast searches = %d\n" + "splits = %d\n", + xtStat.search, + xtStat.fastSearch, + xtStat.split); + return 0; +} +#endif diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h new file mode 100644 index 000000000..1e0987986 --- /dev/null +++ b/fs/jfs/jfs_xtree.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_XTREE +#define _H_JFS_XTREE + +/* + * jfs_xtree.h: extent allocation descriptor B+-tree manager + */ + +#include "jfs_btree.h" + + +/* + * extent allocation descriptor (xad) + */ +typedef struct xad { + __u8 flag; /* 1: flag */ + __u8 rsvrd[2]; /* 2: reserved */ + __u8 off1; /* 1: offset in unit of fsblksize */ + __le32 off2; /* 4: offset in unit of fsblksize */ + pxd_t loc; /* 8: length and address in unit of fsblksize */ +} xad_t; /* (16) */ + +#define MAXXLEN ((1 << 24) - 1) + +#define XTSLOTSIZE 16 +#define L2XTSLOTSIZE 4 + +/* xad_t field construction */ +#define XADoffset(xad, offset64)\ +{\ + (xad)->off1 = ((u64)offset64) >> 32;\ + (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ +} +#define XADaddress(xad, address64) PXDaddress(&(xad)->loc, address64) +#define XADlength(xad, length32) PXDlength(&(xad)->loc, length32) + +/* xad_t field extraction */ +#define offsetXAD(xad)\ + ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) +#define addressXAD(xad) addressPXD(&(xad)->loc) +#define lengthXAD(xad) lengthPXD(&(xad)->loc) + +/* xad list */ +struct xadlist { + s16 maxnxad; + s16 nxad; + xad_t *xad; +}; + +/* xad_t flags */ +#define XAD_NEW 0x01 /* new */ +#define XAD_EXTENDED 0x02 /* extended */ +#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ +#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ +#define XAD_COW 0x10 /* copy-on-write */ + + +/* possible values for maxentry */ +#define XTROOTINITSLOT_DIR 6 +#define XTROOTINITSLOT 10 +#define XTROOTMAXSLOT 18 +#define XTPAGEMAXSLOT 256 +#define XTENTRYSTART 2 + +/* + * xtree page: + */ +typedef union { + struct xtheader { + __le64 next; /* 8: */ + __le64 prev; /* 8: */ + + u8 flag; /* 1: */ + u8 rsrvd1; /* 1: */ + __le16 nextindex; /* 2: next index = number of entries */ + __le16 maxentry; /* 2: max number of entries */ + __le16 rsrvd2; /* 2: */ + + pxd_t self; /* 8: self */ + } header; /* (32) */ + + xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */ +} xtpage_t; + +/* + * external declaration + */ +extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, + int *pflag, s64 * paddr, int *plen, int flag); +extern void xtInitRoot(tid_t tid, struct inode *ip); +extern int xtInsert(tid_t tid, struct inode *ip, + int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag); +extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, + int flag); +#ifdef _NOTYET +extern int xtTailgate(tid_t tid, struct inode *ip, + s64 xoff, int xlen, s64 xaddr, int flag); +#endif +extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); +extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, + int flag); +extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type); +extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size); +extern int xtRelocate(tid_t tid, struct inode *ip, + xad_t * oxad, s64 nxaddr, int xtype); +extern int xtAppend(tid_t tid, + struct inode *ip, int xflag, s64 xoff, int maxblocks, + int *xlenp, s64 * xaddrp, int flag); +#endif /* !_H_JFS_XTREE */ diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c new file mode 100644 index 000000000..14528c0ff --- /dev/null +++ b/fs/jfs/namei.c @@ -0,0 +1,1620 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/ctype.h> +#include <linux/quotaops.h> +#include <linux/exportfs.h> +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_inode.h" +#include "jfs_dinode.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_metapage.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" +#include "jfs_debug.h" + +/* + * forward references + */ +const struct dentry_operations jfs_ci_dentry_operations; + +static s64 commitZeroLink(tid_t, struct inode *); + +/* + * NAME: free_ea_wmap(inode) + * + * FUNCTION: free uncommitted extended attributes from working map + * + */ +static inline void free_ea_wmap(struct inode *inode) +{ + dxd_t *ea = &JFS_IP(inode)->ea; + + if (ea->flag & DXD_EXTENT) { + /* free EA pages from cache */ + invalidate_dxd_metapages(inode, *ea); + dbFree(inode, addressDXD(ea), lengthDXD(ea)); + } + ea->flag = 0; +} + +/* + * NAME: jfs_create(dip, dentry, mode) + * + * FUNCTION: create a regular file in the parent directory <dip> + * with name = <from dentry> and mode = <mode> + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of new file + * mode - create mode (rwxrwxrwx). + * nd- nd struct + * + * RETURN: Errors from subroutines + * + */ +static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, + bool excl) +{ + int rc = 0; + tid_t tid; /* transaction id */ + struct inode *ip = NULL; /* child directory inode */ + ino_t ino; + struct component_name dname; /* child directory name */ + struct btstack btstack; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_create: dip:0x%p name:%pd", dip, dentry); + + rc = dquot_initialize(dip); + if (rc) + goto out1; + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * Either iAlloc() or txBegin() may block. Deadlock can occur if we + * block there while holding dtree page, so we allocate the inode & + * begin the transaction before we search the directory. + */ + ip = ialloc(dip, mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dip); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { + jfs_err("jfs_create: dtSearch returned %d", rc); + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + iplist[0] = dip; + iplist[1] = ip; + + /* + * initialize the child XAD tree root in-line in inode + */ + xtInitRoot(tid, ip); + + /* + * create entry in parent directory for child directory + * (dtInsert() releases parent directory page) + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + if (rc == -EIO) { + jfs_err("jfs_create: dtInsert returned -EIO"); + txAbort(tid, 1); /* Marks Filesystem dirty */ + } else + txAbort(tid, 0); /* Filesystem full */ + goto out3; + } + + ip->i_op = &jfs_file_inode_operations; + ip->i_fop = &jfs_file_operations; + ip->i_mapping->a_ops = &jfs_aops; + + mark_inode_dirty(ip); + + dip->i_ctime = dip->i_mtime = current_time(dip); + + mark_inode_dirty(dip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + clear_nlink(ip); + discard_new_inode(ip); + } else { + d_instantiate_new(dentry, ip); + } + + out2: + free_UCSname(&dname); + + out1: + + jfs_info("jfs_create: rc:%d", rc); + return rc; +} + + +/* + * NAME: jfs_mkdir(dip, dentry, mode) + * + * FUNCTION: create a child directory in the parent directory <dip> + * with name = <from dentry> and mode = <mode> + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of child directory + * mode - create mode (rwxrwxrwx). + * + * RETURN: Errors from subroutines + * + * note: + * EACCESS: user needs search+write permission on the parent directory + */ +static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) +{ + int rc = 0; + tid_t tid; /* transaction id */ + struct inode *ip = NULL; /* child directory inode */ + ino_t ino; + struct component_name dname; /* child directory name */ + struct btstack btstack; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_mkdir: dip:0x%p name:%pd", dip, dentry); + + rc = dquot_initialize(dip); + if (rc) + goto out1; + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * Either iAlloc() or txBegin() may block. Deadlock can occur if we + * block there while holding dtree page, so we allocate the inode & + * begin the transaction before we search the directory. + */ + ip = ialloc(dip, S_IFDIR | mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dip); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { + jfs_err("jfs_mkdir: dtSearch returned %d", rc); + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + iplist[0] = dip; + iplist[1] = ip; + + /* + * initialize the child directory in-line in inode + */ + dtInitRoot(tid, ip, dip->i_ino); + + /* + * create entry in parent directory for child directory + * (dtInsert() releases parent directory page) + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + if (rc == -EIO) { + jfs_err("jfs_mkdir: dtInsert returned -EIO"); + txAbort(tid, 1); /* Marks Filesystem dirty */ + } else + txAbort(tid, 0); /* Filesystem full */ + goto out3; + } + + set_nlink(ip, 2); /* for '.' */ + ip->i_op = &jfs_dir_inode_operations; + ip->i_fop = &jfs_dir_operations; + + mark_inode_dirty(ip); + + /* update parent directory inode */ + inc_nlink(dip); /* for '..' from child directory */ + dip->i_ctime = dip->i_mtime = current_time(dip); + mark_inode_dirty(dip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + clear_nlink(ip); + discard_new_inode(ip); + } else { + d_instantiate_new(dentry, ip); + } + + out2: + free_UCSname(&dname); + + + out1: + + jfs_info("jfs_mkdir: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_rmdir(dip, dentry) + * + * FUNCTION: remove a link to child directory + * + * PARAMETER: dip - parent inode + * dentry - child directory dentry + * + * RETURN: -EINVAL - if name is . or .. + * -EINVAL - if . or .. exist but are invalid. + * errors from subroutines + * + * note: + * if other threads have the directory open when the last link + * is removed, the "." and ".." entries, if present, are removed before + * rmdir() returns and no new entries may be created in the directory, + * but the directory is not removed until the last reference to + * the directory is released (cf.unlink() of regular file). + */ +static int jfs_rmdir(struct inode *dip, struct dentry *dentry) +{ + int rc; + tid_t tid; /* transaction id */ + struct inode *ip = d_inode(dentry); + ino_t ino; + struct component_name dname; + struct inode *iplist[2]; + struct tblock *tblk; + + jfs_info("jfs_rmdir: dip:0x%p name:%pd", dip, dentry); + + /* Init inode for quota operations. */ + rc = dquot_initialize(dip); + if (rc) + goto out; + rc = dquot_initialize(ip); + if (rc) + goto out; + + /* directory must be empty to be removed */ + if (!dtEmpty(ip)) { + rc = -ENOTEMPTY; + goto out; + } + + if ((rc = get_UCSname(&dname, dentry))) { + goto out; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + iplist[0] = dip; + iplist[1] = ip; + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = ip; + + /* + * delete the entry of target directory from parent directory + */ + ino = ip->i_ino; + if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { + jfs_err("jfs_rmdir: dtDelete returned %d", rc); + if (rc == -EIO) + txAbort(tid, 1); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + goto out2; + } + + /* update parent directory's link count corresponding + * to ".." entry of the target directory deleted + */ + dip->i_ctime = dip->i_mtime = current_time(dip); + inode_dec_link_count(dip); + + /* + * OS/2 could have created EA and/or ACL + */ + /* free EA from both persistent and working map */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { + /* free EA pages */ + txEA(tid, ip, &JFS_IP(ip)->ea, NULL); + } + JFS_IP(ip)->ea.flag = 0; + + /* free ACL from both persistent and working map */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { + /* free ACL pages */ + txEA(tid, ip, &JFS_IP(ip)->acl, NULL); + } + JFS_IP(ip)->acl.flag = 0; + + /* mark the target directory as deleted */ + clear_nlink(ip); + mark_inode_dirty(ip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, dip)) { + if (dip->i_size > 1) + jfs_truncate_nolock(dip, 0); + + clear_cflag(COMMIT_Stale, dip); + } + + out2: + free_UCSname(&dname); + + out: + jfs_info("jfs_rmdir: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_unlink(dip, dentry) + * + * FUNCTION: remove a link to object <vp> named by <name> + * from parent directory <dvp> + * + * PARAMETER: dip - inode of parent directory + * dentry - dentry of object to be removed + * + * RETURN: errors from subroutines + * + * note: + * temporary file: if one or more processes have the file open + * when the last link is removed, the link will be removed before + * unlink() returns, but the removal of the file contents will be + * postponed until all references to the files are closed. + * + * JFS does NOT support unlink() on directories. + * + */ +static int jfs_unlink(struct inode *dip, struct dentry *dentry) +{ + int rc; + tid_t tid; /* transaction id */ + struct inode *ip = d_inode(dentry); + ino_t ino; + struct component_name dname; /* object name */ + struct inode *iplist[2]; + struct tblock *tblk; + s64 new_size = 0; + int commit_flag; + + jfs_info("jfs_unlink: dip:0x%p name:%pd", dip, dentry); + + /* Init inode for quota operations. */ + rc = dquot_initialize(dip); + if (rc) + goto out; + rc = dquot_initialize(ip); + if (rc) + goto out; + + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + iplist[0] = dip; + iplist[1] = ip; + + /* + * delete the entry of target file from parent directory + */ + ino = ip->i_ino; + if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { + jfs_err("jfs_unlink: dtDelete returned %d", rc); + if (rc == -EIO) + txAbort(tid, 1); /* Marks FS Dirty */ + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + IWRITE_UNLOCK(ip); + goto out1; + } + + ASSERT(ip->i_nlink); + + ip->i_ctime = dip->i_ctime = dip->i_mtime = current_time(ip); + mark_inode_dirty(dip); + + /* update target's inode */ + inode_dec_link_count(ip); + + /* + * commit zero link count object + */ + if (ip->i_nlink == 0) { + assert(!test_cflag(COMMIT_Nolink, ip)); + /* free block resources */ + if ((new_size = commitZeroLink(tid, ip)) < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + IWRITE_UNLOCK(ip); + rc = new_size; + goto out1; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = ip; + } + + /* + * Incomplete truncate of file data can + * result in timing problems unless we synchronously commit the + * transaction. + */ + if (new_size) + commit_flag = COMMIT_SYNC; + else + commit_flag = 0; + + /* + * If xtTruncate was incomplete, commit synchronously to avoid + * timing complications + */ + rc = txCommit(tid, 2, &iplist[0], commit_flag); + + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + + while (new_size && (rc == 0)) { + tid = txBegin(dip->i_sb, 0); + mutex_lock(&JFS_IP(ip)->commit_mutex); + new_size = xtTruncate_pmap(tid, ip, new_size); + if (new_size < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + rc = new_size; + } else + rc = txCommit(tid, 2, &iplist[0], COMMIT_SYNC); + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + } + + if (ip->i_nlink == 0) + set_cflag(COMMIT_Nolink, ip); + + IWRITE_UNLOCK(ip); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, dip)) { + if (dip->i_size > 1) + jfs_truncate_nolock(dip, 0); + + clear_cflag(COMMIT_Stale, dip); + } + + out1: + free_UCSname(&dname); + out: + jfs_info("jfs_unlink: rc:%d", rc); + return rc; +} + +/* + * NAME: commitZeroLink() + * + * FUNCTION: for non-directory, called by jfs_remove(), + * truncate a regular file, directory or symbolic + * link to zero length. return 0 if type is not + * one of these. + * + * if the file is currently associated with a VM segment + * only permanent disk and inode map resources are freed, + * and neither the inode nor indirect blocks are modified + * so that the resources can be later freed in the work + * map by ctrunc1. + * if there is no VM segment on entry, the resources are + * freed in both work and permanent map. + * (? for temporary file - memory object is cached even + * after no reference: + * reference count > 0 - ) + * + * PARAMETERS: cd - pointer to commit data structure. + * current inode is the one to truncate. + * + * RETURN: Errors from subroutines + */ +static s64 commitZeroLink(tid_t tid, struct inode *ip) +{ + int filetype; + struct tblock *tblk; + + jfs_info("commitZeroLink: tid = %d, ip = 0x%p", tid, ip); + + filetype = ip->i_mode & S_IFMT; + switch (filetype) { + case S_IFREG: + break; + case S_IFLNK: + /* fast symbolic link */ + if (ip->i_size < IDATASIZE) { + ip->i_size = 0; + return 0; + } + break; + default: + assert(filetype != S_IFDIR); + return 0; + } + + set_cflag(COMMIT_Freewmap, ip); + + /* mark transaction of block map update type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* + * free EA + */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) + /* acquire maplock on EA to be freed from block map */ + txEA(tid, ip, &JFS_IP(ip)->ea, NULL); + + /* + * free ACL + */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) + /* acquire maplock on EA to be freed from block map */ + txEA(tid, ip, &JFS_IP(ip)->acl, NULL); + + /* + * free xtree/data (truncate to zero length): + * free xtree/data pages from cache if COMMIT_PWMAP, + * free xtree/data blocks from persistent block map, and + * free xtree/data blocks from working block map if COMMIT_PWMAP; + */ + if (ip->i_size) + return xtTruncate_pmap(tid, ip, 0); + + return 0; +} + + +/* + * NAME: jfs_free_zero_link() + * + * FUNCTION: for non-directory, called by iClose(), + * free resources of a file from cache and WORKING map + * for a file previously committed with zero link count + * while associated with a pager object, + * + * PARAMETER: ip - pointer to inode of file. + */ +void jfs_free_zero_link(struct inode *ip) +{ + int type; + + jfs_info("jfs_free_zero_link: ip = 0x%p", ip); + + /* return if not reg or symbolic link or if size is + * already ok. + */ + type = ip->i_mode & S_IFMT; + + switch (type) { + case S_IFREG: + break; + case S_IFLNK: + /* if its contained in inode nothing to do */ + if (ip->i_size < IDATASIZE) + return; + break; + default: + return; + } + + /* + * free EA + */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { + s64 xaddr = addressDXD(&JFS_IP(ip)->ea); + int xlen = lengthDXD(&JFS_IP(ip)->ea); + struct maplock maplock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + + /* free EA pages from cache */ + invalidate_dxd_metapages(ip, JFS_IP(ip)->ea); + + /* free EA extent from working block map */ + maplock.index = 1; + pxdlock = (struct pxd_lock *) & maplock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, xlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + } + + /* + * free ACL + */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { + s64 xaddr = addressDXD(&JFS_IP(ip)->acl); + int xlen = lengthDXD(&JFS_IP(ip)->acl); + struct maplock maplock; /* maplock for COMMIT_WMAP */ + struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ + + invalidate_dxd_metapages(ip, JFS_IP(ip)->acl); + + /* free ACL extent from working block map */ + maplock.index = 1; + pxdlock = (struct pxd_lock *) & maplock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, xlen); + txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); + } + + /* + * free xtree/data (truncate to zero length): + * free xtree/data pages from cache, and + * free xtree/data blocks from working block map; + */ + if (ip->i_size) + xtTruncate(0, ip, 0, COMMIT_WMAP); +} + +/* + * NAME: jfs_link(vp, dvp, name, crp) + * + * FUNCTION: create a link to <vp> by the name = <name> + * in the parent directory <dvp> + * + * PARAMETER: vp - target object + * dvp - parent directory of new link + * name - name of new link to target object + * crp - credential + * + * RETURN: Errors from subroutines + * + * note: + * JFS does NOT support link() on directories (to prevent circular + * path in the directory hierarchy); + * EPERM: the target object is a directory, and either the caller + * does not have appropriate privileges or the implementation prohibits + * using link() on directories [XPG4.2]. + * + * JFS does NOT support links between file systems: + * EXDEV: target object and new link are on different file systems and + * implementation does not support links between file systems [XPG4.2]. + */ +static int jfs_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + int rc; + tid_t tid; + struct inode *ip = d_inode(old_dentry); + ino_t ino; + struct component_name dname; + struct btstack btstack; + struct inode *iplist[2]; + + jfs_info("jfs_link: %pd %pd", old_dentry, dentry); + + rc = dquot_initialize(dir); + if (rc) + goto out; + + tid = txBegin(ip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + /* + * scan parent directory for entry/freespace + */ + if ((rc = get_UCSname(&dname, dentry))) + goto out_tx; + + if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) + goto free_dname; + + /* + * create entry for new link in parent directory + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) + goto free_dname; + + /* update object inode */ + inc_nlink(ip); /* for new link */ + ip->i_ctime = current_time(ip); + dir->i_ctime = dir->i_mtime = current_time(dir); + mark_inode_dirty(dir); + ihold(ip); + + iplist[0] = ip; + iplist[1] = dir; + rc = txCommit(tid, 2, &iplist[0], 0); + + if (rc) { + drop_nlink(ip); /* never instantiated */ + iput(ip); + } else + d_instantiate(dentry, ip); + + free_dname: + free_UCSname(&dname); + + out_tx: + txEnd(tid); + + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); + + out: + jfs_info("jfs_link: rc:%d", rc); + return rc; +} + +/* + * NAME: jfs_symlink(dip, dentry, name) + * + * FUNCTION: creates a symbolic link to <symlink> by name <name> + * in directory <dip> + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of symbolic link + * name - the path name of the existing object + * that will be the source of the link + * + * RETURN: errors from subroutines + * + * note: + * ENAMETOOLONG: pathname resolution of a symbolic link produced + * an intermediate result whose length exceeds PATH_MAX [XPG4.2] +*/ + +static int jfs_symlink(struct inode *dip, struct dentry *dentry, + const char *name) +{ + int rc; + tid_t tid; + ino_t ino = 0; + struct component_name dname; + int ssize; /* source pathname size */ + struct btstack btstack; + struct inode *ip = d_inode(dentry); + s64 xlen = 0; + int bmask = 0, xsize; + s64 xaddr; + struct metapage *mp; + struct super_block *sb; + struct tblock *tblk; + + struct inode *iplist[2]; + + jfs_info("jfs_symlink: dip:0x%p name:%s", dip, name); + + rc = dquot_initialize(dip); + if (rc) + goto out1; + + ssize = strlen(name) + 1; + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + + if ((rc = get_UCSname(&dname, dentry))) + goto out1; + + /* + * allocate on-disk/in-memory inode for symbolic link: + * (iAlloc() returns new, locked inode) + */ + ip = ialloc(dip, S_IFLNK | 0777); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_security(tid, ip, dip, &dentry->d_name); + if (rc) + goto out3; + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + /* fix symlink access permission + * (dir_create() ANDs in the u.u_cmask, + * but symlinks really need to be 777 access) + */ + ip->i_mode |= 0777; + + /* + * write symbolic link target path name + */ + xtInitRoot(tid, ip); + + /* + * write source path name inline in on-disk inode (fast symbolic link) + */ + + if (ssize <= IDATASIZE) { + ip->i_op = &jfs_fast_symlink_inode_operations; + + ip->i_link = JFS_IP(ip)->i_inline; + memcpy(ip->i_link, name, ssize); + ip->i_size = ssize - 1; + + /* + * if symlink is > 128 bytes, we don't have the space to + * store inline extended attributes + */ + if (ssize > sizeof (JFS_IP(ip)->i_inline)) + JFS_IP(ip)->mode2 &= ~INLINEEA; + + jfs_info("jfs_symlink: fast symlink added ssize:%d name:%s ", + ssize, name); + } + /* + * write source path name in a single extent + */ + else { + jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); + + ip->i_op = &jfs_symlink_inode_operations; + inode_nohighmem(ip); + ip->i_mapping->a_ops = &jfs_aops; + + /* + * even though the data of symlink object (source + * path name) is treated as non-journaled user data, + * it is read/written thru buffer cache for performance. + */ + sb = ip->i_sb; + bmask = JFS_SBI(sb)->bsize - 1; + xsize = (ssize + bmask) & ~bmask; + xaddr = 0; + xlen = xsize >> JFS_SBI(sb)->l2bsize; + if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0))) { + txAbort(tid, 0); + goto out3; + } + ip->i_size = ssize - 1; + while (ssize) { + /* This is kind of silly since PATH_MAX == 4K */ + int copy_size = min(ssize, PSIZE); + + mp = get_metapage(ip, xaddr, PSIZE, 1); + + if (mp == NULL) { + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + rc = -EIO; + txAbort(tid, 0); + goto out3; + } + memcpy(mp->data, name, copy_size); + flush_metapage(mp); + ssize -= copy_size; + name += copy_size; + xaddr += JFS_SBI(sb)->nbperpage; + } + } + + /* + * create entry for symbolic link in parent directory + */ + rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE); + if (rc == 0) { + ino = ip->i_ino; + rc = dtInsert(tid, dip, &dname, &ino, &btstack); + } + if (rc) { + if (xlen) + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + txAbort(tid, 0); + /* discard new inode */ + goto out3; + } + + mark_inode_dirty(ip); + + dip->i_ctime = dip->i_mtime = current_time(dip); + mark_inode_dirty(dip); + /* + * commit update of parent directory and link object + */ + + iplist[0] = dip; + iplist[1] = ip; + rc = txCommit(tid, 2, &iplist[0], 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dip)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + clear_nlink(ip); + discard_new_inode(ip); + } else { + d_instantiate_new(dentry, ip); + } + + out2: + free_UCSname(&dname); + + out1: + jfs_info("jfs_symlink: rc:%d", rc); + return rc; +} + + +/* + * NAME: jfs_rename + * + * FUNCTION: rename a file or directory + */ +static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + struct btstack btstack; + ino_t ino; + struct component_name new_dname; + struct inode *new_ip; + struct component_name old_dname; + struct inode *old_ip; + int rc; + tid_t tid; + struct tlock *tlck; + struct dt_lock *dtlck; + struct lv *lv; + int ipcount; + struct inode *iplist[4]; + struct tblock *tblk; + s64 new_size = 0; + int commit_flag; + + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + jfs_info("jfs_rename: %pd %pd", old_dentry, new_dentry); + + rc = dquot_initialize(old_dir); + if (rc) + goto out1; + rc = dquot_initialize(new_dir); + if (rc) + goto out1; + + old_ip = d_inode(old_dentry); + new_ip = d_inode(new_dentry); + + if ((rc = get_UCSname(&old_dname, old_dentry))) + goto out1; + + if ((rc = get_UCSname(&new_dname, new_dentry))) + goto out2; + + /* + * Make sure source inode number is what we think it is + */ + rc = dtSearch(old_dir, &old_dname, &ino, &btstack, JFS_LOOKUP); + if (rc || (ino != old_ip->i_ino)) { + rc = -ENOENT; + goto out3; + } + + /* + * Make sure dest inode number (if any) is what we think it is + */ + rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP); + if (!rc) { + if ((!new_ip) || (ino != new_ip->i_ino)) { + rc = -ESTALE; + goto out3; + } + } else if (rc != -ENOENT) + goto out3; + else if (new_ip) { + /* no entry exists, but one was expected */ + rc = -ESTALE; + goto out3; + } + + if (S_ISDIR(old_ip->i_mode)) { + if (new_ip) { + if (!dtEmpty(new_ip)) { + rc = -ENOTEMPTY; + goto out3; + } + } + } else if (new_ip) { + IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); + /* Init inode for quota operations. */ + rc = dquot_initialize(new_ip); + if (rc) + goto out_unlock; + } + + /* + * The real work starts here + */ + tid = txBegin(new_dir->i_sb, 0); + + /* + * How do we know the locking is safe from deadlocks? + * The vfs does the hard part for us. Any time we are taking nested + * commit_mutexes, the vfs already has i_mutex held on the parent. + * Here, the vfs has already taken i_mutex on both old_dir and new_dir. + */ + mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD); + if (old_dir != new_dir) + mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex, + COMMIT_MUTEX_SECOND_PARENT); + + if (new_ip) { + mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex, + COMMIT_MUTEX_VICTIM); + /* + * Change existing directory entry to new inode number + */ + ino = new_ip->i_ino; + rc = dtModify(tid, new_dir, &new_dname, &ino, + old_ip->i_ino, JFS_RENAME); + if (rc) + goto out_tx; + drop_nlink(new_ip); + if (S_ISDIR(new_ip->i_mode)) { + drop_nlink(new_ip); + if (new_ip->i_nlink) { + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); + if (!S_ISDIR(old_ip->i_mode) && new_ip) + IWRITE_UNLOCK(new_ip); + jfs_error(new_ip->i_sb, + "new_ip->i_nlink != 0\n"); + return -EIO; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = new_ip; + } else if (new_ip->i_nlink == 0) { + assert(!test_cflag(COMMIT_Nolink, new_ip)); + /* free block resources */ + if ((new_size = commitZeroLink(tid, new_ip)) < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + rc = new_size; + goto out_tx; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->u.ip = new_ip; + } else { + new_ip->i_ctime = current_time(new_ip); + mark_inode_dirty(new_ip); + } + } else { + /* + * Add new directory entry + */ + rc = dtSearch(new_dir, &new_dname, &ino, &btstack, + JFS_CREATE); + if (rc) { + jfs_err("jfs_rename didn't expect dtSearch to fail w/rc = %d", + rc); + goto out_tx; + } + + ino = old_ip->i_ino; + rc = dtInsert(tid, new_dir, &new_dname, &ino, &btstack); + if (rc) { + if (rc == -EIO) + jfs_err("jfs_rename: dtInsert returned -EIO"); + goto out_tx; + } + if (S_ISDIR(old_ip->i_mode)) + inc_nlink(new_dir); + } + /* + * Remove old directory entry + */ + + ino = old_ip->i_ino; + rc = dtDelete(tid, old_dir, &old_dname, &ino, JFS_REMOVE); + if (rc) { + jfs_err("jfs_rename did not expect dtDelete to return rc = %d", + rc); + txAbort(tid, 1); /* Marks Filesystem dirty */ + goto out_tx; + } + if (S_ISDIR(old_ip->i_mode)) { + drop_nlink(old_dir); + if (old_dir != new_dir) { + /* + * Change inode number of parent for moved directory + */ + + JFS_IP(old_ip)->i_dtroot.header.idotdot = + cpu_to_le32(new_dir->i_ino); + + /* Linelock header of dtree */ + tlck = txLock(tid, old_ip, + (struct metapage *) &JFS_IP(old_ip)->bxflag, + tlckDTREE | tlckBTROOT | tlckRELINK); + dtlck = (struct dt_lock *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + } + } + + /* + * Update ctime on changed/moved inodes & mark dirty + */ + old_ip->i_ctime = current_time(old_ip); + mark_inode_dirty(old_ip); + + new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); + mark_inode_dirty(new_dir); + + /* Build list of inodes modified by this transaction */ + ipcount = 0; + iplist[ipcount++] = old_ip; + if (new_ip) + iplist[ipcount++] = new_ip; + iplist[ipcount++] = old_dir; + + if (old_dir != new_dir) { + iplist[ipcount++] = new_dir; + old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + mark_inode_dirty(old_dir); + } + + /* + * Incomplete truncate of file data can + * result in timing problems unless we synchronously commit the + * transaction. + */ + if (new_size) + commit_flag = COMMIT_SYNC; + else + commit_flag = 0; + + rc = txCommit(tid, ipcount, iplist, commit_flag); + + out_tx: + txEnd(tid); + if (new_ip) + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + if (old_dir != new_dir) + mutex_unlock(&JFS_IP(old_dir)->commit_mutex); + mutex_unlock(&JFS_IP(old_ip)->commit_mutex); + mutex_unlock(&JFS_IP(new_dir)->commit_mutex); + + while (new_size && (rc == 0)) { + tid = txBegin(new_ip->i_sb, 0); + mutex_lock(&JFS_IP(new_ip)->commit_mutex); + new_size = xtTruncate_pmap(tid, new_ip, new_size); + if (new_size < 0) { + txAbort(tid, 1); + rc = new_size; + } else + rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC); + txEnd(tid); + mutex_unlock(&JFS_IP(new_ip)->commit_mutex); + } + if (new_ip && (new_ip->i_nlink == 0)) + set_cflag(COMMIT_Nolink, new_ip); + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, old_dir)) { + if (old_dir->i_size > 1) + jfs_truncate_nolock(old_dir, 0); + + clear_cflag(COMMIT_Stale, old_dir); + } + out_unlock: + if (new_ip && !S_ISDIR(new_ip->i_mode)) + IWRITE_UNLOCK(new_ip); + out3: + free_UCSname(&new_dname); + out2: + free_UCSname(&old_dname); + out1: + jfs_info("jfs_rename: returning %d", rc); + return rc; +} + + +/* + * NAME: jfs_mknod + * + * FUNCTION: Create a special file (device) + */ +static int jfs_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct jfs_inode_info *jfs_ip; + struct btstack btstack; + struct component_name dname; + ino_t ino; + struct inode *ip; + struct inode *iplist[2]; + int rc; + tid_t tid; + struct tblock *tblk; + + jfs_info("jfs_mknod: %pd", dentry); + + rc = dquot_initialize(dir); + if (rc) + goto out; + + if ((rc = get_UCSname(&dname, dentry))) + goto out; + + ip = ialloc(dir, mode); + if (IS_ERR(ip)) { + rc = PTR_ERR(ip); + goto out1; + } + jfs_ip = JFS_IP(ip); + + tid = txBegin(dir->i_sb, 0); + + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); + + rc = jfs_init_acl(tid, ip, dir); + if (rc) + goto out3; + + rc = jfs_init_security(tid, ip, dir, &dentry->d_name); + if (rc) { + txAbort(tid, 0); + goto out3; + } + + if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) { + txAbort(tid, 0); + goto out3; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ino = ip->i_ino; + tblk->u.ixpxd = JFS_IP(ip)->ixpxd; + + ino = ip->i_ino; + if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) { + txAbort(tid, 0); + goto out3; + } + + ip->i_op = &jfs_file_inode_operations; + jfs_ip->dev = new_encode_dev(rdev); + init_special_inode(ip, ip->i_mode, rdev); + + mark_inode_dirty(ip); + + dir->i_ctime = dir->i_mtime = current_time(dir); + + mark_inode_dirty(dir); + + iplist[0] = dir; + iplist[1] = ip; + rc = txCommit(tid, 2, iplist, 0); + + out3: + txEnd(tid); + mutex_unlock(&JFS_IP(ip)->commit_mutex); + mutex_unlock(&JFS_IP(dir)->commit_mutex); + if (rc) { + free_ea_wmap(ip); + clear_nlink(ip); + discard_new_inode(ip); + } else { + d_instantiate_new(dentry, ip); + } + + out1: + free_UCSname(&dname); + + out: + jfs_info("jfs_mknod: returning %d", rc); + return rc; +} + +static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) +{ + struct btstack btstack; + ino_t inum; + struct inode *ip; + struct component_name key; + int rc; + + jfs_info("jfs_lookup: name = %pd", dentry); + + if ((rc = get_UCSname(&key, dentry))) + return ERR_PTR(rc); + rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP); + free_UCSname(&key); + if (rc == -ENOENT) { + ip = NULL; + } else if (rc) { + jfs_err("jfs_lookup: dtSearch returned %d", rc); + ip = ERR_PTR(rc); + } else { + ip = jfs_iget(dip->i_sb, inum); + if (IS_ERR(ip)) + jfs_err("jfs_lookup: iget failed on inum %d", (uint)inum); + } + + return d_splice_alias(ip, dentry); +} + +static struct inode *jfs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + if (ino == 0) + return ERR_PTR(-ESTALE); + inode = jfs_iget(sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (generation && inode->i_generation != generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; +} + +struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + jfs_nfs_get_inode); +} + +struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + jfs_nfs_get_inode); +} + +struct dentry *jfs_get_parent(struct dentry *dentry) +{ + unsigned long parent_ino; + + parent_ino = + le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot); + + return d_obtain_alias(jfs_iget(dentry->d_sb, parent_ino)); +} + +const struct inode_operations jfs_dir_inode_operations = { + .create = jfs_create, + .lookup = jfs_lookup, + .link = jfs_link, + .unlink = jfs_unlink, + .symlink = jfs_symlink, + .mkdir = jfs_mkdir, + .rmdir = jfs_rmdir, + .mknod = jfs_mknod, + .rename = jfs_rename, + .listxattr = jfs_listxattr, + .setattr = jfs_setattr, +#ifdef CONFIG_JFS_POSIX_ACL + .get_acl = jfs_get_acl, + .set_acl = jfs_set_acl, +#endif +}; + +const struct file_operations jfs_dir_operations = { + .read = generic_read_dir, + .iterate = jfs_readdir, + .fsync = jfs_fsync, + .unlocked_ioctl = jfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = jfs_compat_ioctl, +#endif + .llseek = generic_file_llseek, +}; + +static int jfs_ci_hash(const struct dentry *dir, struct qstr *this) +{ + unsigned long hash; + int i; + + hash = init_name_hash(dir); + for (i=0; i < this->len; i++) + hash = partial_name_hash(tolower(this->name[i]), hash); + this->hash = end_name_hash(hash); + + return 0; +} + +static int jfs_ci_compare(const struct dentry *dentry, + unsigned int len, const char *str, const struct qstr *name) +{ + int i, result = 1; + + if (len != name->len) + goto out; + for (i=0; i < len; i++) { + if (tolower(str[i]) != tolower(name->name[i])) + goto out; + } + result = 0; +out: + return result; +} + +static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags) +{ + /* + * This is not negative dentry. Always valid. + * + * Note, rename() to existing directory entry will have ->d_inode, + * and will use existing name which isn't specified name by user. + * + * We may be able to drop this positive dentry here. But dropping + * positive dentry isn't good idea. So it's unsupported like + * rename("filename", "FILENAME") for now. + */ + if (d_really_is_positive(dentry)) + return 1; + + /* + * This may be nfsd (or something), anyway, we can't see the + * intent of this. So, since this can be for creation, drop it. + */ + if (!flags) + return 0; + + /* + * Drop the negative dentry, in order to make sure to use the + * case sensitive name which is specified by user if this is + * for creation. + */ + if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET)) + return 0; + return 1; +} + +const struct dentry_operations jfs_ci_dentry_operations = +{ + .d_hash = jfs_ci_hash, + .d_compare = jfs_ci_compare, + .d_revalidate = jfs_ci_revalidate, +}; diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c new file mode 100644 index 000000000..7ddcb445a --- /dev/null +++ b/fs/jfs/resize.c @@ -0,0 +1,549 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/quotaops.h> +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dinode.h" +#include "jfs_imap.h" +#include "jfs_dmap.h" +#include "jfs_superblock.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + +#define BITSPERPAGE (PSIZE << 3) +#define L2MEGABYTE 20 +#define MEGABYTE (1 << L2MEGABYTE) +#define MEGABYTE32 (MEGABYTE << 5) + +/* convert block number to bmap file page number */ +#define BLKTODMAPN(b)\ + (((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) + +/* + * jfs_extendfs() + * + * function: extend file system; + * + * |-------------------------------|----------|----------| + * file system space fsck inline log + * workspace space + * + * input: + * new LVSize: in LV blocks (required) + * new LogSize: in LV blocks (optional) + * new FSSize: in LV blocks (optional) + * + * new configuration: + * 1. set new LogSize as specified or default from new LVSize; + * 2. compute new FSCKSize from new LVSize; + * 3. set new FSSize as MIN(FSSize, LVSize-(LogSize+FSCKSize)) where + * assert(new FSSize >= old FSSize), + * i.e., file system must not be shrunk; + */ +int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) +{ + int rc = 0; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipbmap = sbi->ipbmap; + struct inode *ipbmap2; + struct inode *ipimap = sbi->ipimap; + struct jfs_log *log = sbi->log; + struct bmap *bmp = sbi->bmap; + s64 newLogAddress, newFSCKAddress; + int newFSCKSize; + s64 newMapSize = 0, mapSize; + s64 XAddress, XSize, nblocks, xoff, xaddr, t64; + s64 oldLVSize; + s64 newFSSize; + s64 VolumeSize; + int newNpages = 0, nPages, newPage, xlen, t32; + int tid; + int log_formatted = 0; + struct inode *iplist[1]; + struct jfs_superblock *j_sb, *j_sb2; + s64 old_agsize; + int agsizechanged = 0; + struct buffer_head *bh, *bh2; + + /* If the volume hasn't grown, get out now */ + + if (sbi->mntflag & JFS_INLINELOG) + oldLVSize = addressPXD(&sbi->logpxd) + lengthPXD(&sbi->logpxd); + else + oldLVSize = addressPXD(&sbi->fsckpxd) + + lengthPXD(&sbi->fsckpxd); + + if (oldLVSize >= newLVSize) { + printk(KERN_WARNING + "jfs_extendfs: volume hasn't grown, returning\n"); + goto out; + } + + VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits; + + if (VolumeSize) { + if (newLVSize > VolumeSize) { + printk(KERN_WARNING "jfs_extendfs: invalid size\n"); + rc = -EINVAL; + goto out; + } + } else { + /* check the device */ + bh = sb_bread(sb, newLVSize - 1); + if (!bh) { + printk(KERN_WARNING "jfs_extendfs: invalid size\n"); + rc = -EINVAL; + goto out; + } + bforget(bh); + } + + /* Can't extend write-protected drive */ + + if (isReadOnly(ipbmap)) { + printk(KERN_WARNING "jfs_extendfs: read-only file system\n"); + rc = -EROFS; + goto out; + } + + /* + * reconfigure LV spaces + * --------------------- + * + * validate new size, or, if not specified, determine new size + */ + + /* + * reconfigure inline log space: + */ + if ((sbi->mntflag & JFS_INLINELOG)) { + if (newLogSize == 0) { + /* + * no size specified: default to 1/256 of aggregate + * size; rounded up to a megabyte boundary; + */ + newLogSize = newLVSize >> 8; + t32 = (1 << (20 - sbi->l2bsize)) - 1; + newLogSize = (newLogSize + t32) & ~t32; + newLogSize = + min(newLogSize, MEGABYTE32 >> sbi->l2bsize); + } else { + /* + * convert the newLogSize to fs blocks. + * + * Since this is given in megabytes, it will always be + * an even number of pages. + */ + newLogSize = (newLogSize * MEGABYTE) >> sbi->l2bsize; + } + + } else + newLogSize = 0; + + newLogAddress = newLVSize - newLogSize; + + /* + * reconfigure fsck work space: + * + * configure it to the end of the logical volume regardless of + * whether file system extends to the end of the aggregate; + * Need enough 4k pages to cover: + * - 1 bit per block in aggregate rounded up to BPERDMAP boundary + * - 1 extra page to handle control page and intermediate level pages + * - 50 extra pages for the chkdsk service log + */ + t64 = ((newLVSize - newLogSize + BPERDMAP - 1) >> L2BPERDMAP) + << L2BPERDMAP; + t32 = DIV_ROUND_UP(t64, BITSPERPAGE) + 1 + 50; + newFSCKSize = t32 << sbi->l2nbperpage; + newFSCKAddress = newLogAddress - newFSCKSize; + + /* + * compute new file system space; + */ + newFSSize = newLVSize - newLogSize - newFSCKSize; + + /* file system cannot be shrunk */ + if (newFSSize < bmp->db_mapsize) { + rc = -EINVAL; + goto out; + } + + /* + * If we're expanding enough that the inline log does not overlap + * the old one, we can format the new log before we quiesce the + * filesystem. + */ + if ((sbi->mntflag & JFS_INLINELOG) && (newLogAddress > oldLVSize)) { + if ((rc = lmLogFormat(log, newLogAddress, newLogSize))) + goto out; + log_formatted = 1; + } + /* + * quiesce file system + * + * (prepare to move the inline log and to prevent map update) + * + * block any new transactions and wait for completion of + * all wip transactions and flush modified pages s.t. + * on-disk file system is in consistent state and + * log is not required for recovery. + */ + txQuiesce(sb); + + /* Reset size of direct inode */ + sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode); + + if (sbi->mntflag & JFS_INLINELOG) { + /* + * deactivate old inline log + */ + lmLogShutdown(log); + + /* + * mark on-disk super block for fs in transition; + * + * update on-disk superblock for the new space configuration + * of inline log space and fsck work space descriptors: + * N.B. FS descriptor is NOT updated; + * + * crash recovery: + * logredo(): if FM_EXTENDFS, return to fsck() for cleanup; + * fsck(): if FM_EXTENDFS, reformat inline log and fsck + * workspace from superblock inline log descriptor and fsck + * workspace descriptor; + */ + + /* read in superblock */ + if ((rc = readSuper(sb, &bh))) + goto error_out; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* mark extendfs() in progress */ + j_sb->s_state |= cpu_to_le32(FM_EXTENDFS); + j_sb->s_xsize = cpu_to_le64(newFSSize); + PXDaddress(&j_sb->s_xfsckpxd, newFSCKAddress); + PXDlength(&j_sb->s_xfsckpxd, newFSCKSize); + PXDaddress(&j_sb->s_xlogpxd, newLogAddress); + PXDlength(&j_sb->s_xlogpxd, newLogSize); + + /* synchronously update superblock */ + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + /* + * format new inline log synchronously; + * + * crash recovery: if log move in progress, + * reformat log and exit success; + */ + if (!log_formatted) + if ((rc = lmLogFormat(log, newLogAddress, newLogSize))) + goto error_out; + + /* + * activate new log + */ + log->base = newLogAddress; + log->size = newLogSize >> (L2LOGPSIZE - sb->s_blocksize_bits); + if ((rc = lmLogInit(log))) + goto error_out; + } + + /* + * extend block allocation map + * --------------------------- + * + * extendfs() for new extension, retry after crash recovery; + * + * note: both logredo() and fsck() rebuild map from + * the bitmap and configuration parameter from superblock + * (disregarding all other control information in the map); + * + * superblock: + * s_size: aggregate size in physical blocks; + */ + /* + * compute the new block allocation map configuration + * + * map dinode: + * di_size: map file size in byte; + * di_nblocks: number of blocks allocated for map file; + * di_mapsize: number of blocks in aggregate (covered by map); + * map control page: + * db_mapsize: number of blocks in aggregate (covered by map); + */ + newMapSize = newFSSize; + /* number of data pages of new bmap file: + * roundup new size to full dmap page boundary and + * add 1 extra dmap page for next extendfs() + */ + t64 = (newMapSize - 1) + BPERDMAP; + newNpages = BLKTODMAPN(t64) + 1; + + /* + * extend map from current map (WITHOUT growing mapfile) + * + * map new extension with unmapped part of the last partial + * dmap page, if applicable, and extra page(s) allocated + * at end of bmap by mkfs() or previous extendfs(); + */ + extendBmap: + /* compute number of blocks requested to extend */ + mapSize = bmp->db_mapsize; + XAddress = mapSize; /* eXtension Address */ + XSize = newMapSize - mapSize; /* eXtension Size */ + old_agsize = bmp->db_agsize; /* We need to know if this changes */ + + /* compute number of blocks that can be extended by current mapfile */ + t64 = dbMapFileSizeToMapSize(ipbmap); + if (mapSize > t64) { + printk(KERN_ERR "jfs_extendfs: mapSize (0x%Lx) > t64 (0x%Lx)\n", + (long long) mapSize, (long long) t64); + rc = -EIO; + goto error_out; + } + nblocks = min(t64 - mapSize, XSize); + + /* + * update map pages for new extension: + * + * update/init dmap and bubble up the control hierarchy + * incrementally fold up dmaps into upper levels; + * update bmap control page; + */ + if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) + goto error_out; + + agsizechanged |= (bmp->db_agsize != old_agsize); + + /* + * the map now has extended to cover additional nblocks: + * dn_mapsize = oldMapsize + nblocks; + */ + /* ipbmap->i_mapsize += nblocks; */ + XSize -= nblocks; + + /* + * grow map file to cover remaining extension + * and/or one extra dmap page for next extendfs(); + * + * allocate new map pages and its backing blocks, and + * update map file xtree + */ + /* compute number of data pages of current bmap file */ + nPages = ipbmap->i_size >> L2PSIZE; + + /* need to grow map file ? */ + if (nPages == newNpages) + goto finalizeBmap; + + /* + * grow bmap file for the new map pages required: + * + * allocate growth at the start of newly extended region; + * bmap file only grows sequentially, i.e., both data pages + * and possibly xtree index pages may grow in append mode, + * s.t. logredo() can reconstruct pre-extension state + * by washing away bmap file of pages outside s_size boundary; + */ + /* + * journal map file growth as if a regular file growth: + * (note: bmap is created with di_mode = IFJOURNAL|IFREG); + * + * journaling of bmap file growth is not required since + * logredo() do/can not use log records of bmap file growth + * but it provides careful write semantics, pmap update, etc.; + */ + /* synchronous write of data pages: bmap data pages are + * cached in meta-data cache, and not written out + * by txCommit(); + */ + rc = filemap_fdatawait(ipbmap->i_mapping); + if (rc) + goto error_out; + + rc = filemap_write_and_wait(ipbmap->i_mapping); + if (rc) + goto error_out; + + diWriteSpecial(ipbmap, 0); + + newPage = nPages; /* first new page number */ + xoff = newPage << sbi->l2nbperpage; + xlen = (newNpages - nPages) << sbi->l2nbperpage; + xlen = min(xlen, (int) nblocks) & ~(sbi->nbperpage - 1); + xaddr = XAddress; + + tid = txBegin(sb, COMMIT_FORCE); + + if ((rc = xtAppend(tid, ipbmap, 0, xoff, nblocks, &xlen, &xaddr, 0))) { + txEnd(tid); + goto error_out; + } + /* update bmap file size */ + ipbmap->i_size += xlen << sbi->l2bsize; + inode_add_bytes(ipbmap, xlen << sbi->l2bsize); + + iplist[0] = ipbmap; + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + + if (rc) + goto error_out; + + /* + * map file has been grown now to cover extension to further out; + * di_size = new map file size; + * + * if huge extension, the previous extension based on previous + * map file size may not have been sufficient to cover whole extension + * (it could have been used up for new map pages), + * but the newly grown map file now covers lot bigger new free space + * available for further extension of map; + */ + /* any more blocks to extend ? */ + if (XSize) + goto extendBmap; + + finalizeBmap: + /* finalize bmap */ + dbFinalizeBmap(ipbmap); + + /* + * update inode allocation map + * --------------------------- + * + * move iag lists from old to new iag; + * agstart field is not updated for logredo() to reconstruct + * iag lists if system crash occurs. + * (computation of ag number from agstart based on agsize + * will correctly identify the new ag); + */ + /* if new AG size the same as old AG size, done! */ + if (agsizechanged) { + if ((rc = diExtendFS(ipimap, ipbmap))) + goto error_out; + + /* finalize imap */ + if ((rc = diSync(ipimap))) + goto error_out; + } + + /* + * finalize + * -------- + * + * extension is committed when on-disk super block is + * updated with new descriptors: logredo will recover + * crash before it to pre-extension state; + */ + + /* sync log to skip log replay of bmap file growth transaction; */ + /* lmLogSync(log, 1); */ + + /* + * synchronous write bmap global control page; + * for crash before completion of write + * logredo() will recover to pre-extendfs state; + * for crash after completion of write, + * logredo() will recover post-extendfs state; + */ + if ((rc = dbSync(ipbmap))) + goto error_out; + + /* + * copy primary bmap inode to secondary bmap inode + */ + + ipbmap2 = diReadSpecial(sb, BMAP_I, 1); + if (ipbmap2 == NULL) { + printk(KERN_ERR "jfs_extendfs: diReadSpecial(bmap) failed\n"); + goto error_out; + } + memcpy(&JFS_IP(ipbmap2)->i_xtroot, &JFS_IP(ipbmap)->i_xtroot, 288); + ipbmap2->i_size = ipbmap->i_size; + ipbmap2->i_blocks = ipbmap->i_blocks; + + diWriteSpecial(ipbmap2, 1); + diFreeSpecial(ipbmap2); + + /* + * update superblock + */ + if ((rc = readSuper(sb, &bh))) + goto error_out; + j_sb = (struct jfs_superblock *)bh->b_data; + + /* mark extendfs() completion */ + j_sb->s_state &= cpu_to_le32(~FM_EXTENDFS); + j_sb->s_size = cpu_to_le64(bmp->db_mapsize << + le16_to_cpu(j_sb->s_l2bfactor)); + j_sb->s_agsize = cpu_to_le32(bmp->db_agsize); + + /* update inline log space descriptor */ + if (sbi->mntflag & JFS_INLINELOG) { + PXDaddress(&(j_sb->s_logpxd), newLogAddress); + PXDlength(&(j_sb->s_logpxd), newLogSize); + } + + /* record log's mount serial number */ + j_sb->s_logserial = cpu_to_le32(log->serial); + + /* update fsck work space descriptor */ + PXDaddress(&(j_sb->s_fsckpxd), newFSCKAddress); + PXDlength(&(j_sb->s_fsckpxd), newFSCKSize); + j_sb->s_fscklog = 1; + /* sb->s_fsckloglen remains the same */ + + /* Update secondary superblock */ + bh2 = sb_bread(sb, SUPER2_OFF >> sb->s_blocksize_bits); + if (bh2) { + j_sb2 = (struct jfs_superblock *)bh2->b_data; + memcpy(j_sb2, j_sb, sizeof (struct jfs_superblock)); + + mark_buffer_dirty(bh); + sync_dirty_buffer(bh2); + brelse(bh2); + } + + /* write primary superblock */ + mark_buffer_dirty(bh); + sync_dirty_buffer(bh); + brelse(bh); + + goto resume; + + error_out: + jfs_error(sb, "\n"); + + resume: + /* + * resume file system transactions + */ + txResume(sb); + + out: + return rc; +} diff --git a/fs/jfs/super.c b/fs/jfs/super.c new file mode 100644 index 000000000..09da5cf14 --- /dev/null +++ b/fs/jfs/super.c @@ -0,0 +1,1078 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/parser.h> +#include <linux/completion.h> +#include <linux/vfs.h> +#include <linux/quotaops.h> +#include <linux/mount.h> +#include <linux/moduleparam.h> +#include <linux/kthread.h> +#include <linux/posix_acl.h> +#include <linux/buffer_head.h> +#include <linux/exportfs.h> +#include <linux/crc32.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/blkdev.h> + +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_inode.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_acl.h" +#include "jfs_debug.h" +#include "jfs_xattr.h" +#include "jfs_dinode.h" + +MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); +MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); +MODULE_LICENSE("GPL"); + +static struct kmem_cache *jfs_inode_cachep; + +static const struct super_operations jfs_super_operations; +static const struct export_operations jfs_export_operations; +static struct file_system_type jfs_fs_type; + +#define MAX_COMMIT_THREADS 64 +static int commit_threads; +module_param(commit_threads, int, 0); +MODULE_PARM_DESC(commit_threads, "Number of commit threads"); + +static struct task_struct *jfsCommitThread[MAX_COMMIT_THREADS]; +struct task_struct *jfsIOthread; +struct task_struct *jfsSyncThread; + +#ifdef CONFIG_JFS_DEBUG +int jfsloglevel = JFS_LOGLEVEL_WARN; +module_param(jfsloglevel, int, 0644); +MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)"); +#endif + +static void jfs_handle_error(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (sb_rdonly(sb)) + return; + + updateSuper(sb, FM_DIRTY); + + if (sbi->flag & JFS_ERR_PANIC) + panic("JFS (device %s): panic forced after error\n", + sb->s_id); + else if (sbi->flag & JFS_ERR_REMOUNT_RO) { + jfs_err("ERROR: (device %s): remounting filesystem as read-only", + sb->s_id); + sb->s_flags |= SB_RDONLY; + } + + /* nothing is done for continue beyond marking the superblock dirty */ +} + +void jfs_error(struct super_block *sb, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("ERROR: (device %s): %ps: %pV\n", + sb->s_id, __builtin_return_address(0), &vaf); + + va_end(args); + + jfs_handle_error(sb); +} + +static struct inode *jfs_alloc_inode(struct super_block *sb) +{ + struct jfs_inode_info *jfs_inode; + + jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS); + if (!jfs_inode) + return NULL; +#ifdef CONFIG_QUOTA + memset(&jfs_inode->i_dquot, 0, sizeof(jfs_inode->i_dquot)); +#endif + return &jfs_inode->vfs_inode; +} + +static void jfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct jfs_inode_info *ji = JFS_IP(inode); + kmem_cache_free(jfs_inode_cachep, ji); +} + +static void jfs_destroy_inode(struct inode *inode) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + + BUG_ON(!list_empty(&ji->anon_inode_list)); + + spin_lock_irq(&ji->ag_lock); + if (ji->active_ag != -1) { + struct bmap *bmap = JFS_SBI(inode->i_sb)->bmap; + atomic_dec(&bmap->db_active[ji->active_ag]); + ji->active_ag = -1; + } + spin_unlock_irq(&ji->ag_lock); + call_rcu(&inode->i_rcu, jfs_i_callback); +} + +static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); + s64 maxinodes; + struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap; + + jfs_info("In jfs_statfs"); + buf->f_type = JFS_SUPER_MAGIC; + buf->f_bsize = sbi->bsize; + buf->f_blocks = sbi->bmap->db_mapsize; + buf->f_bfree = sbi->bmap->db_nfree; + buf->f_bavail = sbi->bmap->db_nfree; + /* + * If we really return the number of allocated & free inodes, some + * applications will fail because they won't see enough free inodes. + * We'll try to calculate some guess as to how many inodes we can + * really allocate + * + * buf->f_files = atomic_read(&imap->im_numinos); + * buf->f_ffree = atomic_read(&imap->im_numfree); + */ + maxinodes = min((s64) atomic_read(&imap->im_numinos) + + ((sbi->bmap->db_nfree >> imap->im_l2nbperiext) + << L2INOSPEREXT), (s64) 0xffffffffLL); + buf->f_files = maxinodes; + buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - + atomic_read(&imap->im_numfree)); + buf->f_fsid.val[0] = (u32)crc32_le(0, sbi->uuid, sizeof(sbi->uuid)/2); + buf->f_fsid.val[1] = (u32)crc32_le(0, sbi->uuid + sizeof(sbi->uuid)/2, + sizeof(sbi->uuid)/2); + + buf->f_namelen = JFS_NAME_MAX; + return 0; +} + +#ifdef CONFIG_QUOTA +static int jfs_quota_off(struct super_block *sb, int type); +static int jfs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path); + +static void jfs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + jfs_quota_off(sb, type); +} + +static const struct quotactl_ops jfs_quotactl_ops = { + .quota_on = jfs_quota_on, + .quota_off = jfs_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; +#else +static inline void jfs_quota_off_umount(struct super_block *sb) +{ +} +#endif + +static void jfs_put_super(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + int rc; + + jfs_info("In jfs_put_super"); + + jfs_quota_off_umount(sb); + + rc = jfs_umount(sb); + if (rc) + jfs_err("jfs_umount failed with return code %d", rc); + + unload_nls(sbi->nls_tab); + + truncate_inode_pages(sbi->direct_inode->i_mapping, 0); + iput(sbi->direct_inode); + + kfree(sbi); +} + +enum { + Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, + Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota, + Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask, + Opt_discard, Opt_nodiscard, Opt_discard_minblk +}; + +static const match_table_t tokens = { + {Opt_integrity, "integrity"}, + {Opt_nointegrity, "nointegrity"}, + {Opt_iocharset, "iocharset=%s"}, + {Opt_resize, "resize=%u"}, + {Opt_resize_nosize, "resize"}, + {Opt_errors, "errors=%s"}, + {Opt_ignore, "noquota"}, + {Opt_ignore, "quota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_umask, "umask=%u"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_discard_minblk, "discard=%u"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, + int *flag) +{ + void *nls_map = (void *)-1; /* -1: no change; NULL: none */ + char *p; + struct jfs_sb_info *sbi = JFS_SBI(sb); + + *newLVSize = 0; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_integrity: + *flag &= ~JFS_NOINTEGRITY; + break; + case Opt_nointegrity: + *flag |= JFS_NOINTEGRITY; + break; + case Opt_ignore: + /* Silently ignore the quota options */ + /* Don't do anything ;-) */ + break; + case Opt_iocharset: + if (nls_map && nls_map != (void *) -1) + unload_nls(nls_map); + if (!strcmp(args[0].from, "none")) + nls_map = NULL; + else { + nls_map = load_nls(args[0].from); + if (!nls_map) { + pr_err("JFS: charset not found\n"); + goto cleanup; + } + } + break; + case Opt_resize: + { + char *resize = args[0].from; + int rc = kstrtoll(resize, 0, newLVSize); + + if (rc) + goto cleanup; + break; + } + case Opt_resize_nosize: + { + *newLVSize = i_size_read(sb->s_bdev->bd_inode) >> + sb->s_blocksize_bits; + if (*newLVSize == 0) + pr_err("JFS: Cannot determine volume size\n"); + break; + } + case Opt_errors: + { + char *errors = args[0].from; + if (!errors || !*errors) + goto cleanup; + if (!strcmp(errors, "continue")) { + *flag &= ~JFS_ERR_REMOUNT_RO; + *flag &= ~JFS_ERR_PANIC; + *flag |= JFS_ERR_CONTINUE; + } else if (!strcmp(errors, "remount-ro")) { + *flag &= ~JFS_ERR_CONTINUE; + *flag &= ~JFS_ERR_PANIC; + *flag |= JFS_ERR_REMOUNT_RO; + } else if (!strcmp(errors, "panic")) { + *flag &= ~JFS_ERR_CONTINUE; + *flag &= ~JFS_ERR_REMOUNT_RO; + *flag |= JFS_ERR_PANIC; + } else { + pr_err("JFS: %s is an invalid error handler\n", + errors); + goto cleanup; + } + break; + } + +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + *flag |= JFS_USRQUOTA; + break; + case Opt_grpquota: + *flag |= JFS_GRPQUOTA; + break; +#else + case Opt_usrquota: + case Opt_grpquota: + case Opt_quota: + pr_err("JFS: quota operations not supported\n"); + break; +#endif + case Opt_uid: + { + char *uid = args[0].from; + uid_t val; + int rc = kstrtouint(uid, 0, &val); + + if (rc) + goto cleanup; + sbi->uid = make_kuid(current_user_ns(), val); + if (!uid_valid(sbi->uid)) + goto cleanup; + break; + } + + case Opt_gid: + { + char *gid = args[0].from; + gid_t val; + int rc = kstrtouint(gid, 0, &val); + + if (rc) + goto cleanup; + sbi->gid = make_kgid(current_user_ns(), val); + if (!gid_valid(sbi->gid)) + goto cleanup; + break; + } + + case Opt_umask: + { + char *umask = args[0].from; + int rc = kstrtouint(umask, 8, &sbi->umask); + + if (rc) + goto cleanup; + if (sbi->umask & ~0777) { + pr_err("JFS: Invalid value of umask\n"); + goto cleanup; + } + break; + } + + case Opt_discard: + { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + /* if set to 1, even copying files will cause + * trimming :O + * -> user has more control over the online trimming + */ + sbi->minblks_trim = 64; + if (blk_queue_discard(q)) + *flag |= JFS_DISCARD; + else + pr_err("JFS: discard option not supported on device\n"); + break; + } + + case Opt_nodiscard: + *flag &= ~JFS_DISCARD; + break; + + case Opt_discard_minblk: + { + struct request_queue *q = bdev_get_queue(sb->s_bdev); + char *minblks_trim = args[0].from; + int rc; + if (blk_queue_discard(q)) { + *flag |= JFS_DISCARD; + rc = kstrtouint(minblks_trim, 0, + &sbi->minblks_trim); + if (rc) + goto cleanup; + } else + pr_err("JFS: discard option not supported on device\n"); + break; + } + + default: + printk("jfs: Unrecognized mount option \"%s\" or missing value\n", + p); + goto cleanup; + } + } + + if (nls_map != (void *) -1) { + /* Discard old (if remount) */ + unload_nls(sbi->nls_tab); + sbi->nls_tab = nls_map; + } + return 1; + +cleanup: + if (nls_map && nls_map != (void *) -1) + unload_nls(nls_map); + return 0; +} + +static int jfs_remount(struct super_block *sb, int *flags, char *data) +{ + s64 newLVSize = 0; + int rc = 0; + int flag = JFS_SBI(sb)->flag; + int ret; + + sync_filesystem(sb); + if (!parse_options(data, sb, &newLVSize, &flag)) + return -EINVAL; + + if (newLVSize) { + if (sb_rdonly(sb)) { + pr_err("JFS: resize requires volume to be mounted read-write\n"); + return -EROFS; + } + rc = jfs_extendfs(sb, newLVSize, 0); + if (rc) + return rc; + } + + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { + /* + * Invalidate any previously read metadata. fsck may have + * changed the on-disk data since we mounted r/o + */ + truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0); + + JFS_SBI(sb)->flag = flag; + ret = jfs_mount_rw(sb, 1); + + /* mark the fs r/w for quota activity */ + sb->s_flags &= ~SB_RDONLY; + + dquot_resume(sb, -1); + return ret; + } + if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { + rc = dquot_suspend(sb, -1); + if (rc < 0) + return rc; + rc = jfs_umount_rw(sb); + JFS_SBI(sb)->flag = flag; + return rc; + } + if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) + if (!sb_rdonly(sb)) { + rc = jfs_umount_rw(sb); + if (rc) + return rc; + + JFS_SBI(sb)->flag = flag; + ret = jfs_mount_rw(sb, 1); + return ret; + } + JFS_SBI(sb)->flag = flag; + + return 0; +} + +static int jfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct jfs_sb_info *sbi; + struct inode *inode; + int rc; + s64 newLVSize = 0; + int flag, ret = -EINVAL; + + jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags); + + sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + sb->s_max_links = JFS_LINK_MAX; + sbi->sb = sb; + sbi->uid = INVALID_UID; + sbi->gid = INVALID_GID; + sbi->umask = -1; + + /* initialize the mount flag and determine the default error handler */ + flag = JFS_ERR_REMOUNT_RO; + + if (!parse_options((char *) data, sb, &newLVSize, &flag)) + goto out_kfree; + sbi->flag = flag; + +#ifdef CONFIG_JFS_POSIX_ACL + sb->s_flags |= SB_POSIXACL; +#endif + + if (newLVSize) { + pr_err("resize option for remount only\n"); + goto out_kfree; + } + + /* + * Initialize blocksize to 4K. + */ + sb_set_blocksize(sb, PSIZE); + + /* + * Set method vectors. + */ + sb->s_op = &jfs_super_operations; + sb->s_export_op = &jfs_export_operations; + sb->s_xattr = jfs_xattr_handlers; +#ifdef CONFIG_QUOTA + sb->dq_op = &dquot_operations; + sb->s_qcop = &jfs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; +#endif + + /* + * Initialize direct-mapping inode/address-space + */ + inode = new_inode(sb); + if (inode == NULL) { + ret = -ENOMEM; + goto out_unload; + } + inode->i_ino = 0; + inode->i_size = i_size_read(sb->s_bdev->bd_inode); + inode->i_mapping->a_ops = &jfs_metapage_aops; + inode_fake_hash(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + sbi->direct_inode = inode; + + rc = jfs_mount(sb); + if (rc) { + if (!silent) + jfs_err("jfs_mount failed w/return code = %d", rc); + goto out_mount_failed; + } + if (sb_rdonly(sb)) + sbi->log = NULL; + else { + rc = jfs_mount_rw(sb, 0); + if (rc) { + if (!silent) { + jfs_err("jfs_mount_rw failed, return code = %d", + rc); + } + goto out_no_rw; + } + } + + sb->s_magic = JFS_SUPER_MAGIC; + + if (sbi->mntflag & JFS_OS2) + sb->s_d_op = &jfs_ci_dentry_operations; + + inode = jfs_iget(sb, ROOT_I); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto out_no_rw; + } + sb->s_root = d_make_root(inode); + if (!sb->s_root) + goto out_no_root; + + /* logical blocks are represented by 40 bits in pxd_t, etc. + * and page cache is indexed by long + */ + sb->s_maxbytes = min(((loff_t)sb->s_blocksize) << 40, MAX_LFS_FILESIZE); + sb->s_time_gran = 1; + return 0; + +out_no_root: + jfs_err("jfs_read_super: get root dentry failed"); + +out_no_rw: + rc = jfs_umount(sb); + if (rc) + jfs_err("jfs_umount failed with return code %d", rc); +out_mount_failed: + filemap_write_and_wait(sbi->direct_inode->i_mapping); + truncate_inode_pages(sbi->direct_inode->i_mapping, 0); + make_bad_inode(sbi->direct_inode); + iput(sbi->direct_inode); + sbi->direct_inode = NULL; +out_unload: + unload_nls(sbi->nls_tab); +out_kfree: + kfree(sbi); + return ret; +} + +static int jfs_freeze(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + int rc = 0; + + if (!sb_rdonly(sb)) { + txQuiesce(sb); + rc = lmLogShutdown(log); + if (rc) { + jfs_error(sb, "lmLogShutdown failed\n"); + + /* let operations fail rather than hang */ + txResume(sb); + + return rc; + } + rc = updateSuper(sb, FM_CLEAN); + if (rc) { + jfs_err("jfs_freeze: updateSuper failed"); + /* + * Don't fail here. Everything succeeded except + * marking the superblock clean, so there's really + * no harm in leaving it frozen for now. + */ + } + } + return 0; +} + +static int jfs_unfreeze(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_log *log = sbi->log; + int rc = 0; + + if (!sb_rdonly(sb)) { + rc = updateSuper(sb, FM_MOUNT); + if (rc) { + jfs_error(sb, "updateSuper failed\n"); + goto out; + } + rc = lmLogInit(log); + if (rc) + jfs_error(sb, "lmLogInit failed\n"); +out: + txResume(sb); + } + return rc; +} + +static struct dentry *jfs_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super); +} + +static int jfs_sync_fs(struct super_block *sb, int wait) +{ + struct jfs_log *log = JFS_SBI(sb)->log; + + /* log == NULL indicates read-only mount */ + if (log) { + /* + * Write quota structures to quota file, sync_blockdev() will + * write them to disk later + */ + dquot_writeback_dquots(sb, -1); + jfs_flush_journal(log, wait); + jfs_syncpt(log, 0); + } + + return 0; +} + +static int jfs_show_options(struct seq_file *seq, struct dentry *root) +{ + struct jfs_sb_info *sbi = JFS_SBI(root->d_sb); + + if (uid_valid(sbi->uid)) + seq_printf(seq, ",uid=%d", from_kuid(&init_user_ns, sbi->uid)); + if (gid_valid(sbi->gid)) + seq_printf(seq, ",gid=%d", from_kgid(&init_user_ns, sbi->gid)); + if (sbi->umask != -1) + seq_printf(seq, ",umask=%03o", sbi->umask); + if (sbi->flag & JFS_NOINTEGRITY) + seq_puts(seq, ",nointegrity"); + if (sbi->flag & JFS_DISCARD) + seq_printf(seq, ",discard=%u", sbi->minblks_trim); + if (sbi->nls_tab) + seq_printf(seq, ",iocharset=%s", sbi->nls_tab->charset); + if (sbi->flag & JFS_ERR_CONTINUE) + seq_printf(seq, ",errors=continue"); + if (sbi->flag & JFS_ERR_PANIC) + seq_printf(seq, ",errors=panic"); + +#ifdef CONFIG_QUOTA + if (sbi->flag & JFS_USRQUOTA) + seq_puts(seq, ",usrquota"); + + if (sbi->flag & JFS_GRPQUOTA) + seq_puts(seq, ",grpquota"); +#endif + + return 0; +} + +#ifdef CONFIG_QUOTA + +/* Read data from quotafile - avoid pagecache and such because we cannot afford + * acquiring the locks... As quota files are never truncated and quota code + * itself serializes the operations (and no one else should touch the files) + * we don't have to be afraid of races */ +static ssize_t jfs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + struct buffer_head tmp_bh; + struct buffer_head *bh; + loff_t i_size = i_size_read(inode); + + if (off > i_size) + return 0; + if (off+len > i_size) + len = i_size-off; + toread = len; + while (toread > 0) { + tocopy = sb->s_blocksize - offset < toread ? + sb->s_blocksize - offset : toread; + + tmp_bh.b_state = 0; + tmp_bh.b_size = i_blocksize(inode); + err = jfs_get_block(inode, blk, &tmp_bh, 0); + if (err) + return err; + if (!buffer_mapped(&tmp_bh)) /* A hole? */ + memset(data, 0, tocopy); + else { + bh = sb_bread(sb, tmp_bh.b_blocknr); + if (!bh) + return -EIO; + memcpy(data, bh->b_data+offset, tocopy); + brelse(bh); + } + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t jfs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t towrite = len; + struct buffer_head tmp_bh; + struct buffer_head *bh; + + inode_lock(inode); + while (towrite > 0) { + tocopy = sb->s_blocksize - offset < towrite ? + sb->s_blocksize - offset : towrite; + + tmp_bh.b_state = 0; + tmp_bh.b_size = i_blocksize(inode); + err = jfs_get_block(inode, blk, &tmp_bh, 1); + if (err) + goto out; + if (offset || tocopy != sb->s_blocksize) + bh = sb_bread(sb, tmp_bh.b_blocknr); + else + bh = sb_getblk(sb, tmp_bh.b_blocknr); + if (!bh) { + err = -EIO; + goto out; + } + lock_buffer(bh); + memcpy(bh->b_data+offset, data, tocopy); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + brelse(bh); + offset = 0; + towrite -= tocopy; + data += tocopy; + blk++; + } +out: + if (len == towrite) { + inode_unlock(inode); + return err; + } + if (inode->i_size < off+len-towrite) + i_size_write(inode, off+len-towrite); + inode->i_mtime = inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + inode_unlock(inode); + return len - towrite; +} + +static struct dquot **jfs_get_dquots(struct inode *inode) +{ + return JFS_IP(inode)->i_dquot; +} + +static int jfs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + int err; + struct inode *inode; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + inode_lock(inode); + JFS_IP(inode)->mode2 |= JFS_NOATIME_FL | JFS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); + + return 0; +} + +static int jfs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + JFS_IP(inode)->mode2 &= ~(JFS_NOATIME_FL | JFS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); +out_put: + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} +#endif + +static const struct super_operations jfs_super_operations = { + .alloc_inode = jfs_alloc_inode, + .destroy_inode = jfs_destroy_inode, + .dirty_inode = jfs_dirty_inode, + .write_inode = jfs_write_inode, + .evict_inode = jfs_evict_inode, + .put_super = jfs_put_super, + .sync_fs = jfs_sync_fs, + .freeze_fs = jfs_freeze, + .unfreeze_fs = jfs_unfreeze, + .statfs = jfs_statfs, + .remount_fs = jfs_remount, + .show_options = jfs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = jfs_quota_read, + .quota_write = jfs_quota_write, + .get_dquots = jfs_get_dquots, +#endif +}; + +static const struct export_operations jfs_export_operations = { + .fh_to_dentry = jfs_fh_to_dentry, + .fh_to_parent = jfs_fh_to_parent, + .get_parent = jfs_get_parent, +}; + +static struct file_system_type jfs_fs_type = { + .owner = THIS_MODULE, + .name = "jfs", + .mount = jfs_do_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("jfs"); + +static void init_once(void *foo) +{ + struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; + + memset(jfs_ip, 0, sizeof(struct jfs_inode_info)); + INIT_LIST_HEAD(&jfs_ip->anon_inode_list); + init_rwsem(&jfs_ip->rdwrlock); + mutex_init(&jfs_ip->commit_mutex); + init_rwsem(&jfs_ip->xattr_sem); + spin_lock_init(&jfs_ip->ag_lock); + jfs_ip->active_ag = -1; + inode_init_once(&jfs_ip->vfs_inode); +} + +static int __init init_jfs_fs(void) +{ + int i; + int rc; + + jfs_inode_cachep = + kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info), + 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, + offsetof(struct jfs_inode_info, i_inline), IDATASIZE, + init_once); + if (jfs_inode_cachep == NULL) + return -ENOMEM; + + /* + * Metapage initialization + */ + rc = metapage_init(); + if (rc) { + jfs_err("metapage_init failed w/rc = %d", rc); + goto free_slab; + } + + /* + * Transaction Manager initialization + */ + rc = txInit(); + if (rc) { + jfs_err("txInit failed w/rc = %d", rc); + goto free_metapage; + } + + /* + * I/O completion thread (endio) + */ + jfsIOthread = kthread_run(jfsIOWait, NULL, "jfsIO"); + if (IS_ERR(jfsIOthread)) { + rc = PTR_ERR(jfsIOthread); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + goto end_txmngr; + } + + if (commit_threads < 1) + commit_threads = num_online_cpus(); + if (commit_threads > MAX_COMMIT_THREADS) + commit_threads = MAX_COMMIT_THREADS; + + for (i = 0; i < commit_threads; i++) { + jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, + "jfsCommit"); + if (IS_ERR(jfsCommitThread[i])) { + rc = PTR_ERR(jfsCommitThread[i]); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + commit_threads = i; + goto kill_committask; + } + } + + jfsSyncThread = kthread_run(jfs_sync, NULL, "jfsSync"); + if (IS_ERR(jfsSyncThread)) { + rc = PTR_ERR(jfsSyncThread); + jfs_err("init_jfs_fs: fork failed w/rc = %d", rc); + goto kill_committask; + } + +#ifdef PROC_FS_JFS + jfs_proc_init(); +#endif + + rc = register_filesystem(&jfs_fs_type); + if (!rc) + return 0; + +#ifdef PROC_FS_JFS + jfs_proc_clean(); +#endif + kthread_stop(jfsSyncThread); +kill_committask: + for (i = 0; i < commit_threads; i++) + kthread_stop(jfsCommitThread[i]); + kthread_stop(jfsIOthread); +end_txmngr: + txExit(); +free_metapage: + metapage_exit(); +free_slab: + kmem_cache_destroy(jfs_inode_cachep); + return rc; +} + +static void __exit exit_jfs_fs(void) +{ + int i; + + jfs_info("exit_jfs_fs called"); + + txExit(); + metapage_exit(); + + kthread_stop(jfsIOthread); + for (i = 0; i < commit_threads; i++) + kthread_stop(jfsCommitThread[i]); + kthread_stop(jfsSyncThread); +#ifdef PROC_FS_JFS + jfs_proc_clean(); +#endif + unregister_filesystem(&jfs_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(jfs_inode_cachep); +} + +module_init(init_jfs_fs) +module_exit(exit_jfs_fs) diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c new file mode 100644 index 000000000..383206079 --- /dev/null +++ b/fs/jfs/symlink.c @@ -0,0 +1,35 @@ +/* + * Copyright (C) Christoph Hellwig, 2001-2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/fs.h> +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_xattr.h" + +const struct inode_operations jfs_fast_symlink_inode_operations = { + .get_link = simple_get_link, + .setattr = jfs_setattr, + .listxattr = jfs_listxattr, +}; + +const struct inode_operations jfs_symlink_inode_operations = { + .get_link = page_get_link, + .setattr = jfs_setattr, + .listxattr = jfs_listxattr, +}; + diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c new file mode 100644 index 000000000..a6797986b --- /dev/null +++ b/fs/jfs/xattr.c @@ -0,0 +1,1046 @@ +/* + * Copyright (C) International Business Machines Corp., 2000-2004 + * Copyright (C) Christoph Hellwig, 2002 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/slab.h> +#include <linux/quotaops.h> +#include <linux/security.h> +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_debug.h" +#include "jfs_dinode.h" +#include "jfs_extent.h" +#include "jfs_metapage.h" +#include "jfs_xattr.h" +#include "jfs_acl.h" + +/* + * jfs_xattr.c: extended attribute service + * + * Overall design -- + * + * Format: + * + * Extended attribute lists (jfs_ea_list) consist of an overall size (32 bit + * value) and a variable (0 or more) number of extended attribute + * entries. Each extended attribute entry (jfs_ea) is a <name,value> double + * where <name> is constructed from a null-terminated ascii string + * (1 ... 255 bytes in the name) and <value> is arbitrary 8 bit data + * (1 ... 65535 bytes). The in-memory format is + * + * 0 1 2 4 4 + namelen + 1 + * +-------+--------+--------+----------------+-------------------+ + * | Flags | Name | Value | Name String \0 | Data . . . . | + * | | Length | Length | | | + * +-------+--------+--------+----------------+-------------------+ + * + * A jfs_ea_list then is structured as + * + * 0 4 4 + EA_SIZE(ea1) + * +------------+-------------------+--------------------+----- + * | Overall EA | First FEA Element | Second FEA Element | ..... + * | List Size | | | + * +------------+-------------------+--------------------+----- + * + * On-disk: + * + * FEALISTs are stored on disk using blocks allocated by dbAlloc() and + * written directly. An EA list may be in-lined in the inode if there is + * sufficient room available. + */ + +struct ea_buffer { + int flag; /* Indicates what storage xattr points to */ + int max_size; /* largest xattr that fits in current buffer */ + dxd_t new_ea; /* dxd to replace ea when modifying xattr */ + struct metapage *mp; /* metapage containing ea list */ + struct jfs_ea_list *xattr; /* buffer containing ea list */ +}; + +/* + * ea_buffer.flag values + */ +#define EA_INLINE 0x0001 +#define EA_EXTENT 0x0002 +#define EA_NEW 0x0004 +#define EA_MALLOC 0x0008 + + +/* + * Mapping of on-disk attribute names: for on-disk attribute names with an + * unknown prefix (not "system.", "user.", "security.", or "trusted."), the + * prefix "os2." is prepended. On the way back to disk, "os2." prefixes are + * stripped and we make sure that the remaining name does not start with one + * of the know prefixes. + */ + +static int is_known_namespace(const char *name) +{ + if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && + strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + return false; + + return true; +} + +static inline int name_size(struct jfs_ea *ea) +{ + if (is_known_namespace(ea->name)) + return ea->namelen; + else + return ea->namelen + XATTR_OS2_PREFIX_LEN; +} + +static inline int copy_name(char *buffer, struct jfs_ea *ea) +{ + int len = ea->namelen; + + if (!is_known_namespace(ea->name)) { + memcpy(buffer, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN); + buffer += XATTR_OS2_PREFIX_LEN; + len += XATTR_OS2_PREFIX_LEN; + } + memcpy(buffer, ea->name, ea->namelen); + buffer[ea->namelen] = 0; + + return len; +} + +/* Forward references */ +static void ea_release(struct inode *inode, struct ea_buffer *ea_buf); + +/* + * NAME: ea_write_inline + * + * FUNCTION: Attempt to write an EA inline if area is available + * + * PRE CONDITIONS: + * Already verified that the specified EA is small enough to fit inline + * + * PARAMETERS: + * ip - Inode pointer + * ealist - EA list pointer + * size - size of ealist in bytes + * ea - dxd_t structure to be filled in with necessary EA information + * if we successfully copy the EA inline + * + * NOTES: + * Checks if the inode's inline area is available. If so, copies EA inline + * and sets <ea> fields appropriately. Otherwise, returns failure, EA will + * have to be put into an extent. + * + * RETURNS: 0 for successful copy to inline area; -1 if area not available + */ +static int ea_write_inline(struct inode *ip, struct jfs_ea_list *ealist, + int size, dxd_t * ea) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + + /* + * Make sure we have an EA -- the NULL EA list is valid, but you + * can't copy it! + */ + if (ealist && size > sizeof (struct jfs_ea_list)) { + assert(size <= sizeof (ji->i_inline_ea)); + + /* + * See if the space is available or if it is already being + * used for an inline EA. + */ + if (!(ji->mode2 & INLINEEA) && !(ji->ea.flag & DXD_INLINE)) + return -EPERM; + + DXDsize(ea, size); + DXDlength(ea, 0); + DXDaddress(ea, 0); + memcpy(ji->i_inline_ea, ealist, size); + ea->flag = DXD_INLINE; + ji->mode2 &= ~INLINEEA; + } else { + ea->flag = 0; + DXDsize(ea, 0); + DXDlength(ea, 0); + DXDaddress(ea, 0); + + /* Free up INLINE area */ + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + } + + return 0; +} + +/* + * NAME: ea_write + * + * FUNCTION: Write an EA for an inode + * + * PRE CONDITIONS: EA has been verified + * + * PARAMETERS: + * ip - Inode pointer + * ealist - EA list pointer + * size - size of ealist in bytes + * ea - dxd_t structure to be filled in appropriately with where the + * EA was copied + * + * NOTES: Will write EA inline if able to, otherwise allocates blocks for an + * extent and synchronously writes it to those blocks. + * + * RETURNS: 0 for success; Anything else indicates failure + */ +static int ea_write(struct inode *ip, struct jfs_ea_list *ealist, int size, + dxd_t * ea) +{ + struct super_block *sb = ip->i_sb; + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(sb); + int nblocks; + s64 blkno; + int rc = 0, i; + char *cp; + s32 nbytes, nb; + s32 bytes_to_write; + struct metapage *mp; + + /* + * Quick check to see if this is an in-linable EA. Short EAs + * and empty EAs are all in-linable, provided the space exists. + */ + if (!ealist || size <= sizeof (ji->i_inline_ea)) { + if (!ea_write_inline(ip, ealist, size, ea)) + return 0; + } + + /* figure out how many blocks we need */ + nblocks = (size + (sb->s_blocksize - 1)) >> sb->s_blocksize_bits; + + /* Allocate new blocks to quota. */ + rc = dquot_alloc_block(ip, nblocks); + if (rc) + return rc; + + rc = dbAlloc(ip, INOHINT(ip), nblocks, &blkno); + if (rc) { + /*Rollback quota allocation. */ + dquot_free_block(ip, nblocks); + return rc; + } + + /* + * Now have nblocks worth of storage to stuff into the FEALIST. + * loop over the FEALIST copying data into the buffer one page at + * a time. + */ + cp = (char *) ealist; + nbytes = size; + for (i = 0; i < nblocks; i += sbi->nbperpage) { + /* + * Determine how many bytes for this request, and round up to + * the nearest aggregate block size + */ + nb = min(PSIZE, nbytes); + bytes_to_write = + ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits)) + << sb->s_blocksize_bits; + + if (!(mp = get_metapage(ip, blkno + i, bytes_to_write, 1))) { + rc = -EIO; + goto failed; + } + + memcpy(mp->data, cp, nb); + + /* + * We really need a way to propagate errors for + * forced writes like this one. --hch + * + * (__write_metapage => release_metapage => flush_metapage) + */ +#ifdef _JFS_FIXME + if ((rc = flush_metapage(mp))) { + /* + * the write failed -- this means that the buffer + * is still assigned and the blocks are not being + * used. this seems like the best error recovery + * we can get ... + */ + goto failed; + } +#else + flush_metapage(mp); +#endif + + cp += PSIZE; + nbytes -= nb; + } + + ea->flag = DXD_EXTENT; + DXDsize(ea, le32_to_cpu(ealist->size)); + DXDlength(ea, nblocks); + DXDaddress(ea, blkno); + + /* Free up INLINE area */ + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + + return 0; + + failed: + /* Rollback quota allocation. */ + dquot_free_block(ip, nblocks); + + dbFree(ip, blkno, nblocks); + return rc; +} + +/* + * NAME: ea_read_inline + * + * FUNCTION: Read an inlined EA into user's buffer + * + * PARAMETERS: + * ip - Inode pointer + * ealist - Pointer to buffer to fill in with EA + * + * RETURNS: 0 + */ +static int ea_read_inline(struct inode *ip, struct jfs_ea_list *ealist) +{ + struct jfs_inode_info *ji = JFS_IP(ip); + int ea_size = sizeDXD(&ji->ea); + + if (ea_size == 0) { + ealist->size = 0; + return 0; + } + + /* Sanity Check */ + if ((sizeDXD(&ji->ea) > sizeof (ji->i_inline_ea))) + return -EIO; + if (le32_to_cpu(((struct jfs_ea_list *) &ji->i_inline_ea)->size) + != ea_size) + return -EIO; + + memcpy(ealist, ji->i_inline_ea, ea_size); + return 0; +} + +/* + * NAME: ea_read + * + * FUNCTION: copy EA data into user's buffer + * + * PARAMETERS: + * ip - Inode pointer + * ealist - Pointer to buffer to fill in with EA + * + * NOTES: If EA is inline calls ea_read_inline() to copy EA. + * + * RETURNS: 0 for success; other indicates failure + */ +static int ea_read(struct inode *ip, struct jfs_ea_list *ealist) +{ + struct super_block *sb = ip->i_sb; + struct jfs_inode_info *ji = JFS_IP(ip); + struct jfs_sb_info *sbi = JFS_SBI(sb); + int nblocks; + s64 blkno; + char *cp = (char *) ealist; + int i; + int nbytes, nb; + s32 bytes_to_read; + struct metapage *mp; + + /* quick check for in-line EA */ + if (ji->ea.flag & DXD_INLINE) + return ea_read_inline(ip, ealist); + + nbytes = sizeDXD(&ji->ea); + if (!nbytes) { + jfs_error(sb, "nbytes is 0\n"); + return -EIO; + } + + /* + * Figure out how many blocks were allocated when this EA list was + * originally written to disk. + */ + nblocks = lengthDXD(&ji->ea) << sbi->l2nbperpage; + blkno = addressDXD(&ji->ea) << sbi->l2nbperpage; + + /* + * I have found the disk blocks which were originally used to store + * the FEALIST. now i loop over each contiguous block copying the + * data into the buffer. + */ + for (i = 0; i < nblocks; i += sbi->nbperpage) { + /* + * Determine how many bytes for this request, and round up to + * the nearest aggregate block size + */ + nb = min(PSIZE, nbytes); + bytes_to_read = + ((((nb + sb->s_blocksize - 1)) >> sb->s_blocksize_bits)) + << sb->s_blocksize_bits; + + if (!(mp = read_metapage(ip, blkno + i, bytes_to_read, 1))) + return -EIO; + + memcpy(cp, mp->data, nb); + release_metapage(mp); + + cp += PSIZE; + nbytes -= nb; + } + + return 0; +} + +/* + * NAME: ea_get + * + * FUNCTION: Returns buffer containing existing extended attributes. + * The size of the buffer will be the larger of the existing + * attributes size, or min_size. + * + * The buffer, which may be inlined in the inode or in the + * page cache must be release by calling ea_release or ea_put + * + * PARAMETERS: + * inode - Inode pointer + * ea_buf - Structure to be populated with ealist and its metadata + * min_size- minimum size of buffer to be returned + * + * RETURNS: 0 for success; Other indicates failure + */ +static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + struct super_block *sb = inode->i_sb; + int size; + int ea_size = sizeDXD(&ji->ea); + int blocks_needed, current_blocks; + s64 blkno; + int rc; + int quota_allocation = 0; + + /* When fsck.jfs clears a bad ea, it doesn't clear the size */ + if (ji->ea.flag == 0) + ea_size = 0; + + if (ea_size == 0) { + if (min_size == 0) { + ea_buf->flag = 0; + ea_buf->max_size = 0; + ea_buf->xattr = NULL; + return 0; + } + if ((min_size <= sizeof (ji->i_inline_ea)) && + (ji->mode2 & INLINEEA)) { + ea_buf->flag = EA_INLINE | EA_NEW; + ea_buf->max_size = sizeof (ji->i_inline_ea); + ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea; + DXDlength(&ea_buf->new_ea, 0); + DXDaddress(&ea_buf->new_ea, 0); + ea_buf->new_ea.flag = DXD_INLINE; + DXDsize(&ea_buf->new_ea, min_size); + return 0; + } + current_blocks = 0; + } else if (ji->ea.flag & DXD_INLINE) { + if (min_size <= sizeof (ji->i_inline_ea)) { + ea_buf->flag = EA_INLINE; + ea_buf->max_size = sizeof (ji->i_inline_ea); + ea_buf->xattr = (struct jfs_ea_list *) ji->i_inline_ea; + goto size_check; + } + current_blocks = 0; + } else { + if (!(ji->ea.flag & DXD_EXTENT)) { + jfs_error(sb, "invalid ea.flag\n"); + return -EIO; + } + current_blocks = (ea_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + } + size = max(min_size, ea_size); + + if (size > PSIZE) { + /* + * To keep the rest of the code simple. Allocate a + * contiguous buffer to work with. Make the buffer large + * enough to make use of the whole extent. + */ + ea_buf->max_size = (size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + ea_buf->xattr = kmalloc(ea_buf->max_size, GFP_KERNEL); + if (ea_buf->xattr == NULL) + return -ENOMEM; + + ea_buf->flag = EA_MALLOC; + + if (ea_size == 0) + return 0; + + if ((rc = ea_read(inode, ea_buf->xattr))) { + kfree(ea_buf->xattr); + ea_buf->xattr = NULL; + return rc; + } + goto size_check; + } + blocks_needed = (min_size + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + + if (blocks_needed > current_blocks) { + /* Allocate new blocks to quota. */ + rc = dquot_alloc_block(inode, blocks_needed); + if (rc) + return -EDQUOT; + + quota_allocation = blocks_needed; + + rc = dbAlloc(inode, INOHINT(inode), (s64) blocks_needed, + &blkno); + if (rc) + goto clean_up; + + DXDlength(&ea_buf->new_ea, blocks_needed); + DXDaddress(&ea_buf->new_ea, blkno); + ea_buf->new_ea.flag = DXD_EXTENT; + DXDsize(&ea_buf->new_ea, min_size); + + ea_buf->flag = EA_EXTENT | EA_NEW; + + ea_buf->mp = get_metapage(inode, blkno, + blocks_needed << sb->s_blocksize_bits, + 1); + if (ea_buf->mp == NULL) { + dbFree(inode, blkno, (s64) blocks_needed); + rc = -EIO; + goto clean_up; + } + ea_buf->xattr = ea_buf->mp->data; + ea_buf->max_size = (min_size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + if (ea_size == 0) + return 0; + if ((rc = ea_read(inode, ea_buf->xattr))) { + discard_metapage(ea_buf->mp); + dbFree(inode, blkno, (s64) blocks_needed); + goto clean_up; + } + goto size_check; + } + ea_buf->flag = EA_EXTENT; + ea_buf->mp = read_metapage(inode, addressDXD(&ji->ea), + lengthDXD(&ji->ea) << sb->s_blocksize_bits, + 1); + if (ea_buf->mp == NULL) { + rc = -EIO; + goto clean_up; + } + ea_buf->xattr = ea_buf->mp->data; + ea_buf->max_size = (ea_size + sb->s_blocksize - 1) & + ~(sb->s_blocksize - 1); + + size_check: + if (EALIST_SIZE(ea_buf->xattr) != ea_size) { + printk(KERN_ERR "ea_get: invalid extended attribute\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, + ea_buf->xattr, ea_size, 1); + ea_release(inode, ea_buf); + rc = -EIO; + goto clean_up; + } + + return ea_size; + + clean_up: + /* Rollback quota allocation */ + if (quota_allocation) + dquot_free_block(inode, quota_allocation); + + return (rc); +} + +static void ea_release(struct inode *inode, struct ea_buffer *ea_buf) +{ + if (ea_buf->flag & EA_MALLOC) + kfree(ea_buf->xattr); + else if (ea_buf->flag & EA_EXTENT) { + assert(ea_buf->mp); + release_metapage(ea_buf->mp); + + if (ea_buf->flag & EA_NEW) + dbFree(inode, addressDXD(&ea_buf->new_ea), + lengthDXD(&ea_buf->new_ea)); + } +} + +static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, + int new_size) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + unsigned long old_blocks, new_blocks; + int rc = 0; + + if (new_size == 0) { + ea_release(inode, ea_buf); + ea_buf = NULL; + } else if (ea_buf->flag & EA_INLINE) { + assert(new_size <= sizeof (ji->i_inline_ea)); + ji->mode2 &= ~INLINEEA; + ea_buf->new_ea.flag = DXD_INLINE; + DXDsize(&ea_buf->new_ea, new_size); + DXDaddress(&ea_buf->new_ea, 0); + DXDlength(&ea_buf->new_ea, 0); + } else if (ea_buf->flag & EA_MALLOC) { + rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea); + kfree(ea_buf->xattr); + } else if (ea_buf->flag & EA_NEW) { + /* We have already allocated a new dxd */ + flush_metapage(ea_buf->mp); + } else { + /* ->xattr must point to original ea's metapage */ + rc = ea_write(inode, ea_buf->xattr, new_size, &ea_buf->new_ea); + discard_metapage(ea_buf->mp); + } + if (rc) + return rc; + + old_blocks = new_blocks = 0; + + if (ji->ea.flag & DXD_EXTENT) { + invalidate_dxd_metapages(inode, ji->ea); + old_blocks = lengthDXD(&ji->ea); + } + + if (ea_buf) { + txEA(tid, inode, &ji->ea, &ea_buf->new_ea); + if (ea_buf->new_ea.flag & DXD_EXTENT) { + new_blocks = lengthDXD(&ea_buf->new_ea); + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + } + ji->ea = ea_buf->new_ea; + } else { + txEA(tid, inode, &ji->ea, NULL); + if (ji->ea.flag & DXD_INLINE) + ji->mode2 |= INLINEEA; + ji->ea.flag = 0; + ji->ea.size = 0; + } + + /* If old blocks exist, they must be removed from quota allocation. */ + if (old_blocks) + dquot_free_block(inode, old_blocks); + + inode->i_ctime = current_time(inode); + + return 0; +} + +int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, + const void *value, size_t value_len, int flags) +{ + struct jfs_ea_list *ealist; + struct jfs_ea *ea, *old_ea = NULL, *next_ea = NULL; + struct ea_buffer ea_buf; + int old_ea_size = 0; + int xattr_size; + int new_size; + int namelen = strlen(name); + int found = 0; + int rc; + int length; + + down_write(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + if (xattr_size < 0) { + rc = xattr_size; + goto out; + } + + again: + ealist = (struct jfs_ea_list *) ea_buf.xattr; + new_size = sizeof (struct jfs_ea_list); + + if (xattr_size) { + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); + ea = NEXT_EA(ea)) { + if ((namelen == ea->namelen) && + (memcmp(name, ea->name, namelen) == 0)) { + found = 1; + if (flags & XATTR_CREATE) { + rc = -EEXIST; + goto release; + } + old_ea = ea; + old_ea_size = EA_SIZE(ea); + next_ea = NEXT_EA(ea); + } else + new_size += EA_SIZE(ea); + } + } + + if (!found) { + if (flags & XATTR_REPLACE) { + rc = -ENODATA; + goto release; + } + if (value == NULL) { + rc = 0; + goto release; + } + } + if (value) + new_size += sizeof (struct jfs_ea) + namelen + 1 + value_len; + + if (new_size > ea_buf.max_size) { + /* + * We need to allocate more space for merged ea list. + * We should only have loop to again: once. + */ + ea_release(inode, &ea_buf); + xattr_size = ea_get(inode, &ea_buf, new_size); + if (xattr_size < 0) { + rc = xattr_size; + goto out; + } + goto again; + } + + /* Remove old ea of the same name */ + if (found) { + /* number of bytes following target EA */ + length = (char *) END_EALIST(ealist) - (char *) next_ea; + if (length > 0) + memmove(old_ea, next_ea, length); + xattr_size -= old_ea_size; + } + + /* Add new entry to the end */ + if (value) { + if (xattr_size == 0) + /* Completely new ea list */ + xattr_size = sizeof (struct jfs_ea_list); + + /* + * The size of EA value is limitted by on-disk format up to + * __le16, there would be an overflow if the size is equal + * to XATTR_SIZE_MAX (65536). In order to avoid this issue, + * we can pre-checkup the value size against USHRT_MAX, and + * return -E2BIG in this case, which is consistent with the + * VFS setxattr interface. + */ + if (value_len >= USHRT_MAX) { + rc = -E2BIG; + goto release; + } + + ea = (struct jfs_ea *) ((char *) ealist + xattr_size); + ea->flag = 0; + ea->namelen = namelen; + ea->valuelen = (cpu_to_le16(value_len)); + memcpy(ea->name, name, namelen); + ea->name[namelen] = 0; + if (value_len) + memcpy(&ea->name[namelen + 1], value, value_len); + xattr_size += EA_SIZE(ea); + } + + /* DEBUG - If we did this right, these number match */ + if (xattr_size != new_size) { + printk(KERN_ERR + "__jfs_setxattr: xattr_size = %d, new_size = %d\n", + xattr_size, new_size); + + rc = -EINVAL; + goto release; + } + + /* + * If we're left with an empty list, there's no ea + */ + if (new_size == sizeof (struct jfs_ea_list)) + new_size = 0; + + ealist->size = cpu_to_le32(new_size); + + rc = ea_put(tid, inode, &ea_buf, new_size); + + goto out; + release: + ea_release(inode, &ea_buf); + out: + up_write(&JFS_IP(inode)->xattr_sem); + + return rc; +} + +ssize_t __jfs_getxattr(struct inode *inode, const char *name, void *data, + size_t buf_size) +{ + struct jfs_ea_list *ealist; + struct jfs_ea *ea; + struct ea_buffer ea_buf; + int xattr_size; + ssize_t size; + int namelen = strlen(name); + char *value; + + down_read(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + + if (xattr_size < 0) { + size = xattr_size; + goto out; + } + + if (xattr_size == 0) + goto not_found; + + ealist = (struct jfs_ea_list *) ea_buf.xattr; + + /* Find the named attribute */ + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) + if ((namelen == ea->namelen) && + memcmp(name, ea->name, namelen) == 0) { + /* Found it */ + size = le16_to_cpu(ea->valuelen); + if (!data) + goto release; + else if (size > buf_size) { + size = -ERANGE; + goto release; + } + value = ((char *) &ea->name) + ea->namelen + 1; + memcpy(data, value, size); + goto release; + } + not_found: + size = -ENODATA; + release: + ea_release(inode, &ea_buf); + out: + up_read(&JFS_IP(inode)->xattr_sem); + + return size; +} + +/* + * No special permissions are needed to list attributes except for trusted.* + */ +static inline int can_list(struct jfs_ea *ea) +{ + return (strncmp(ea->name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) || + capable(CAP_SYS_ADMIN)); +} + +ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) +{ + struct inode *inode = d_inode(dentry); + char *buffer; + ssize_t size = 0; + int xattr_size; + struct jfs_ea_list *ealist; + struct jfs_ea *ea; + struct ea_buffer ea_buf; + + down_read(&JFS_IP(inode)->xattr_sem); + + xattr_size = ea_get(inode, &ea_buf, 0); + if (xattr_size < 0) { + size = xattr_size; + goto out; + } + + if (xattr_size == 0) + goto release; + + ealist = (struct jfs_ea_list *) ea_buf.xattr; + + /* compute required size of list */ + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { + if (can_list(ea)) + size += name_size(ea) + 1; + } + + if (!data) + goto release; + + if (size > buf_size) { + size = -ERANGE; + goto release; + } + + /* Copy attribute names to buffer */ + buffer = data; + for (ea = FIRST_EA(ealist); ea < END_EALIST(ealist); ea = NEXT_EA(ea)) { + if (can_list(ea)) { + int namelen = copy_name(buffer, ea); + buffer += namelen + 1; + } + } + + release: + ea_release(inode, &ea_buf); + out: + up_read(&JFS_IP(inode)->xattr_sem); + return size; +} + +static int __jfs_xattr_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct jfs_inode_info *ji = JFS_IP(inode); + tid_t tid; + int rc; + + tid = txBegin(inode->i_sb, 0); + mutex_lock(&ji->commit_mutex); + rc = __jfs_setxattr(tid, inode, name, value, size, flags); + if (!rc) + rc = txCommit(tid, 1, &inode, 0); + txEnd(tid); + mutex_unlock(&ji->commit_mutex); + + return rc; +} + +static int jfs_xattr_get(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) +{ + name = xattr_full_name(handler, name); + return __jfs_getxattr(inode, name, value, size); +} + +static int jfs_xattr_set(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + name = xattr_full_name(handler, name); + return __jfs_xattr_set(inode, name, value, size, flags); +} + +static int jfs_xattr_get_os2(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, void *value, size_t size) +{ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + return __jfs_getxattr(inode, name, value, size); +} + +static int jfs_xattr_set_os2(const struct xattr_handler *handler, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + if (is_known_namespace(name)) + return -EOPNOTSUPP; + return __jfs_xattr_set(inode, name, value, size, flags); +} + +static const struct xattr_handler jfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = jfs_xattr_get, + .set = jfs_xattr_set, +}; + +static const struct xattr_handler jfs_os2_xattr_handler = { + .prefix = XATTR_OS2_PREFIX, + .get = jfs_xattr_get_os2, + .set = jfs_xattr_set_os2, +}; + +static const struct xattr_handler jfs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = jfs_xattr_get, + .set = jfs_xattr_set, +}; + +static const struct xattr_handler jfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = jfs_xattr_get, + .set = jfs_xattr_set, +}; + +const struct xattr_handler *jfs_xattr_handlers[] = { +#ifdef CONFIG_JFS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &jfs_os2_xattr_handler, + &jfs_user_xattr_handler, + &jfs_security_xattr_handler, + &jfs_trusted_xattr_handler, + NULL, +}; + + +#ifdef CONFIG_JFS_SECURITY +static int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + tid_t *tid = fs_info; + char *name; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + name = kmalloc(XATTR_SECURITY_PREFIX_LEN + + strlen(xattr->name) + 1, GFP_NOFS); + if (!name) { + err = -ENOMEM; + break; + } + strcpy(name, XATTR_SECURITY_PREFIX); + strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + + err = __jfs_setxattr(*tid, inode, name, + xattr->value, xattr->value_len, 0); + kfree(name); + if (err < 0) + break; + } + return err; +} + +int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir, + const struct qstr *qstr) +{ + return security_inode_init_security(inode, dir, qstr, + &jfs_initxattrs, &tid); +} +#endif |