Diffstat (limited to 'fs/gfs2')
47 files changed, 33074 insertions, 0 deletions
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig new file mode 100644 index 0000000000..be7f87a8e1 --- /dev/null +++ b/fs/gfs2/Kconfig @@ -0,0 +1,37 @@ +# SPDX-License-Identifier: GPL-2.0-only +config GFS2_FS + tristate "GFS2 file system support" + select BUFFER_HEAD + select FS_POSIX_ACL + select CRC32 + select LIBCRC32C + select QUOTACTL + select FS_IOMAP + help + A cluster filesystem. + + Allows a cluster of computers to simultaneously use a block device + that is shared between them (with FC, iSCSI, NBD, etc...). GFS reads + and writes to the block device like a local filesystem, but also uses + a lock module to allow the computers to coordinate their I/O so + filesystem consistency is maintained. One of the nifty features of + GFS is perfect consistency -- changes made to the filesystem on one + machine show up immediately on all other machines in the cluster. + + To use the GFS2 filesystem in a cluster, you will need to enable + the locking module below. Documentation and utilities for GFS2 can + be found here: http://sources.redhat.com/cluster + + The "nolock" lock module is now built into GFS2 by default. If + you want to use the DLM, be sure to enable IPv4/6 networking. + +config GFS2_FS_LOCKING_DLM + bool "GFS2 DLM locking" + depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ + CONFIGFS_FS && SYSFS && (DLM=y || DLM=GFS2_FS) + help + Multiple node locking module for GFS2 + + Most users of GFS2 will require this. It provides the locking + interface between GFS2 and the DLM, which is required to use GFS2 + in a cluster environment. diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile new file mode 100644 index 0000000000..41b2aa4bc3 --- /dev/null +++ b/fs/gfs2/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +ccflags-y := -I$(src) +obj-$(CONFIG_GFS2_FS) += gfs2.o +gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ + glops.o log.o lops.o main.o meta_io.o \ + aops.o dentry.o export.o file.o \ + ops_fstype.o inode.o quota.o \ + recovery.o rgrp.o super.o sys.o trans.o util.o + +gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o + diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c new file mode 100644 index 0000000000..443640e6fb --- /dev/null +++ b/fs/gfs2/acl.c @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> +#include <linux/gfs2_ondisk.h> + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +static const char *gfs2_acl_name(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return XATTR_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return XATTR_POSIX_ACL_DEFAULT; + } + return NULL; +} + +static struct posix_acl *__gfs2_get_acl(struct inode *inode, int type) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct posix_acl *acl; + const char *name; + char *data; + int len; + + if (!ip->i_eattr) + return NULL; + + name = gfs2_acl_name(type); + len = gfs2_xattr_acl_get(ip, name, &data); + if (len <= 0) + return ERR_PTR(len); + acl = posix_acl_from_xattr(&init_user_ns, data, len); + kfree(data); + return acl; +} + +struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + bool need_unlock = false; + struct posix_acl *acl; + + if (rcu) + return ERR_PTR(-ECHILD); + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + int ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_ANY, &gh); + if (ret) + return ERR_PTR(ret); + need_unlock = true; + } + acl = __gfs2_get_acl(inode, type); + if (need_unlock) + gfs2_glock_dq_uninit(&gh); + return acl; +} + +int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int error; + size_t len; + char *data; + const char *name = gfs2_acl_name(type); + + if (acl) { + len = posix_acl_xattr_size(acl->a_count); + data = kmalloc(len, GFP_NOFS); + if (data == NULL) + return -ENOMEM; + error = posix_acl_to_xattr(&init_user_ns, acl, data, len); + if (error < 0) + goto out; + } else { + data = NULL; + len = 0; + } + + error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); + if (error) + goto out; + set_cached_acl(inode, type, acl); +out: + kfree(data); + return error; +} + +int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type) +{ + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + bool need_unlock = false; + int ret; + umode_t mode; + + if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + return -E2BIG; + + ret = gfs2_qa_get(ip); + if (ret) + return ret; + + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto out; + need_unlock = true; + } + + mode = inode->i_mode; + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(&nop_mnt_idmap, inode, &mode, &acl); + if (ret) + goto unlock; + } + + ret = __gfs2_set_acl(inode, acl, type); + if (!ret && mode != inode->i_mode) { + inode_set_ctime_current(inode); + inode->i_mode = mode; + mark_inode_dirty(inode); + } +unlock: + if (need_unlock) + gfs2_glock_dq_uninit(&gh); +out: + gfs2_qa_put(ip); + return ret; +} diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h new file mode 100644 index 0000000000..d4deb2b199 --- /dev/null +++ b/fs/gfs2/acl.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +#ifndef __ACL_DOT_H__ +#define __ACL_DOT_H__ + +#include "incore.h" + +#define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) + +extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); +extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type); + +#endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c new file mode 100644 index 0000000000..c26d48355c --- /dev/null +++ b/fs/gfs2/aops.c @@ -0,0 +1,777 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/pagemap.h> +#include <linux/pagevec.h> +#include <linux/mpage.h> +#include <linux/fs.h> +#include <linux/writeback.h> +#include <linux/swap.h> +#include <linux/gfs2_ondisk.h> +#include <linux/backing-dev.h> +#include <linux/uio.h> +#include <trace/events/writeback.h> +#include <linux/sched/signal.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "trans.h" +#include "rgrp.h" +#include "super.h" +#include "util.h" +#include "glops.h" +#include "aops.h" + + +void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, + size_t from, size_t len) +{ + struct buffer_head *head = folio_buffers(folio); + unsigned int bsize = head->b_size; + struct buffer_head *bh; + size_t to = from + len; + size_t start, end; + + for (bh = head, start = 0; bh != head || !start; + bh = bh->b_this_page, start = end) { + end = start + bsize; + if (end <= from) + continue; + if (start >= to) + break; + set_buffer_uptodate(bh); + gfs2_trans_add_data(ip->i_gl, bh); + } +} + +/** + * gfs2_get_block_noalloc - Fills in a buffer head with details about a block + * @inode: The inode + * @lblock: The block number to look up + * @bh_result: The buffer head to return the result in + * @create: Non-zero if we may add block to the file + * + * Returns: errno + */ + +static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock, + struct buffer_head *bh_result, int create) +{ + int error; + + error = gfs2_block_map(inode, lblock, bh_result, 0); + if (error) + return error; + if (!buffer_mapped(bh_result)) + return -ENODATA; + return 0; +} + +/** + * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_page + * @folio: The folio to write + * @wbc: The writeback control + * + * This is the same as calling block_write_full_page, but it also + * writes pages outside of i_size + */ +static int gfs2_write_jdata_folio(struct folio *folio, + struct writeback_control *wbc) +{ + struct inode * const inode = folio->mapping->host; + loff_t i_size = i_size_read(inode); + + /* + * The folio straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
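+ *
+ * A worked example (illustrative only, assuming 4 KiB pages and a
+ * single-page folio): with i_size = 5000, the folio at index 1 spans
+ * file bytes 4096..8191, so folio_pos(folio) = 4096 < 5000 < 8192.
+ * offset_in_folio(folio, i_size) is then 904, and the call below
+ * zeroes folio bytes 904..4095, i.e. the mmap-visible tail past EOF.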
+ */ + if (folio_pos(folio) < i_size && + i_size < folio_pos(folio) + folio_size(folio)) + folio_zero_segment(folio, offset_in_folio(folio, i_size), + folio_size(folio)); + + return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc, + wbc, end_buffer_async_write); +} + +/** + * __gfs2_jdata_write_folio - The core of jdata writepage + * @folio: The folio to write + * @wbc: The writeback control + * + * This is shared between writepage and writepages and implements the + * core of the writepage operation. If a transaction is required then + * the checked flag will have been set and the transaction will have + * already been started before this is called. + */ +static int __gfs2_jdata_write_folio(struct folio *folio, + struct writeback_control *wbc) +{ + struct inode *inode = folio->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + + if (folio_test_checked(folio)) { + folio_clear_checked(folio); + if (!folio_buffers(folio)) { + folio_create_empty_buffers(folio, + inode->i_sb->s_blocksize, + BIT(BH_Dirty)|BIT(BH_Uptodate)); + } + gfs2_trans_add_databufs(ip, folio, 0, folio_size(folio)); + } + return gfs2_write_jdata_folio(folio, wbc); +} + +/** + * gfs2_jdata_writepage - Write complete page + * @page: Page to write + * @wbc: The writeback control + * + * Returns: errno + * + */ + +static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc) +{ + struct folio *folio = page_folio(page); + struct inode *inode = page->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) + goto out; + if (folio_test_checked(folio) || current->journal_info) + goto out_ignore; + return __gfs2_jdata_write_folio(folio, wbc); + +out_ignore: + folio_redirty_for_writepage(wbc, folio); +out: + folio_unlock(folio); + return 0; +} + +/** + * gfs2_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: Write-back control + * + * Used for both ordered and writeback modes. + */ +static int gfs2_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + struct iomap_writepage_ctx wpc = { }; + int ret; + + /* + * Even if we didn't write enough pages here, we might still be holding + * dirty pages in the ail. We forcibly flush the ail because we don't + * want balance_dirty_pages() to loop indefinitely trying to write out + * pages held in the ail that it can't find. 
+ */ + ret = iomap_writepages(mapping, wbc, &wpc, &gfs2_writeback_ops); + if (ret == 0 && wbc->nr_to_write > 0) + set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); + return ret; +} + +/** + * gfs2_write_jdata_batch - Write back a folio batch's worth of folios + * @mapping: The mapping + * @wbc: The writeback control + * @fbatch: The batch of folios + * @done_index: Page index + * + * Returns: non-zero if loop should terminate, zero otherwise + */ + +static int gfs2_write_jdata_batch(struct address_space *mapping, + struct writeback_control *wbc, + struct folio_batch *fbatch, + pgoff_t *done_index) +{ + struct inode *inode = mapping->host; + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned nrblocks; + int i; + int ret; + int nr_pages = 0; + int nr_folios = folio_batch_count(fbatch); + + for (i = 0; i < nr_folios; i++) + nr_pages += folio_nr_pages(fbatch->folios[i]); + nrblocks = nr_pages * (PAGE_SIZE >> inode->i_blkbits); + + ret = gfs2_trans_begin(sdp, nrblocks, nrblocks); + if (ret < 0) + return ret; + + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch->folios[i]; + + *done_index = folio->index; + + folio_lock(folio); + + if (unlikely(folio->mapping != mapping)) { +continue_unlock: + folio_unlock(folio); + continue; + } + + if (!folio_test_dirty(folio)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (folio_test_writeback(folio)) { + if (wbc->sync_mode != WB_SYNC_NONE) + folio_wait_writeback(folio); + else + goto continue_unlock; + } + + BUG_ON(folio_test_writeback(folio)); + if (!folio_clear_dirty_for_io(folio)) + goto continue_unlock; + + trace_wbc_writepage(wbc, inode_to_bdi(inode)); + + ret = __gfs2_jdata_write_folio(folio, wbc); + if (unlikely(ret)) { + if (ret == AOP_WRITEPAGE_ACTIVATE) { + folio_unlock(folio); + ret = 0; + } else { + + /* + * done_index is set past this page, + * so media errors will not choke + * background writeout for the entire + * file. This has consequences for + * range_cyclic semantics (ie. it may + * not be suitable for data integrity + * writeout). + */ + *done_index = folio_next_index(folio); + ret = 1; + break; + } + } + + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (--wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) { + ret = 1; + break; + } + + } + gfs2_trans_end(sdp); + return ret; +} + +/** + * gfs2_write_cache_jdata - Like write_cache_pages but different + * @mapping: The mapping to write + * @wbc: The writeback control + * + * The reason that we use our own function here is that we need to + * start transactions before we grab page locks. This allows us + * to get the ordering right. 
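+ *
+ * A sketch of the resulting order of operations (this only summarises
+ * what gfs2_write_jdata_batch() above already does for each batch of
+ * folios; it is not additional code):
+ *
+ *	gfs2_trans_begin(sdp, nrblocks, nrblocks);
+ *	for each folio in the batch:
+ *		folio_lock(folio);
+ *		__gfs2_jdata_write_folio(folio, wbc);
+ *	gfs2_trans_end(sdp);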
+ */ + +static int gfs2_write_cache_jdata(struct address_space *mapping, + struct writeback_control *wbc) +{ + int ret = 0; + int done = 0; + struct folio_batch fbatch; + int nr_folios; + pgoff_t writeback_index; + pgoff_t index; + pgoff_t end; + pgoff_t done_index; + int cycled; + int range_whole = 0; + xa_mark_t tag; + + folio_batch_init(&fbatch); + if (wbc->range_cyclic) { + writeback_index = mapping->writeback_index; /* prev offset */ + index = writeback_index; + if (index == 0) + cycled = 1; + else + cycled = 0; + end = -1; + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + cycled = 1; /* ignore range_cyclic tests */ + } + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + +retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, index, end); + done_index = index; + while (!done && (index <= end)) { + nr_folios = filemap_get_folios_tag(mapping, &index, end, + tag, &fbatch); + if (nr_folios == 0) + break; + + ret = gfs2_write_jdata_batch(mapping, wbc, &fbatch, + &done_index); + if (ret) + done = 1; + if (ret > 0) + ret = 0; + folio_batch_release(&fbatch); + cond_resched(); + } + + if (!cycled && !done) { + /* + * range_cyclic: + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + cycled = 1; + index = 0; + end = writeback_index - 1; + goto retry; + } + + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; + + return ret; +} + + +/** + * gfs2_jdata_writepages - Write a bunch of dirty pages back to disk + * @mapping: The mapping to write + * @wbc: The writeback control + * + */ + +static int gfs2_jdata_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + int ret; + + ret = gfs2_write_cache_jdata(mapping, wbc); + if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) { + gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_JDATA_WPAGES); + ret = gfs2_write_cache_jdata(mapping, wbc); + } + return ret; +} + +/** + * stuffed_readpage - Fill in a Linux page with stuffed file data + * @ip: the inode + * @page: the page + * + * Returns: errno + */ +static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *dibh; + u64 dsize = i_size_read(&ip->i_inode); + void *kaddr; + int error; + + /* + * Due to the order of unstuffing files and ->fault(), we can be + * asked for a zero page in the case of a stuffed file being extended, + * so we need to supply one here. It doesn't happen often. 
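+ *
+ * For example: a stuffed file keeps all of its data in the dinode
+ * block itself, and since the block size never exceeds PAGE_SIZE,
+ * only page 0 can contain real data; any page with a nonzero index
+ * lies entirely beyond the stuffed data and is correctly all zeroes.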
+ */ + if (unlikely(page->index)) { + zero_user(page, 0, PAGE_SIZE); + SetPageUptodate(page); + return 0; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + kaddr = kmap_local_page(page); + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_SIZE - dsize); + kunmap_local(kaddr); + flush_dcache_page(page); + brelse(dibh); + SetPageUptodate(page); + + return 0; +} + +/** + * gfs2_read_folio - read a folio from a file + * @file: The file to read + * @folio: The folio in the file + */ +static int gfs2_read_folio(struct file *file, struct folio *folio) +{ + struct inode *inode = folio->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + int error; + + if (!gfs2_is_jdata(ip) || + (i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) { + error = iomap_read_folio(folio, &gfs2_iomap_ops); + } else if (gfs2_is_stuffed(ip)) { + error = stuffed_readpage(ip, &folio->page); + folio_unlock(folio); + } else { + error = mpage_read_folio(folio, gfs2_block_map); + } + + if (unlikely(gfs2_withdrawn(sdp))) + return -EIO; + + return error; +} + +/** + * gfs2_internal_read - read an internal file + * @ip: The gfs2 inode + * @buf: The buffer to fill + * @pos: The file position + * @size: The amount to read + * + */ + +int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, + unsigned size) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + unsigned long index = *pos >> PAGE_SHIFT; + unsigned offset = *pos & (PAGE_SIZE - 1); + unsigned copied = 0; + unsigned amt; + struct page *page; + + do { + page = read_cache_page(mapping, index, gfs2_read_folio, NULL); + if (IS_ERR(page)) { + if (PTR_ERR(page) == -EINTR) + continue; + return PTR_ERR(page); + } + amt = size - copied; + if (offset + size > PAGE_SIZE) + amt = PAGE_SIZE - offset; + memcpy_from_page(buf + copied, page, offset, amt); + put_page(page); + copied += amt; + index++; + offset = 0; + } while(copied < size); + (*pos) += size; + return size; +} + +/** + * gfs2_readahead - Read a bunch of pages at once + * @rac: Read-ahead control structure + * + * Some notes: + * 1. This is only for readahead, so we can simply ignore anything + * that is slightly inconvenient (such as locking conflicts between + * the page lock and the glock) and return having done no I/O. It's + * obviously not something we'd want to do on too regular a basis. + * Any I/O we ignore at this time will be done via readpage later. + * 2. We don't handle stuffed files here; we let readpage do the honours. + * 3. mpage_readahead() does most of the heavy lifting in the common case. + * 4. gfs2_block_map() is relied upon to set BH_Boundary in the right places. 
+ */ + +static void gfs2_readahead(struct readahead_control *rac) +{ + struct inode *inode = rac->mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + + if (gfs2_is_stuffed(ip)) + ; + else if (gfs2_is_jdata(ip)) + mpage_readahead(rac, gfs2_block_map); + else + iomap_readahead(rac, &gfs2_iomap_ops); +} + +/** + * adjust_fs_space - Adjusts the free space available due to gfs2_grow + * @inode: the rindex inode + */ +void adjust_fs_space(struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh; + u64 fs_total, new_free; + + if (gfs2_trans_begin(sdp, 2 * RES_STATFS, 0) != 0) + return; + + /* Total up the file system space, according to the latest rindex. */ + fs_total = gfs2_ri_total(sdp); + if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + if (fs_total > (m_sc->sc_total + l_sc->sc_total)) + new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); + else + new_free = 0; + spin_unlock(&sdp->sd_statfs_spin); + fs_warn(sdp, "File system extended by %llu blocks.\n", + (unsigned long long)new_free); + gfs2_statfs_change(sdp, new_free, new_free, 0); + + update_statfs(sdp, m_bh); + brelse(m_bh); +out: + sdp->sd_rindex_uptodate = 0; + gfs2_trans_end(sdp); +} + +static bool jdata_dirty_folio(struct address_space *mapping, + struct folio *folio) +{ + if (current->journal_info) + folio_set_checked(folio); + return block_dirty_folio(mapping, folio); +} + +/** + * gfs2_bmap - Block map function + * @mapping: Address space info + * @lblock: The block to map + * + * Returns: The disk address for the block or 0 on hole or error + */ + +static sector_t gfs2_bmap(struct address_space *mapping, sector_t lblock) +{ + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_holder i_gh; + sector_t dblock = 0; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return 0; + + if (!gfs2_is_stuffed(ip)) + dblock = iomap_bmap(mapping, lblock, &gfs2_iomap_ops); + + gfs2_glock_dq_uninit(&i_gh); + + return dblock; +} + +static void gfs2_discard(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + lock_buffer(bh); + gfs2_log_lock(sdp); + clear_buffer_dirty(bh); + bd = bh->b_private; + if (bd) { + if (!list_empty(&bd->bd_list) && !buffer_pinned(bh)) + list_del_init(&bd->bd_list); + else { + spin_lock(&sdp->sd_ail_lock); + gfs2_remove_from_journal(bh, REMOVE_JDATA); + spin_unlock(&sdp->sd_ail_lock); + } + } + bh->b_bdev = NULL; + clear_buffer_mapped(bh); + clear_buffer_req(bh); + clear_buffer_new(bh); + gfs2_log_unlock(sdp); + unlock_buffer(bh); +} + +static void gfs2_invalidate_folio(struct folio *folio, size_t offset, + size_t length) +{ + struct gfs2_sbd *sdp = GFS2_SB(folio->mapping->host); + size_t stop = offset + length; + int partial_page = (offset || length < folio_size(folio)); + struct buffer_head *bh, *head; + unsigned long pos = 0; + + BUG_ON(!folio_test_locked(folio)); + if (!partial_page) + folio_clear_checked(folio); + head = folio_buffers(folio); + if (!head) + goto out; + + bh = head; + do { + if (pos + bh->b_size > stop) + return; + + if (offset <= pos) + gfs2_discard(sdp, bh); + pos += bh->b_size; + bh = bh->b_this_page; + } while (bh != head); +out: + if (!partial_page) + 
filemap_release_folio(folio, 0); +} + +/** + * gfs2_release_folio - free the metadata associated with a folio + * @folio: the folio that's being released + * @gfp_mask: passed from Linux VFS, ignored by us + * + * Calls try_to_free_buffers() to free the buffers and put the folio if the + * buffers can be released. + * + * Returns: true if the folio was put or else false + */ + +bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask) +{ + struct address_space *mapping = folio->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + struct buffer_head *bh, *head; + struct gfs2_bufdata *bd; + + head = folio_buffers(folio); + if (!head) + return false; + + /* + * mm accommodates an old ext3 case where clean folios might + * not have had the dirty bit cleared. Thus, it can send actual + * dirty folios to ->release_folio() via shrink_active_list(). + * + * As a workaround, we skip folios that contain dirty buffers + * below. Once ->release_folio isn't called on dirty folios + * anymore, we can warn on dirty buffers like we used to here + * again. + */ + + gfs2_log_lock(sdp); + bh = head; + do { + if (atomic_read(&bh->b_count)) + goto cannot_release; + bd = bh->b_private; + if (bd && bd->bd_tr) + goto cannot_release; + if (buffer_dirty(bh) || WARN_ON(buffer_pinned(bh))) + goto cannot_release; + bh = bh->b_this_page; + } while (bh != head); + + bh = head; + do { + bd = bh->b_private; + if (bd) { + gfs2_assert_warn(sdp, bd->bd_bh == bh); + bd->bd_bh = NULL; + bh->b_private = NULL; + /* + * The bd may still be queued as a revoke, in which + * case we must not dequeue nor free it. + */ + if (!bd->bd_blkno && !list_empty(&bd->bd_list)) + list_del_init(&bd->bd_list); + if (list_empty(&bd->bd_list)) + kmem_cache_free(gfs2_bufdata_cachep, bd); + } + + bh = bh->b_this_page; + } while (bh != head); + gfs2_log_unlock(sdp); + + return try_to_free_buffers(folio); + +cannot_release: + gfs2_log_unlock(sdp); + return false; +} + +static const struct address_space_operations gfs2_aops = { + .writepages = gfs2_writepages, + .read_folio = gfs2_read_folio, + .readahead = gfs2_readahead, + .dirty_folio = iomap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .bmap = gfs2_bmap, + .migrate_folio = filemap_migrate_folio, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +static const struct address_space_operations gfs2_jdata_aops = { + .writepage = gfs2_jdata_writepage, + .writepages = gfs2_jdata_writepages, + .read_folio = gfs2_read_folio, + .readahead = gfs2_readahead, + .dirty_folio = jdata_dirty_folio, + .bmap = gfs2_bmap, + .invalidate_folio = gfs2_invalidate_folio, + .release_folio = gfs2_release_folio, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +void gfs2_set_aops(struct inode *inode) +{ + if (gfs2_is_jdata(GFS2_I(inode))) + inode->i_mapping->a_ops = &gfs2_jdata_aops; + else + inode->i_mapping->a_ops = &gfs2_aops; +} diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h new file mode 100644 index 0000000000..f08322ef41 --- /dev/null +++ b/fs/gfs2/aops.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2018 Red Hat, Inc. All rights reserved. 
+ */ + +#ifndef __AOPS_DOT_H__ +#define __AOPS_DOT_H__ + +#include "incore.h" + +extern void adjust_fs_space(struct inode *inode); +extern void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, + size_t from, size_t len); + +#endif /* __AOPS_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c new file mode 100644 index 0000000000..ef7017fb69 --- /dev/null +++ b/fs/gfs2/bmap.c @@ -0,0 +1,2483 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/iomap.h> +#include <linux/ktime.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "log.h" +#include "super.h" +#include "trans.h" +#include "dir.h" +#include "util.h" +#include "aops.h" +#include "trace_gfs2.h" + +/* This doesn't need to be that large as max 64 bit pointers in a 4k + * block is 512, so __u16 is fine for that. It saves stack space to + * keep it small. + */ +struct metapath { + struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT]; + __u16 mp_list[GFS2_MAX_META_HEIGHT]; + int mp_fheight; /* find_metapath height */ + int mp_aheight; /* actual height (lookup height) */ +}; + +static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length); + +/** + * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page + * @ip: the inode + * @dibh: the dinode buffer + * @block: the block number that was allocated + * @page: The (optional) page. 
This is looked up if @page is NULL + * + * Returns: errno + */ + +static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, + u64 block, struct page *page) +{ + struct inode *inode = &ip->i_inode; + + if (!PageUptodate(page)) { + void *kaddr = kmap(page); + u64 dsize = i_size_read(inode); + + memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); + memset(kaddr + dsize, 0, PAGE_SIZE - dsize); + kunmap(page); + + SetPageUptodate(page); + } + + if (gfs2_is_jdata(ip)) { + struct buffer_head *bh; + + if (!page_has_buffers(page)) + create_empty_buffers(page, BIT(inode->i_blkbits), + BIT(BH_Uptodate)); + + bh = page_buffers(page); + if (!buffer_mapped(bh)) + map_bh(bh, inode->i_sb, block); + + set_buffer_uptodate(bh); + gfs2_trans_add_data(ip->i_gl, bh); + } else { + set_page_dirty(page); + gfs2_ordered_add_inode(ip); + } + + return 0; +} + +static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page) +{ + struct buffer_head *bh, *dibh; + struct gfs2_dinode *di; + u64 block = 0; + int isdir = gfs2_is_dir(ip); + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (i_size_read(&ip->i_inode)) { + /* Get a free block, fill it with the stuffed data, + and write it out to disk */ + + unsigned int n = 1; + error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL); + if (error) + goto out_brelse; + if (isdir) { + gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1); + error = gfs2_dir_get_new_buffer(ip, block, &bh); + if (error) + goto out_brelse; + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + brelse(bh); + } else { + error = gfs2_unstuffer_page(ip, dibh, block, page); + if (error) + goto out_brelse; + } + } + + /* Set up the pointer to the new block */ + + gfs2_trans_add_meta(ip->i_gl, dibh); + di = (struct gfs2_dinode *)dibh->b_data; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + if (i_size_read(&ip->i_inode)) { + *(__be64 *)(di + 1) = cpu_to_be64(block); + gfs2_add_inode_blocks(&ip->i_inode, 1); + di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); + } + + ip->i_height = 1; + di->di_height = cpu_to_be16(1); + +out_brelse: + brelse(dibh); + return error; +} + +/** + * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big + * @ip: The GFS2 inode to unstuff + * + * This routine unstuffs a dinode and returns it to a "normal" state such + * that the height can be grown in the traditional way. + * + * Returns: errno + */ + +int gfs2_unstuff_dinode(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct page *page; + int error; + + down_write(&ip->i_rw_mutex); + page = grab_cache_page(inode->i_mapping, 0); + error = -ENOMEM; + if (!page) + goto out; + error = __gfs2_unstuff_inode(ip, page); + unlock_page(page); + put_page(page); +out: + up_write(&ip->i_rw_mutex); + return error; +} + +/** + * find_metapath - Find path through the metadata tree + * @sdp: The superblock + * @block: The disk block to look up + * @mp: The metapath to return the result in + * @height: The pre-calculated height of the metadata tree + * + * This routine returns a struct metapath structure that defines a path + * through the metadata of inode "ip" to get to block "block". + * + * Example: + * Given: "ip" is a height 3 file, "offset" is 101342453, and this is a + * filesystem with a blocksize of 4096. 
+ * + * find_metapath() would return a struct metapath structure set to: + * mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165. + * + * That means that in order to get to the block containing the byte at + * offset 101342453, we would load the indirect block pointed to by pointer + * 0 in the dinode. We would then load the indirect block pointed to by + * pointer 48 in that indirect block. We would then load the data block + * pointed to by pointer 165 in that indirect block. + * + * ---------------------------------------- + * | Dinode | | + * | | 4| + * | |0 1 2 3 4 5 9| + * | | 6| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 5| + * | 4 4 4 4 4 5 5 1| + * |0 5 6 7 8 9 0 1 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Indirect Block | + * | 1 1 1 1 1 5| + * | 6 6 6 6 6 1| + * |0 3 4 5 6 7 2| + * ---------------------------------------- + * | + * | + * V + * ---------------------------------------- + * | Data block containing offset | + * | 101342453 | + * | | + * | | + * ---------------------------------------- + * + */ + +static void find_metapath(const struct gfs2_sbd *sdp, u64 block, + struct metapath *mp, unsigned int height) +{ + unsigned int i; + + mp->mp_fheight = height; + for (i = height; i--;) + mp->mp_list[i] = do_div(block, sdp->sd_inptrs); +} + +static inline unsigned int metapath_branch_start(const struct metapath *mp) +{ + if (mp->mp_list[0] == 0) + return 2; + return 1; +} + +/** + * metaptr1 - Return the first possible metadata pointer in a metapath buffer + * @height: The metadata height (0 = dinode) + * @mp: The metapath + */ +static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp) +{ + struct buffer_head *bh = mp->mp_bh[height]; + if (height == 0) + return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode))); + return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header))); +} + +/** + * metapointer - Return pointer to start of metadata in a buffer + * @height: The metadata height (0 = dinode) + * @mp: The metapath + * + * Return a pointer to the block number of the next height of the metadata + * tree given a buffer containing the pointer to the current height of the + * metadata tree. 
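+ *
+ * For example, with mp->mp_list[height] == 3, metapointer() yields the
+ * address of the fourth pointer in the buffer at that height:
+ * metaptr1() skips the gfs2_dinode or gfs2_meta_header at the start of
+ * the buffer, and mp_list[height] indexes into the pointer array.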
+ */ + +static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) +{ + __be64 *p = metaptr1(height, mp); + return p + mp->mp_list[height]; +} + +static inline const __be64 *metaend(unsigned int height, const struct metapath *mp) +{ + const struct buffer_head *bh = mp->mp_bh[height]; + return (const __be64 *)(bh->b_data + bh->b_size); +} + +static void clone_metapath(struct metapath *clone, struct metapath *mp) +{ + unsigned int hgt; + + *clone = *mp; + for (hgt = 0; hgt < mp->mp_aheight; hgt++) + get_bh(clone->mp_bh[hgt]); +} + +static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end) +{ + const __be64 *t; + + for (t = start; t < end; t++) { + struct buffer_head *rabh; + + if (!*t) + continue; + + rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE); + if (trylock_buffer(rabh)) { + if (!buffer_uptodate(rabh)) { + rabh->b_end_io = end_buffer_read_sync; + submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META | + REQ_PRIO, rabh); + continue; + } + unlock_buffer(rabh); + } + brelse(rabh); + } +} + +static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, + unsigned int x, unsigned int h) +{ + for (; x < h; x++) { + __be64 *ptr = metapointer(x, mp); + u64 dblock = be64_to_cpu(*ptr); + int ret; + + if (!dblock) + break; + ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]); + if (ret) + return ret; + } + mp->mp_aheight = x + 1; + return 0; +} + +/** + * lookup_metapath - Walk the metadata tree to a specific point + * @ip: The inode + * @mp: The metapath + * + * Assumes that the inode's buffer has already been looked up and + * hooked onto mp->mp_bh[0] and that the metapath has been initialised + * by find_metapath(). + * + * If this function encounters part of the tree which has not been + * allocated, it returns the current height of the tree at the point + * at which it found the unallocated block. Blocks which are found are + * added to the mp->mp_bh[] list. + * + * Returns: error + */ + +static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) +{ + return __fillup_metapath(ip, mp, 0, ip->i_height - 1); +} + +/** + * fillup_metapath - fill up buffers for the metadata path to a specific height + * @ip: The inode + * @mp: The metapath + * @h: The height to which it should be mapped + * + * Similar to lookup_metapath, but does lookups for a range of heights + * + * Returns: error or the number of buffers filled + */ + +static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h) +{ + unsigned int x = 0; + int ret; + + if (h) { + /* find the first buffer we need to look up. 
*/ + for (x = h - 1; x > 0; x--) { + if (mp->mp_bh[x]) + break; + } + } + ret = __fillup_metapath(ip, mp, x, h); + if (ret) + return ret; + return mp->mp_aheight - x - 1; +} + +static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp) +{ + sector_t factor = 1, block = 0; + int hgt; + + for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) { + if (hgt < mp->mp_aheight) + block += mp->mp_list[hgt] * factor; + factor *= sdp->sd_inptrs; + } + return block; +} + +static void release_metapath(struct metapath *mp) +{ + int i; + + for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + mp->mp_bh[i] = NULL; + } +} + +/** + * gfs2_extent_length - Returns length of an extent of blocks + * @bh: The metadata block + * @ptr: Current position in @bh + * @limit: Max extent length to return + * @eob: Set to 1 if we hit "end of block" + * + * Returns: The length of the extent (minimum of one block) + */ + +static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob) +{ + const __be64 *end = (__be64 *)(bh->b_data + bh->b_size); + const __be64 *first = ptr; + u64 d = be64_to_cpu(*ptr); + + *eob = 0; + do { + ptr++; + if (ptr >= end) + break; + d++; + } while(be64_to_cpu(*ptr) == d); + if (ptr >= end) + *eob = 1; + return ptr - first; +} + +enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE }; + +/* + * gfs2_metadata_walker - walk an indirect block + * @mp: Metapath to indirect block + * @ptrs: Number of pointers to look at + * + * When returning WALK_FOLLOW, the walker must update @mp to point at the right + * indirect block to follow. + */ +typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp, + unsigned int ptrs); + +/* + * gfs2_walk_metadata - walk a tree of indirect blocks + * @inode: The inode + * @mp: Starting point of walk + * @max_len: Maximum number of blocks to walk + * @walker: Called during the walk + * + * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or + * past the end of metadata, and a negative error code otherwise. + */ + +static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp, + u64 max_len, gfs2_metadata_walker walker) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 factor = 1; + unsigned int hgt; + int ret; + + /* + * The walk starts in the lowest allocated indirect block, which may be + * before the position indicated by @mp. Adjust @max_len accordingly + * to avoid a short walk. + */ + for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) { + max_len += mp->mp_list[hgt] * factor; + mp->mp_list[hgt] = 0; + factor *= sdp->sd_inptrs; + } + + for (;;) { + u16 start = mp->mp_list[hgt]; + enum walker_status status; + unsigned int ptrs; + u64 len; + + /* Walk indirect block. */ + ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start; + len = ptrs * factor; + if (len > max_len) + ptrs = DIV_ROUND_UP_ULL(max_len, factor); + status = walker(mp, ptrs); + switch (status) { + case WALK_STOP: + return 1; + case WALK_FOLLOW: + BUG_ON(mp->mp_aheight == mp->mp_fheight); + ptrs = mp->mp_list[hgt] - start; + len = ptrs * factor; + break; + case WALK_CONTINUE: + break; + } + if (len >= max_len) + break; + max_len -= len; + if (status == WALK_FOLLOW) + goto fill_up_metapath; + +lower_metapath: + /* Decrease height of metapath. 
*/ + brelse(mp->mp_bh[hgt]); + mp->mp_bh[hgt] = NULL; + mp->mp_list[hgt] = 0; + if (!hgt) + break; + hgt--; + factor *= sdp->sd_inptrs; + + /* Advance in metadata tree. */ + (mp->mp_list[hgt])++; + if (hgt) { + if (mp->mp_list[hgt] >= sdp->sd_inptrs) + goto lower_metapath; + } else { + if (mp->mp_list[hgt] >= sdp->sd_diptrs) + break; + } + +fill_up_metapath: + /* Increase height of metapath. */ + ret = fillup_metapath(ip, mp, ip->i_height - 1); + if (ret < 0) + return ret; + hgt += ret; + for (; ret; ret--) + do_div(factor, sdp->sd_inptrs); + mp->mp_aheight = hgt + 1; + } + return 0; +} + +static enum walker_status gfs2_hole_walker(struct metapath *mp, + unsigned int ptrs) +{ + const __be64 *start, *ptr, *end; + unsigned int hgt; + + hgt = mp->mp_aheight - 1; + start = metapointer(hgt, mp); + end = start + ptrs; + + for (ptr = start; ptr < end; ptr++) { + if (*ptr) { + mp->mp_list[hgt] += ptr - start; + if (mp->mp_aheight == mp->mp_fheight) + return WALK_STOP; + return WALK_FOLLOW; + } + } + return WALK_CONTINUE; +} + +/** + * gfs2_hole_size - figure out the size of a hole + * @inode: The inode + * @lblock: The logical starting block number + * @len: How far to look (in blocks) + * @mp: The metapath at lblock + * @iomap: The iomap to store the hole size in + * + * This function modifies @mp. + * + * Returns: errno on error + */ +static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len, + struct metapath *mp, struct iomap *iomap) +{ + struct metapath clone; + u64 hole_size; + int ret; + + clone_metapath(&clone, mp); + ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker); + if (ret < 0) + goto out; + + if (ret == 1) + hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock; + else + hole_size = len; + iomap->length = hole_size << inode->i_blkbits; + ret = 0; + +out: + release_metapath(&clone); + return ret; +} + +static inline void gfs2_indirect_init(struct metapath *mp, + struct gfs2_glock *gl, unsigned int i, + unsigned offset, u64 bn) +{ + __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data + + ((i > 1) ? sizeof(struct gfs2_meta_header) : + sizeof(struct gfs2_dinode))); + BUG_ON(i < 1); + BUG_ON(mp->mp_bh[i] != NULL); + mp->mp_bh[i] = gfs2_meta_new(gl, bn); + gfs2_trans_add_meta(gl, mp->mp_bh[i]); + gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); + gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); + ptr += offset; + *ptr = cpu_to_be64(bn); +} + +enum alloc_state { + ALLOC_DATA = 0, + ALLOC_GROW_DEPTH = 1, + ALLOC_GROW_HEIGHT = 2, + /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ +}; + +/** + * __gfs2_iomap_alloc - Build a metadata tree of the requested height + * @inode: The GFS2 inode + * @iomap: The iomap structure + * @mp: The metapath, with proper height information calculated + * + * In this routine we may have to alloc: + * i) Indirect blocks to grow the metadata tree height + * ii) Indirect blocks to fill in lower part of the metadata tree + * iii) Data blocks + * + * This function is called after __gfs2_iomap_get, which works out the + * total number of blocks which we need via gfs2_alloc_size. + * + * We then do the actual allocation asking for an extent at a time (if + * enough contiguous free blocks are available, there will only be one + * allocation request per call) and uses the state machine to initialise + * the blocks in order. + * + * Right now, this function will allocate at most one indirect block + * worth of data -- with a default block size of 4K, that's slightly + * less than 2M. 
If this limitation is ever removed to allow huge + * allocations, we would probably still want to limit the iomap size we + * return to avoid stalling other tasks during huge writes; the next + * iomap iteration would then find the blocks already allocated. + * + * Returns: errno on error + */ + +static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap, + struct metapath *mp) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh = mp->mp_bh[0]; + u64 bn; + unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0; + size_t dblks = iomap->length >> inode->i_blkbits; + const unsigned end_of_metadata = mp->mp_fheight - 1; + int ret; + enum alloc_state state; + __be64 *ptr; + __be64 zero_bn = 0; + + BUG_ON(mp->mp_aheight < 1); + BUG_ON(dibh == NULL); + BUG_ON(dblks < 1); + + gfs2_trans_add_meta(ip->i_gl, dibh); + + down_write(&ip->i_rw_mutex); + + if (mp->mp_fheight == mp->mp_aheight) { + /* Bottom indirect block exists */ + state = ALLOC_DATA; + } else { + /* Need to allocate indirect blocks */ + if (mp->mp_fheight == ip->i_height) { + /* Writing into existing tree, extend tree down */ + iblks = mp->mp_fheight - mp->mp_aheight; + state = ALLOC_GROW_DEPTH; + } else { + /* Building up tree height */ + state = ALLOC_GROW_HEIGHT; + iblks = mp->mp_fheight - ip->i_height; + branch_start = metapath_branch_start(mp); + iblks += (mp->mp_fheight - branch_start); + } + } + + /* start of the second part of the function (state machine) */ + + blks = dblks + iblks; + i = mp->mp_aheight; + do { + n = blks - alloced; + ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + if (ret) + goto out; + alloced += n; + if (state != ALLOC_DATA || gfs2_is_jdata(ip)) + gfs2_trans_remove_revoke(sdp, bn, n); + switch (state) { + /* Growing height of tree */ + case ALLOC_GROW_HEIGHT: + if (i == 1) { + ptr = (__be64 *)(dibh->b_data + + sizeof(struct gfs2_dinode)); + zero_bn = *ptr; + } + for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0; + i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++); + if (i - 1 == mp->mp_fheight - ip->i_height) { + i--; + gfs2_buffer_copy_tail(mp->mp_bh[i], + sizeof(struct gfs2_meta_header), + dibh, sizeof(struct gfs2_dinode)); + gfs2_buffer_clear_tail(dibh, + sizeof(struct gfs2_dinode) + + sizeof(__be64)); + ptr = (__be64 *)(mp->mp_bh[i]->b_data + + sizeof(struct gfs2_meta_header)); + *ptr = zero_bn; + state = ALLOC_GROW_DEPTH; + for(i = branch_start; i < mp->mp_fheight; i++) { + if (mp->mp_bh[i] == NULL) + break; + brelse(mp->mp_bh[i]); + mp->mp_bh[i] = NULL; + } + i = branch_start; + } + if (n == 0) + break; + fallthrough; /* To branching from existing tree */ + case ALLOC_GROW_DEPTH: + if (i > 1 && i < mp->mp_fheight) + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); + for (; i < mp->mp_fheight && n > 0; i++, n--) + gfs2_indirect_init(mp, ip->i_gl, i, + mp->mp_list[i-1], bn++); + if (i == mp->mp_fheight) + state = ALLOC_DATA; + if (n == 0) + break; + fallthrough; /* To tree complete, adding data blocks */ + case ALLOC_DATA: + BUG_ON(n > dblks); + BUG_ON(mp->mp_bh[end_of_metadata] == NULL); + gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); + dblks = n; + ptr = metapointer(end_of_metadata, mp); + iomap->addr = bn << inode->i_blkbits; + iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW; + while (n-- > 0) + *ptr++ = cpu_to_be64(bn++); + break; + } + } while (iomap->addr == IOMAP_NULL_ADDR); + + iomap->type = IOMAP_MAPPED; + iomap->length = (u64)dblks << inode->i_blkbits; + ip->i_height = mp->mp_fheight; + 
gfs2_add_inode_blocks(&ip->i_inode, alloced); + gfs2_dinode_out(ip, dibh->b_data); +out: + up_write(&ip->i_rw_mutex); + return ret; +} + +#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE + +/** + * gfs2_alloc_size - Compute the maximum allocation size + * @inode: The inode + * @mp: The metapath + * @size: Requested size in blocks + * + * Compute the maximum size of the next allocation at @mp. + * + * Returns: size in blocks + */ +static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const __be64 *first, *ptr, *end; + + /* + * For writes to stuffed files, this function is called twice via + * __gfs2_iomap_get, before and after unstuffing. The size we return the + * first time needs to be large enough to get the reservation and + * allocation sizes right. The size we return the second time must + * be exact or else __gfs2_iomap_alloc won't do the right thing. + */ + + if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) { + unsigned int maxsize = mp->mp_fheight > 1 ? + sdp->sd_inptrs : sdp->sd_diptrs; + maxsize -= mp->mp_list[mp->mp_fheight - 1]; + if (size > maxsize) + size = maxsize; + return size; + } + + first = metapointer(ip->i_height - 1, mp); + end = metaend(ip->i_height - 1, mp); + if (end - first > size) + end = first + size; + for (ptr = first; ptr < end; ptr++) { + if (*ptr) + break; + } + return ptr - first; +} + +/** + * __gfs2_iomap_get - Map blocks from an inode to disk blocks + * @inode: The inode + * @pos: Starting position in bytes + * @length: Length to map, in bytes + * @flags: iomap flags + * @iomap: The iomap structure + * @mp: The metapath + * + * Returns: errno + */ +static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap, + struct metapath *mp) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t size = i_size_read(inode); + __be64 *ptr; + sector_t lblock; + sector_t lblock_stop; + int ret; + int eob; + u64 len; + struct buffer_head *dibh = NULL, *bh; + u8 height; + + if (!length) + return -EINVAL; + + down_read(&ip->i_rw_mutex); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (ret) + goto unlock; + mp->mp_bh[0] = dibh; + + if (gfs2_is_stuffed(ip)) { + if (flags & IOMAP_WRITE) { + loff_t max_size = gfs2_max_stuffed_size(ip); + + if (pos + length > max_size) + goto unstuff; + iomap->length = max_size; + } else { + if (pos >= size) { + if (flags & IOMAP_REPORT) { + ret = -ENOENT; + goto unlock; + } else { + iomap->offset = pos; + iomap->length = length; + goto hole_found; + } + } + iomap->length = size; + } + iomap->addr = (ip->i_no_addr << inode->i_blkbits) + + sizeof(struct gfs2_dinode); + iomap->type = IOMAP_INLINE; + iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode); + goto out; + } + +unstuff: + lblock = pos >> inode->i_blkbits; + iomap->offset = lblock << inode->i_blkbits; + lblock_stop = (pos + length - 1) >> inode->i_blkbits; + len = lblock_stop - lblock + 1; + iomap->length = len << inode->i_blkbits; + + height = ip->i_height; + while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height]) + height++; + find_metapath(sdp, lblock, mp, height); + if (height > ip->i_height || gfs2_is_stuffed(ip)) + goto do_alloc; + + ret = lookup_metapath(ip, mp); + if (ret) + goto unlock; + + if (mp->mp_aheight != ip->i_height) + goto do_alloc; + + ptr = metapointer(ip->i_height - 1, mp); + if (*ptr == 0) + goto do_alloc; + + bh = 
mp->mp_bh[ip->i_height - 1]; + len = gfs2_extent_length(bh, ptr, len, &eob); + + iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits; + iomap->length = len << inode->i_blkbits; + iomap->type = IOMAP_MAPPED; + iomap->flags |= IOMAP_F_MERGED; + if (eob) + iomap->flags |= IOMAP_F_GFS2_BOUNDARY; + +out: + iomap->bdev = inode->i_sb->s_bdev; +unlock: + up_read(&ip->i_rw_mutex); + return ret; + +do_alloc: + if (flags & IOMAP_REPORT) { + if (pos >= size) + ret = -ENOENT; + else if (height == ip->i_height) + ret = gfs2_hole_size(inode, lblock, len, mp, iomap); + else + iomap->length = size - iomap->offset; + } else if (flags & IOMAP_WRITE) { + u64 alloc_size; + + if (flags & IOMAP_DIRECT) + goto out; /* (see gfs2_file_direct_write) */ + + len = gfs2_alloc_size(inode, mp, len); + alloc_size = len << inode->i_blkbits; + if (alloc_size < iomap->length) + iomap->length = alloc_size; + } else { + if (pos < size && height == ip->i_height) + ret = gfs2_hole_size(inode, lblock, len, mp, iomap); + } +hole_found: + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + goto out; +} + +static struct folio * +gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) +{ + struct inode *inode = iter->inode; + unsigned int blockmask = i_blocksize(inode) - 1; + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned int blocks; + struct folio *folio; + int status; + + blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits; + status = gfs2_trans_begin(sdp, RES_DINODE + blocks, 0); + if (status) + return ERR_PTR(status); + + folio = iomap_get_folio(iter, pos, len); + if (IS_ERR(folio)) + gfs2_trans_end(sdp); + return folio; +} + +static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, + unsigned copied, struct folio *folio) +{ + struct gfs2_trans *tr = current->journal_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (!gfs2_is_stuffed(ip)) + gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos), + copied); + + folio_unlock(folio); + folio_put(folio); + + if (tr->tr_num_buf_new) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + + gfs2_trans_end(sdp); +} + +static const struct iomap_folio_ops gfs2_iomap_folio_ops = { + .get_folio = gfs2_iomap_get_folio, + .put_folio = gfs2_iomap_put_folio, +}; + +static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, + loff_t length, unsigned flags, + struct iomap *iomap, + struct metapath *mp) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + bool unstuff; + int ret; + + unstuff = gfs2_is_stuffed(ip) && + pos + length > gfs2_max_stuffed_size(ip); + + if (unstuff || iomap->type == IOMAP_HOLE) { + unsigned int data_blocks, ind_blocks; + struct gfs2_alloc_parms ap = {}; + unsigned int rblocks; + struct gfs2_trans *tr; + + gfs2_write_calc_reserv(ip, iomap->length, &data_blocks, + &ind_blocks); + ap.target = data_blocks + ind_blocks; + ret = gfs2_quota_lock_check(ip, &ap); + if (ret) + return ret; + + ret = gfs2_inplace_reserve(ip, &ap); + if (ret) + goto out_qunlock; + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks; + if (ind_blocks || data_blocks) + rblocks += RES_STATFS + RES_QUOTA; + if (inode == sdp->sd_rindex) + rblocks += 2 * RES_STATFS; + rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); + + ret = gfs2_trans_begin(sdp, rblocks, + iomap->length >> inode->i_blkbits); + if (ret) + goto out_trans_fail; + + if (unstuff) { + ret = gfs2_unstuff_dinode(ip); + if (ret) + goto out_trans_end; + 
release_metapath(mp); + ret = __gfs2_iomap_get(inode, iomap->offset, + iomap->length, flags, iomap, mp); + if (ret) + goto out_trans_end; + } + + if (iomap->type == IOMAP_HOLE) { + ret = __gfs2_iomap_alloc(inode, iomap, mp); + if (ret) { + gfs2_trans_end(sdp); + gfs2_inplace_release(ip); + punch_hole(ip, iomap->offset, iomap->length); + goto out_qunlock; + } + } + + tr = current->journal_info; + if (tr->tr_num_buf_new) + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + + gfs2_trans_end(sdp); + } + + if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip)) + iomap->folio_ops = &gfs2_iomap_folio_ops; + return 0; + +out_trans_end: + gfs2_trans_end(sdp); +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); + return ret; +} + +static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + if (gfs2_is_jdata(ip)) + iomap->flags |= IOMAP_F_BUFFER_HEAD; + + trace_gfs2_iomap_start(ip, pos, length, flags); + ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); + if (ret) + goto out_unlock; + + switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) { + case IOMAP_WRITE: + if (flags & IOMAP_DIRECT) { + /* + * Silently fall back to buffered I/O for stuffed files + * or if we've got a hole (see gfs2_file_direct_write). + */ + if (iomap->type != IOMAP_MAPPED) + ret = -ENOTBLK; + goto out_unlock; + } + break; + case IOMAP_ZERO: + if (iomap->type == IOMAP_HOLE) + goto out_unlock; + break; + default: + goto out_unlock; + } + + ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp); + +out_unlock: + release_metapath(&mp); + trace_gfs2_iomap_end(ip, iomap, ret); + return ret; +} + +static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) { + case IOMAP_WRITE: + if (flags & IOMAP_DIRECT) + return 0; + break; + case IOMAP_ZERO: + if (iomap->type == IOMAP_HOLE) + return 0; + break; + default: + return 0; + } + + if (!gfs2_is_stuffed(ip)) + gfs2_ordered_add_inode(ip); + + if (inode == sdp->sd_rindex) + adjust_fs_space(inode); + + gfs2_inplace_release(ip); + + if (ip->i_qadata && ip->i_qadata->qa_qd_num) + gfs2_quota_unlock(ip); + + if (length != written && (iomap->flags & IOMAP_F_NEW)) { + /* Deallocate blocks that were just allocated. */ + loff_t hstart = round_up(pos + written, i_blocksize(inode)); + loff_t hend = iomap->offset + iomap->length; + + if (hstart < hend) { + truncate_pagecache_range(inode, hstart, hend - 1); + punch_hole(ip, hstart, hend - hstart); + } + } + + if (unlikely(!written)) + return 0; + + if (iomap->flags & IOMAP_F_SIZE_CHANGED) + mark_inode_dirty(inode); + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); + return 0; +} + +const struct iomap_ops gfs2_iomap_ops = { + .iomap_begin = gfs2_iomap_begin, + .iomap_end = gfs2_iomap_end, +}; + +/** + * gfs2_block_map - Map one or more blocks of an inode to a disk block + * @inode: The inode + * @lblock: The logical block number + * @bh_map: The bh to be mapped + * @create: True if it's ok to allocate blocks to satisfy the request + * + * The size of the requested mapping is defined in bh_map->b_size. + * + * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged + * when @lblock is not mapped. 
Sets buffer_mapped(bh_map) and + * bh_map->b_size to indicate the size of the mapping when @lblock and + * successive blocks are mapped, up to the requested size. + * + * Sets buffer_boundary() if a read of metadata will be required + * before the next block can be mapped. Sets buffer_new() if new + * blocks were allocated. + * + * Returns: errno + */ + +int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh_map, int create) +{ + struct gfs2_inode *ip = GFS2_I(inode); + loff_t pos = (loff_t)lblock << inode->i_blkbits; + loff_t length = bh_map->b_size; + struct iomap iomap = { }; + int ret; + + clear_buffer_mapped(bh_map); + clear_buffer_new(bh_map); + clear_buffer_boundary(bh_map); + trace_gfs2_bmap(ip, bh_map, lblock, create, 1); + + if (!create) + ret = gfs2_iomap_get(inode, pos, length, &iomap); + else + ret = gfs2_iomap_alloc(inode, pos, length, &iomap); + if (ret) + goto out; + + if (iomap.length > bh_map->b_size) { + iomap.length = bh_map->b_size; + iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY; + } + if (iomap.addr != IOMAP_NULL_ADDR) + map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits); + bh_map->b_size = iomap.length; + if (iomap.flags & IOMAP_F_GFS2_BOUNDARY) + set_buffer_boundary(bh_map); + if (iomap.flags & IOMAP_F_NEW) + set_buffer_new(bh_map); + +out: + trace_gfs2_bmap(ip, bh_map, lblock, create, ret); + return ret; +} + +int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned int *extlen) +{ + unsigned int blkbits = inode->i_blkbits; + struct iomap iomap = { }; + unsigned int len; + int ret; + + ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits, + &iomap); + if (ret) + return ret; + if (iomap.type != IOMAP_MAPPED) + return -EIO; + *dblock = iomap.addr >> blkbits; + len = iomap.length >> blkbits; + if (len < *extlen) + *extlen = len; + return 0; +} + +int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned int *extlen, bool *new) +{ + unsigned int blkbits = inode->i_blkbits; + struct iomap iomap = { }; + unsigned int len; + int ret; + + ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits, + &iomap); + if (ret) + return ret; + if (iomap.type != IOMAP_MAPPED) + return -EIO; + *dblock = iomap.addr >> blkbits; + len = iomap.length >> blkbits; + if (len < *extlen) + *extlen = len; + *new = iomap.flags & IOMAP_F_NEW; + return 0; +} + +/* + * NOTE: Never call gfs2_block_zero_range with an open transaction because it + * uses iomap write to perform its actions, which begin their own transactions + * (iomap_begin, get_folio, etc.) + */ +static int gfs2_block_zero_range(struct inode *inode, loff_t from, + unsigned int length) +{ + BUG_ON(current->journal_info); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); +} + +#define GFS2_JTRUNC_REVOKES 8192 + +/** + * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files + * @inode: The inode being truncated + * @oldsize: The original (larger) size + * @newsize: The new smaller size + * + * With jdata files, we have to journal a revoke for each block which is + * truncated. As a result, we need to split this into separate transactions + * if the number of pages being truncated gets too large. 
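+ *
+ * As a rough worked example (assuming a 4KiB block size, which is an
+ * assumption for illustration only): max_chunk works out to
+ * GFS2_JTRUNC_REVOKES * s_blocksize = 8192 * 4096 bytes = 32MiB, so
+ * truncating 1GiB of jdata pages is split into roughly 32 separate
+ * transactions, each with its own set of revokes.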
+ */ + +static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; + u64 chunk; + int error; + + while (oldsize != newsize) { + struct gfs2_trans *tr; + unsigned int offs; + + chunk = oldsize - newsize; + if (chunk > max_chunk) + chunk = max_chunk; + + offs = oldsize & ~PAGE_MASK; + if (offs && chunk > PAGE_SIZE) + chunk = offs + ((chunk - offs) & PAGE_MASK); + + truncate_pagecache(inode, oldsize - chunk); + oldsize -= chunk; + + tr = current->journal_info; + if (!test_bit(TR_TOUCHED, &tr->tr_flags)) + continue; + + gfs2_trans_end(sdp); + error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); + if (error) + return error; + } + + return 0; +} + +static int trunc_start(struct inode *inode, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *dibh = NULL; + int journaled = gfs2_is_jdata(ip); + u64 oldsize = inode->i_size; + int error; + + if (!gfs2_is_stuffed(ip)) { + unsigned int blocksize = i_blocksize(inode); + unsigned int offs = newsize & (blocksize - 1); + if (offs) { + error = gfs2_block_zero_range(inode, newsize, + blocksize - offs); + if (error) + return error; + } + } + if (journaled) + error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES); + else + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + gfs2_trans_add_meta(ip->i_gl, dibh); + + if (gfs2_is_stuffed(ip)) + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); + else + ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; + + i_size_write(inode, newsize); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + gfs2_dinode_out(ip, dibh->b_data); + + if (journaled) + error = gfs2_journaled_truncate(inode, oldsize, newsize); + else + truncate_pagecache(inode, newsize); + +out: + brelse(dibh); + if (current->journal_info) + gfs2_trans_end(sdp); + return error; +} + +int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap) +{ + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp); + release_metapath(&mp); + return ret; +} + +int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap) +{ + struct metapath mp = { .mp_aheight = 1, }; + int ret; + + ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp); + if (!ret && iomap->type == IOMAP_HOLE) + ret = __gfs2_iomap_alloc(inode, iomap, &mp); + release_metapath(&mp); + return ret; +} + +/** + * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein + * @ip: inode + * @rd_gh: holder of resource group glock + * @bh: buffer head to sweep + * @start: starting point in bh + * @end: end point in bh + * @meta: true if bh points to metadata (rather than data) + * @btotal: place to keep count of total blocks freed + * + * We sweep a metadata buffer (provided by the metapath) for blocks we need to + * free, and free them all. However, we do it one rgrp at a time. If this + * block has references to multiple rgrps, we break it into individual + * transactions. This allows other processes to use the rgrps while we're + * focused on a single one, for better concurrency / performance. + * At every transaction boundary, we rewrite the inode into the journal. 
+ * That way the bitmaps are kept consistent with the inode and we can recover + * if we're interrupted by power-outages. + * + * Returns: 0, or return code if an error occurred. + * *btotal has the total number of blocks freed + */ +static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh, + struct buffer_head *bh, __be64 *start, __be64 *end, + bool meta, u32 *btotal) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_trans *tr; + __be64 *p; + int blks_outside_rgrp; + u64 bn, bstart, isize_blks; + s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */ + int ret = 0; + bool buf_in_tr = false; /* buffer was added to transaction */ + +more_rgrps: + rgd = NULL; + if (gfs2_holder_initialized(rd_gh)) { + rgd = gfs2_glock2rgrp(rd_gh->gh_gl); + gfs2_assert_withdraw(sdp, + gfs2_glock_is_locked_by_me(rd_gh->gh_gl)); + } + blks_outside_rgrp = 0; + bstart = 0; + blen = 0; + + for (p = start; p < end; p++) { + if (!*p) + continue; + bn = be64_to_cpu(*p); + + if (rgd) { + if (!rgrp_contains_block(rgd, bn)) { + blks_outside_rgrp++; + continue; + } + } else { + rgd = gfs2_blk2rgrpd(sdp, bn, true); + if (unlikely(!rgd)) { + ret = -EIO; + goto out; + } + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, rd_gh); + if (ret) + goto out; + + /* Must be done with the rgrp glock held: */ + if (gfs2_rs_active(&ip->i_res) && + rgd == ip->i_res.rs_rgd) + gfs2_rs_deltree(&ip->i_res); + } + + /* The size of our transactions will be unknown until we + actually process all the metadata blocks that relate to + the rgrp. So we estimate. We know it can't be more than + the dinode's i_blocks and we don't want to exceed the + journal flush threshold, sd_log_thresh2. */ + if (current->journal_info == NULL) { + unsigned int jblocks_rqsted, revokes; + + jblocks_rqsted = rgd->rd_length + RES_DINODE + + RES_INDIRECT; + isize_blks = gfs2_get_inode_blocks(&ip->i_inode); + if (isize_blks > atomic_read(&sdp->sd_log_thresh2)) + jblocks_rqsted += + atomic_read(&sdp->sd_log_thresh2); + else + jblocks_rqsted += isize_blks; + revokes = jblocks_rqsted; + if (meta) + revokes += end - start; + else if (ip->i_depth) + revokes += sdp->sd_inptrs; + ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes); + if (ret) + goto out_unlock; + down_write(&ip->i_rw_mutex); + } + /* check if we will exceed the transaction blocks requested */ + tr = current->journal_info; + if (tr->tr_num_buf_new + RES_STATFS + + RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) { + /* We set blks_outside_rgrp to ensure the loop will + be repeated for the same rgrp, but with a new + transaction. */ + blks_outside_rgrp++; + /* This next part is tricky. If the buffer was added + to the transaction, we've already set some block + pointers to 0, so we better follow through and free + them, or we will introduce corruption (so break). + This may be impossible, or at least rare, but I + decided to cover the case regardless. + + If the buffer was not added to the transaction + (this call), doing so would exceed our transaction + size, so we need to end the transaction and start a + new one (so goto). 
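+
+			   In short: buf_in_tr == true means pointers in
+			   this bh were already zeroed under this
+			   transaction, so we must break and free them
+			   before ending it; buf_in_tr == false means this
+			   bh is still untouched, so ending the transaction
+			   and retrying (goto) is safe.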
*/ + + if (buf_in_tr) + break; + goto out_unlock; + } + + gfs2_trans_add_meta(ip->i_gl, bh); + buf_in_tr = true; + *p = 0; + if (bstart + blen == bn) { + blen++; + continue; + } + if (bstart) { + __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta); + (*btotal) += blen; + gfs2_add_inode_blocks(&ip->i_inode, -blen); + } + bstart = bn; + blen = 1; + } + if (bstart) { + __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta); + (*btotal) += blen; + gfs2_add_inode_blocks(&ip->i_inode, -blen); + } +out_unlock: + if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks + outside the rgrp we just processed, + do it all over again. */ + if (current->journal_info) { + struct buffer_head *dibh; + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (ret) + goto out; + + /* Every transaction boundary, we rewrite the dinode + to keep its di_blocks current in case of failure. */ + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + buf_in_tr = false; + } + gfs2_glock_dq_uninit(rd_gh); + cond_resched(); + goto more_rgrps; + } +out: + return ret; +} + +static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h) +{ + if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0]))) + return false; + return true; +} + +/** + * find_nonnull_ptr - find a non-null pointer given a metapath and height + * @sdp: The superblock + * @mp: starting metapath + * @h: desired height to search + * @end_list: See punch_hole(). + * @end_aligned: See punch_hole(). + * + * Assumes the metapath is valid (with buffers) out to height h. + * Returns: true if a non-null pointer was found in the metapath buffer + * false if all remaining pointers are NULL in the buffer + */ +static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, + unsigned int h, + __u16 *end_list, unsigned int end_aligned) +{ + struct buffer_head *bh = mp->mp_bh[h]; + __be64 *first, *ptr, *end; + + first = metaptr1(h, mp); + ptr = first + mp->mp_list[h]; + end = (__be64 *)(bh->b_data + bh->b_size); + if (end_list && mp_eq_to_hgt(mp, end_list, h)) { + bool keep_end = h < end_aligned; + end = first + end_list[h] + keep_end; + } + + while (ptr < end) { + if (*ptr) { /* if we have a non-null pointer */ + mp->mp_list[h] = ptr - first; + h++; + if (h < GFS2_MAX_META_HEIGHT) + mp->mp_list[h] = 0; + return true; + } + ptr++; + } + return false; +} + +enum dealloc_states { + DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */ + DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */ + DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. 
*/ + DEALLOC_DONE = 3, /* process complete */ +}; + +static inline void +metapointer_range(struct metapath *mp, int height, + __u16 *start_list, unsigned int start_aligned, + __u16 *end_list, unsigned int end_aligned, + __be64 **start, __be64 **end) +{ + struct buffer_head *bh = mp->mp_bh[height]; + __be64 *first; + + first = metaptr1(height, mp); + *start = first; + if (mp_eq_to_hgt(mp, start_list, height)) { + bool keep_start = height < start_aligned; + *start = first + start_list[height] + keep_start; + } + *end = (__be64 *)(bh->b_data + bh->b_size); + if (end_list && mp_eq_to_hgt(mp, end_list, height)) { + bool keep_end = height < end_aligned; + *end = first + end_list[height] + keep_end; + } +} + +static inline bool walk_done(struct gfs2_sbd *sdp, + struct metapath *mp, int height, + __u16 *end_list, unsigned int end_aligned) +{ + __u16 end; + + if (end_list) { + bool keep_end = height < end_aligned; + if (!mp_eq_to_hgt(mp, end_list, height)) + return false; + end = end_list[height] + keep_end; + } else + end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs; + return mp->mp_list[height] >= end; +} + +/** + * punch_hole - deallocate blocks in a file + * @ip: inode to truncate + * @offset: the start of the hole + * @length: the size of the hole (or 0 for truncate) + * + * Punch a hole into a file or truncate a file at a given position. This + * function operates in whole blocks (@offset and @length are rounded + * accordingly); partially filled blocks must be cleared otherwise. + * + * This function works from the bottom up, and from the right to the left. In + * other words, it strips off the highest layer (data) before stripping any of + * the metadata. Doing it this way is best in case the operation is interrupted + * by power failure, etc. The dinode is rewritten in every transaction to + * guarantee integrity. + */ +static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 maxsize = sdp->sd_heightsize[ip->i_height]; + struct metapath mp = {}; + struct buffer_head *dibh, *bh; + struct gfs2_holder rd_gh; + unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; + u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift; + __u16 start_list[GFS2_MAX_META_HEIGHT]; + __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL; + unsigned int start_aligned, end_aligned; + unsigned int strip_h = ip->i_height - 1; + u32 btotal = 0; + int ret, state; + int mp_h; /* metapath buffers are read in to this height */ + u64 prev_bnr = 0; + __be64 *start, *end; + + if (offset >= maxsize) { + /* + * The starting point lies beyond the allocated metadata; + * there are no blocks to deallocate. + */ + return 0; + } + + /* + * The start position of the hole is defined by lblock, start_list, and + * start_aligned. The end position of the hole is defined by lend, + * end_list, and end_aligned. + * + * start_aligned and end_aligned define down to which height the start + * and end positions are aligned to the metadata tree (i.e., the + * position is a multiple of the metadata granularity at the height + * above). This determines at which heights additional meta pointers + * needs to be preserved for the remaining data. + */ + + if (length) { + u64 end_offset = offset + length; + u64 lend; + + /* + * Clip the end at the maximum file size for the given height: + * that's how far the metadata goes; files bigger than that + * will have additional layers of indirection. 
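+		 *
+		 * As a rough example (assuming a 4KiB block size, which
+		 * gives 483 pointers in the dinode and 509 per indirect
+		 * block), a height-2 tree addresses at most
+		 * 483 * 509 * 4096 bytes, just under 1GiB, so any
+		 * end_offset beyond that is clipped to maxsize here.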
+ */ + if (end_offset > maxsize) + end_offset = maxsize; + lend = end_offset >> bsize_shift; + + if (lblock >= lend) + return 0; + + find_metapath(sdp, lend, &mp, ip->i_height); + end_list = __end_list; + memcpy(end_list, mp.mp_list, sizeof(mp.mp_list)); + + for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) { + if (end_list[mp_h]) + break; + } + end_aligned = mp_h; + } + + find_metapath(sdp, lblock, &mp, ip->i_height); + memcpy(start_list, mp.mp_list, sizeof(start_list)); + + for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) { + if (start_list[mp_h]) + break; + } + start_aligned = mp_h; + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (ret) + return ret; + + mp.mp_bh[0] = dibh; + ret = lookup_metapath(ip, &mp); + if (ret) + goto out_metapath; + + /* issue read-ahead on metadata */ + for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) { + metapointer_range(&mp, mp_h, start_list, start_aligned, + end_list, end_aligned, &start, &end); + gfs2_metapath_ra(ip->i_gl, start, end); + } + + if (mp.mp_aheight == ip->i_height) + state = DEALLOC_MP_FULL; /* We have a complete metapath */ + else + state = DEALLOC_FILL_MP; /* deal with partial metapath */ + + ret = gfs2_rindex_update(sdp); + if (ret) + goto out_metapath; + + ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (ret) + goto out_metapath; + gfs2_holder_mark_uninitialized(&rd_gh); + + mp_h = strip_h; + + while (state != DEALLOC_DONE) { + switch (state) { + /* Truncate a full metapath at the given strip height. + * Note that strip_h == mp_h in order to be in this state. */ + case DEALLOC_MP_FULL: + bh = mp.mp_bh[mp_h]; + gfs2_assert_withdraw(sdp, bh); + if (gfs2_assert_withdraw(sdp, + prev_bnr != bh->b_blocknr)) { + fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u," + "s_h:%u, mp_h:%u\n", + (unsigned long long)ip->i_no_addr, + prev_bnr, ip->i_height, strip_h, mp_h); + } + prev_bnr = bh->b_blocknr; + + if (gfs2_metatype_check(sdp, bh, + (mp_h ? GFS2_METATYPE_IN : + GFS2_METATYPE_DI))) { + ret = -EIO; + goto out; + } + + /* + * Below, passing end_aligned as 0 gives us the + * metapointer range excluding the end point: the end + * point is the first metapath we must not deallocate! + */ + + metapointer_range(&mp, mp_h, start_list, start_aligned, + end_list, 0 /* end_aligned */, + &start, &end); + ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h], + start, end, + mp_h != ip->i_height - 1, + &btotal); + + /* If we hit an error or just swept dinode buffer, + just exit. */ + if (ret || !mp_h) { + state = DEALLOC_DONE; + break; + } + state = DEALLOC_MP_LOWER; + break; + + /* lower the metapath strip height */ + case DEALLOC_MP_LOWER: + /* We're done with the current buffer, so release it, + unless it's the dinode buffer. Then back up to the + previous pointer. */ + if (mp_h) { + brelse(mp.mp_bh[mp_h]); + mp.mp_bh[mp_h] = NULL; + } + /* If we can't get any lower in height, we've stripped + off all we can. Next step is to back up and start + stripping the previous level of metadata. */ + if (mp_h == 0) { + strip_h--; + memcpy(mp.mp_list, start_list, sizeof(start_list)); + mp_h = strip_h; + state = DEALLOC_FILL_MP; + break; + } + mp.mp_list[mp_h] = 0; + mp_h--; /* search one metadata height down */ + mp.mp_list[mp_h]++; + if (walk_done(sdp, &mp, mp_h, end_list, end_aligned)) + break; + /* Here we've found a part of the metapath that is not + * allocated. We need to search at that height for the + * next non-null pointer. 
*/ + if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) { + state = DEALLOC_FILL_MP; + mp_h++; + } + /* No more non-null pointers at this height. Back up + to the previous height and try again. */ + break; /* loop around in the same state */ + + /* Fill the metapath with buffers to the given height. */ + case DEALLOC_FILL_MP: + /* Fill the buffers out to the current height. */ + ret = fillup_metapath(ip, &mp, mp_h); + if (ret < 0) + goto out; + + /* On the first pass, issue read-ahead on metadata. */ + if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) { + unsigned int height = mp.mp_aheight - 1; + + /* No read-ahead for data blocks. */ + if (mp.mp_aheight - 1 == strip_h) + height--; + + for (; height >= mp.mp_aheight - ret; height--) { + metapointer_range(&mp, height, + start_list, start_aligned, + end_list, end_aligned, + &start, &end); + gfs2_metapath_ra(ip->i_gl, start, end); + } + } + + /* If buffers found for the entire strip height */ + if (mp.mp_aheight - 1 == strip_h) { + state = DEALLOC_MP_FULL; + break; + } + if (mp.mp_aheight < ip->i_height) /* We have a partial height */ + mp_h = mp.mp_aheight - 1; + + /* If we find a non-null block pointer, crawl a bit + higher up in the metapath and try again, otherwise + we need to look lower for a new starting point. */ + if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) + mp_h++; + else + state = DEALLOC_MP_LOWER; + break; + } + } + + if (btotal) { + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + + RES_QUOTA, 0); + if (ret) + goto out; + down_write(&ip->i_rw_mutex); + } + gfs2_statfs_change(sdp, 0, +btotal, 0); + gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, + ip->i_inode.i_gid); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + } + +out: + if (gfs2_holder_initialized(&rd_gh)) + gfs2_glock_dq_uninit(&rd_gh); + if (current->journal_info) { + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + cond_resched(); + } + gfs2_quota_unhold(ip); +out_metapath: + release_metapath(&mp); + return ret; +} + +static int trunc_end(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + int error; + + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + down_write(&ip->i_rw_mutex); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + if (!i_size_read(&ip->i_inode)) { + ip->i_height = 0; + ip->i_goal = ip->i_no_addr; + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + gfs2_ordered_del_inode(ip); + } + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; + + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + +out: + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + return error; +} + +/** + * do_shrink - make a file smaller + * @inode: the inode + * @newsize: the size to make the file + * + * Called with an exclusive lock on @inode. The @size must + * be equal to or smaller than the current inode size. 
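+ *
+ * For example (sizes illustrative only): shrinking a 10MiB file to
+ * newsize 4097 zeroes the tail of the last kept block in
+ * trunc_start(), frees every block past it via punch_hole(ip,
+ * newsize, 0), and finally clears GFS2_DIF_TRUNC_IN_PROG in
+ * trunc_end().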
+ * + * Returns: errno + */ + +static int do_shrink(struct inode *inode, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + error = trunc_start(inode, newsize); + if (error < 0) + return error; + if (gfs2_is_stuffed(ip)) + return 0; + + error = punch_hole(ip, newsize, 0); + if (error == 0) + error = trunc_end(ip); + + return error; +} + +/** + * do_grow - Touch and update inode size + * @inode: The inode + * @size: The new size + * + * This function updates the timestamps on the inode and + * may also increase the size of the inode. This function + * must not be called with @size any smaller than the current + * inode size. + * + * Although it is not strictly required to unstuff files here, + * earlier versions of GFS2 have a bug in the stuffed file reading + * code which will result in a buffer overrun if the size is larger + * than the max stuffed file size. In order to prevent this from + * occurring, such files are unstuffed, but in other cases we can + * just update the inode size directly. + * + * Returns: 0 on success, or -ve on error + */ + +static int do_grow(struct inode *inode, u64 size) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_alloc_parms ap = { .target = 1, }; + struct buffer_head *dibh; + int error; + int unstuff = 0; + + if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) { + error = gfs2_quota_lock_check(ip, &ap); + if (error) + return error; + + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto do_grow_qunlock; + unstuff = 1; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT + + (unstuff && + gfs2_is_jdata(ip) ? RES_JDATA : 0) + + (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ? + 0 : RES_QUOTA), 0); + if (error) + goto do_grow_release; + + if (unstuff) { + error = gfs2_unstuff_dinode(ip); + if (error) + goto do_end_trans; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto do_end_trans; + + truncate_setsize(inode, size); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + +do_end_trans: + gfs2_trans_end(sdp); +do_grow_release: + if (unstuff) { + gfs2_inplace_release(ip); +do_grow_qunlock: + gfs2_quota_unlock(ip); + } + return error; +} + +/** + * gfs2_setattr_size - make a file a given size + * @inode: the inode + * @newsize: the size to make the file + * + * The file size can grow, shrink, or stay the same size. This + * is called holding i_rwsem and an exclusive glock on the inode + * in question. 
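+ *
+ * For instance, growing past gfs2_max_stuffed_size() goes through
+ * do_grow(), which unstuffs the dinode first; any smaller @newsize is
+ * handled by do_shrink(); a @newsize equal to the current size still
+ * takes the do_grow() path and simply rewrites the dinode.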
+ * + * Returns: errno + */ + +int gfs2_setattr_size(struct inode *inode, u64 newsize) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + + BUG_ON(!S_ISREG(inode->i_mode)); + + ret = inode_newsize_ok(inode, newsize); + if (ret) + return ret; + + inode_dio_wait(inode); + + ret = gfs2_qa_get(ip); + if (ret) + goto out; + + if (newsize >= inode->i_size) { + ret = do_grow(inode, newsize); + goto out; + } + + ret = do_shrink(inode, newsize); +out: + gfs2_rs_delete(ip); + gfs2_qa_put(ip); + return ret; +} + +int gfs2_truncatei_resume(struct gfs2_inode *ip) +{ + int error; + error = punch_hole(ip, i_size_read(&ip->i_inode), 0); + if (!error) + error = trunc_end(ip); + return error; +} + +int gfs2_file_dealloc(struct gfs2_inode *ip) +{ + return punch_hole(ip, 0, 0); +} + +/** + * gfs2_free_journal_extents - Free cached journal bmap info + * @jd: The journal + * + */ + +void gfs2_free_journal_extents(struct gfs2_jdesc *jd) +{ + struct gfs2_journal_extent *jext; + + while(!list_empty(&jd->extent_list)) { + jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list); + list_del(&jext->list); + kfree(jext); + } +} + +/** + * gfs2_add_jextent - Add or merge a new extent to extent cache + * @jd: The journal descriptor + * @lblock: The logical block at start of new extent + * @dblock: The physical block at start of new extent + * @blocks: Size of extent in fs blocks + * + * Returns: 0 on success or -ENOMEM + */ + +static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks) +{ + struct gfs2_journal_extent *jext; + + if (!list_empty(&jd->extent_list)) { + jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list); + if ((jext->dblock + jext->blocks) == dblock) { + jext->blocks += blocks; + return 0; + } + } + + jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS); + if (jext == NULL) + return -ENOMEM; + jext->dblock = dblock; + jext->lblock = lblock; + jext->blocks = blocks; + list_add_tail(&jext->list, &jd->extent_list); + jd->nr_extents++; + return 0; +} + +/** + * gfs2_map_journal_extents - Cache journal bmap info + * @sdp: The super block + * @jd: The journal to map + * + * Create a reusable "extent" mapping from all logical + * blocks to all physical blocks for the given journal. This will save + * us time when writing journal blocks. Most journals will have only one + * extent that maps all their logical blocks. That's because gfs2.mkfs + * arranges the journal blocks sequentially to maximize performance. + * So the extent would map the first block for the entire file length. + * However, gfs2_jadd can happen while file activity is happening, so + * those journals may not be sequential. Less likely is the case where + * the users created their own journals by mounting the metafs and + * laying it out. But it's still possible. These journals might have + * several extents. 
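+ *
+ * As an illustrative example (the numbers are assumed, not
+ * guaranteed): a sequentially laid out 128MiB journal on a
+ * 4KiB-block filesystem maps to a single extent of 32768 blocks,
+ * leaving jd->nr_extents at 1.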
+ * + * Returns: 0 on success, or error on failure + */ + +int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd) +{ + u64 lblock = 0; + u64 lblock_stop; + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct buffer_head bh; + unsigned int shift = sdp->sd_sb.sb_bsize_shift; + u64 size; + int rc; + ktime_t start, end; + + start = ktime_get(); + lblock_stop = i_size_read(jd->jd_inode) >> shift; + size = (lblock_stop - lblock) << shift; + jd->nr_extents = 0; + WARN_ON(!list_empty(&jd->extent_list)); + + do { + bh.b_state = 0; + bh.b_blocknr = 0; + bh.b_size = size; + rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0); + if (rc || !buffer_mapped(&bh)) + goto fail; + rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift); + if (rc) + goto fail; + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); + + end = ktime_get(); + fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid, + jd->nr_extents, ktime_ms_delta(end, start)); + return 0; + +fail: + fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n", + rc, jd->jd_jid, + (unsigned long long)(i_size_read(jd->jd_inode) - size), + jd->nr_extents); + fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n", + rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr, + bh.b_state, (unsigned long long)bh.b_size); + gfs2_free_journal_extents(jd); + return rc; +} + +/** + * gfs2_write_alloc_required - figure out if a write will require an allocation + * @ip: the file being written to + * @offset: the offset to write to + * @len: the number of bytes being written + * + * Returns: 1 if an alloc is required, 0 otherwise + */ + +int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head bh; + unsigned int shift; + u64 lblock, lblock_stop, size; + u64 end_of_file; + + if (!len) + return 0; + + if (gfs2_is_stuffed(ip)) { + if (offset + len > gfs2_max_stuffed_size(ip)) + return 1; + return 0; + } + + shift = sdp->sd_sb.sb_bsize_shift; + BUG_ON(gfs2_is_dir(ip)); + end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift; + lblock = offset >> shift; + lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; + if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex)) + return 1; + + size = (lblock_stop - lblock) << shift; + do { + bh.b_state = 0; + bh.b_size = size; + gfs2_block_map(&ip->i_inode, lblock, &bh, 0); + if (!buffer_mapped(&bh)) + return 1; + size -= bh.b_size; + lblock += (bh.b_size >> ip->i_inode.i_blkbits); + } while(size > 0); + + return 0; +} + +static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *dibh; + int error; + + if (offset >= inode->i_size) + return 0; + if (offset + length > inode->i_size) + length = inode->i_size - offset; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + gfs2_trans_add_meta(ip->i_gl, dibh); + memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0, + length); + brelse(dibh); + return 0; +} + +static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset, + loff_t length) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; + int error; + + while (length) { + struct gfs2_trans *tr; + loff_t chunk; + unsigned int offs; + + chunk = length; + if (chunk > max_chunk) + 
chunk = max_chunk; + + offs = offset & ~PAGE_MASK; + if (offs && chunk > PAGE_SIZE) + chunk = offs + ((chunk - offs) & PAGE_MASK); + + truncate_pagecache_range(inode, offset, chunk); + offset += chunk; + length -= chunk; + + tr = current->journal_info; + if (!test_bit(TR_TOUCHED, &tr->tr_flags)) + continue; + + gfs2_trans_end(sdp); + error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); + if (error) + return error; + } + return 0; +} + +int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length) +{ + struct inode *inode = file_inode(file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + unsigned int blocksize = i_blocksize(inode); + loff_t start, end; + int error; + + if (!gfs2_is_stuffed(ip)) { + unsigned int start_off, end_len; + + start_off = offset & (blocksize - 1); + end_len = (offset + length) & (blocksize - 1); + if (start_off) { + unsigned int len = length; + if (length > blocksize - start_off) + len = blocksize - start_off; + error = gfs2_block_zero_range(inode, offset, len); + if (error) + goto out; + if (start_off + length < blocksize) + end_len = 0; + } + if (end_len) { + error = gfs2_block_zero_range(inode, + offset + length - end_len, end_len); + if (error) + goto out; + } + } + + start = round_down(offset, blocksize); + end = round_up(offset + length, blocksize) - 1; + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return error; + + if (gfs2_is_jdata(ip)) + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA, + GFS2_JTRUNC_REVOKES); + else + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + return error; + + if (gfs2_is_stuffed(ip)) { + error = stuffed_zero_range(inode, offset, length); + if (error) + goto out; + } + + if (gfs2_is_jdata(ip)) { + BUG_ON(!current->journal_info); + gfs2_journaled_truncate_range(inode, offset, length); + } else + truncate_pagecache_range(inode, offset, offset + length - 1); + + file_update_time(file); + mark_inode_dirty(inode); + + if (current->journal_info) + gfs2_trans_end(sdp); + + if (!gfs2_is_stuffed(ip)) + error = punch_hole(ip, offset, length); + +out: + if (current->journal_info) + gfs2_trans_end(sdp); + return error; +} + +static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode, + loff_t offset) +{ + int ret; + + if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode)))) + return -EIO; + + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + + memset(&wpc->iomap, 0, sizeof(wpc->iomap)); + ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap); + return ret; +} + +const struct iomap_writeback_ops gfs2_writeback_ops = { + .map_blocks = gfs2_map_blocks, +}; diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h new file mode 100644 index 0000000000..e5b7d17131 --- /dev/null +++ b/fs/gfs2/bmap.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#ifndef __BMAP_DOT_H__ +#define __BMAP_DOT_H__ + +#include <linux/iomap.h> + +#include "inode.h" + +struct inode; +struct gfs2_inode; +struct page; + + +/** + * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file + * @ip: the file + * @len: the number of bytes to be written to the file + * @data_blocks: returns the number of data blocks required + * @ind_blocks: returns the number of indirect blocks required + * + */ + +static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, + unsigned int len, + unsigned int *data_blocks, + unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp; + + BUG_ON(gfs2_is_dir(ip)); + *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; + *ind_blocks = 3 * (sdp->sd_max_height - 1); + + for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + *ind_blocks += tmp; + } +} + +extern const struct iomap_ops gfs2_iomap_ops; +extern const struct iomap_writeback_ops gfs2_writeback_ops; + +extern int gfs2_unstuff_dinode(struct gfs2_inode *ip); +extern int gfs2_block_map(struct inode *inode, sector_t lblock, + struct buffer_head *bh, int create); +extern int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap); +extern int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap); +extern int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned int *extlen); +extern int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, + unsigned *extlen, bool *new); +extern int gfs2_setattr_size(struct inode *inode, u64 size); +extern int gfs2_truncatei_resume(struct gfs2_inode *ip); +extern int gfs2_file_dealloc(struct gfs2_inode *ip); +extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, + unsigned int len); +extern int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd); +extern void gfs2_free_journal_extents(struct gfs2_jdesc *jd); +extern int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length); + +#endif /* __BMAP_DOT_H__ */ diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c new file mode 100644 index 0000000000..2e215e8c3c --- /dev/null +++ b/fs/gfs2/dentry.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/namei.h> +#include <linux/crc32.h> + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "super.h" +#include "util.h" +#include "inode.h" + +/** + * gfs2_drevalidate - Check directory lookup consistency + * @dentry: the mapping to check + * @flags: lookup flags + * + * Check to make sure the lookup necessary to arrive at this inode from its + * parent is still good. 
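+ *
+ * For example, if another node has unlinked or renamed the entry,
+ * gfs2_dir_check() below fails and we return 0, so the VFS drops
+ * this dentry and repeats the lookup.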
+ * + * Returns: 1 if the dentry is ok, 0 if it isn't + */ + +static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *parent; + struct gfs2_sbd *sdp; + struct gfs2_inode *dip; + struct inode *inode; + struct gfs2_holder d_gh; + struct gfs2_inode *ip = NULL; + int error, valid = 0; + int had_lock = 0; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + parent = dget_parent(dentry); + sdp = GFS2_SB(d_inode(parent)); + dip = GFS2_I(d_inode(parent)); + inode = d_inode(dentry); + + if (inode) { + if (is_bad_inode(inode)) + goto out; + ip = GFS2_I(inode); + } + + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) { + valid = 1; + goto out; + } + + had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); + if (!had_lock) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + goto out; + } + + error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip); + valid = inode ? !error : (error == -ENOENT); + + if (!had_lock) + gfs2_glock_dq_uninit(&d_gh); +out: + dput(parent); + return valid; +} + +static int gfs2_dhash(const struct dentry *dentry, struct qstr *str) +{ + str->hash = gfs2_disk_hash(str->name, str->len); + return 0; +} + +static int gfs2_dentry_delete(const struct dentry *dentry) +{ + struct gfs2_inode *ginode; + + if (d_really_is_negative(dentry)) + return 0; + + ginode = GFS2_I(d_inode(dentry)); + if (!gfs2_holder_initialized(&ginode->i_iopen_gh)) + return 0; + + if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) + return 1; + + return 0; +} + +const struct dentry_operations gfs2_dops = { + .d_revalidate = gfs2_drevalidate, + .d_hash = gfs2_dhash, + .d_delete = gfs2_dentry_delete, +}; + diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c new file mode 100644 index 0000000000..1a2afa88f8 --- /dev/null +++ b/fs/gfs2/dir.c @@ -0,0 +1,2188 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +/* + * Implements Extendible Hashing as described in: + * "Extendible Hashing" by Fagin, et al in + * __ACM Trans. on Database Systems__, Sept 1979. + * + * + * Here's the layout of dirents which is essentially the same as that of ext2 + * within a single block. The field de_name_len is the number of bytes + * actually required for the name (no null terminator). The field de_rec_len + * is the number of bytes allocated to the dirent. The offset of the next + * dirent in the block is (dirent + dirent->de_rec_len). When a dirent is + * deleted, the preceding dirent inherits its allocated space, ie + * prev->de_rec_len += deleted->de_rec_len. Since the next dirent is obtained + * by adding de_rec_len to the current dirent, this essentially causes the + * deleted dirent to get jumped over when iterating through all the dirents. + * + * When deleting the first dirent in a block, there is no previous dirent so + * the field de_ino is set to zero to designate it as deleted. When allocating + * a dirent, gfs2_dirent_alloc iterates through the dirents in a block. If the + * first dirent has (de_ino == 0) and de_rec_len is large enough, this first + * dirent is allocated. Otherwise it must go through all the 'used' dirents + * searching for one in which the amount of total space minus the amount of + * used space will provide enough space for the new dirent. + * + * There are two types of blocks in which dirents reside. 
In a stuffed dinode, + * the dirents begin at offset sizeof(struct gfs2_dinode) from the beginning of + * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the + * beginning of the leaf block. The dirents reside in leaves when + * + * dip->i_diskflags & GFS2_DIF_EXHASH is true + * + * Otherwise, the dirents are "linear", within a single stuffed dinode block. + * + * When the dirents are in leaves, the actual contents of the directory file are + * used as an array of 64-bit block pointers pointing to the leaf blocks. The + * dirents are NOT in the directory file itself. There can be more than one + * block pointer in the array that points to the same leaf. In fact, when a + * directory is first converted from linear to exhash, all of the pointers + * point to the same leaf. + * + * When a leaf is completely full, the size of the hash table can be + * doubled unless it is already at the maximum size which is hard coded into + * GFS2_DIR_MAX_DEPTH. After that, leaves are chained together in a linked list, + * but never before the maximum hash table size has been reached. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/buffer_head.h> +#include <linux/sort.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/vmalloc.h> +#include <linux/bio.h> + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "bmap.h" +#include "util.h" + +#define MAX_RA_BLOCKS 32 /* max read-ahead blocks */ + +#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) +#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) +#define GFS2_HASH_INDEX_MASK 0xffffc000 +#define GFS2_USE_HASH_FLAG 0x2000 + +struct qstr gfs2_qdot __read_mostly; +struct qstr gfs2_qdotdot __read_mostly; + +typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, + const struct qstr *name, void *opaque); + +int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + + bh = gfs2_meta_new(ip->i_gl, block); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + *bhp = bh; + return 0; +} + +static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp) +{ + struct buffer_head *bh; + int error; + + error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh); + if (error) + return error; + if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { + brelse(bh); + return -EIO; + } + *bhp = bh; + return 0; +} + +static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, + unsigned int offset, unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + gfs2_trans_add_meta(ip->i_gl, dibh); + memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); + if (ip->i_inode.i_size < offset + size) + i_size_write(&ip->i_inode, offset + size); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + gfs2_dinode_out(ip, dibh->b_data); + + brelse(dibh); + + return size; +} + + + +/** + * gfs2_dir_write_data - Write directory information to the inode + * @ip: The GFS2 inode + * @buf: The buffer containing information to be written + * @offset: The file offset to start writing at + * @size: 
The amount of data to write + * + * Returns: The number of bytes correctly written or error code + */ +static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, + u64 offset, unsigned int size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + u64 lblock, dblock; + u32 extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + bool new = false; + + if (!size) + return 0; + + if (gfs2_is_stuffed(ip) && offset + size <= gfs2_max_stuffed_size(ip)) + return gfs2_dir_write_stuffed(ip, buf, (unsigned int)offset, + size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip); + if (error) + return error; + } + + lblock = offset; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + extlen = 1; + error = gfs2_alloc_extent(&ip->i_inode, lblock, &dblock, + &extlen, &new); + if (error) + goto fail; + error = -EIO; + if (gfs2_assert_withdraw(sdp, dblock)) + goto fail; + } + + if (amount == sdp->sd_jbsize || new) + error = gfs2_dir_get_new_buffer(ip, dblock, &bh); + else + error = gfs2_dir_get_existing_buffer(ip, dblock, &bh); + + if (error) + goto fail; + + gfs2_trans_add_meta(ip->i_gl, bh); + memcpy(bh->b_data + o, buf, amount); + brelse(bh); + + buf += amount; + copied += amount; + lblock++; + dblock++; + extlen--; + + o = sizeof(struct gfs2_meta_header); + } + +out: + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + if (ip->i_inode.i_size < offset + copied) + i_size_write(&ip->i_inode, offset + copied); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + + return copied; +fail: + if (copied) + goto out; + return error; +} + +static int gfs2_dir_read_stuffed(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) +{ + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (!error) { + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); + brelse(dibh); + } + + return (error) ? 
error : size; +} + + +/** + * gfs2_dir_read_data - Read a data from a directory inode + * @ip: The GFS2 Inode + * @buf: The buffer to place result into + * @size: Amount of data to transfer + * + * Returns: The amount of data actually copied or the error + */ +static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, + unsigned int size) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u64 lblock, dblock; + u32 extlen = 0; + unsigned int o; + int copied = 0; + int error = 0; + + if (gfs2_is_stuffed(ip)) + return gfs2_dir_read_stuffed(ip, buf, size); + + if (gfs2_assert_warn(sdp, gfs2_is_jdata(ip))) + return -EINVAL; + + lblock = 0; + o = do_div(lblock, sdp->sd_jbsize) + sizeof(struct gfs2_meta_header); + + while (copied < size) { + unsigned int amount; + struct buffer_head *bh; + + amount = size - copied; + if (amount > sdp->sd_sb.sb_bsize - o) + amount = sdp->sd_sb.sb_bsize - o; + + if (!extlen) { + extlen = 32; + error = gfs2_get_extent(&ip->i_inode, lblock, + &dblock, &extlen); + if (error || !dblock) + goto fail; + BUG_ON(extlen < 1); + bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); + } else { + error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh); + if (error) + goto fail; + } + error = gfs2_metatype_check(sdp, bh, GFS2_METATYPE_JD); + if (error) { + brelse(bh); + goto fail; + } + dblock++; + extlen--; + memcpy(buf, bh->b_data + o, amount); + brelse(bh); + buf += (amount/sizeof(__be64)); + copied += amount; + lblock++; + o = sizeof(struct gfs2_meta_header); + } + + return copied; +fail: + return (copied) ? copied : error; +} + +/** + * gfs2_dir_get_hash_table - Get pointer to the dir hash table + * @ip: The inode in question + * + * Returns: The hash table or an error + */ + +static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + int ret; + u32 hsize; + __be64 *hc; + + BUG_ON(!(ip->i_diskflags & GFS2_DIF_EXHASH)); + + hc = ip->i_hash_cache; + if (hc) + return hc; + + hsize = BIT(ip->i_depth); + hsize *= sizeof(__be64); + if (hsize != i_size_read(&ip->i_inode)) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); + if (hc == NULL) + hc = __vmalloc(hsize, GFP_NOFS); + + if (hc == NULL) + return ERR_PTR(-ENOMEM); + + ret = gfs2_dir_read_data(ip, hc, hsize); + if (ret < 0) { + kvfree(hc); + return ERR_PTR(ret); + } + + spin_lock(&inode->i_lock); + if (likely(!ip->i_hash_cache)) { + ip->i_hash_cache = hc; + hc = NULL; + } + spin_unlock(&inode->i_lock); + kvfree(hc); + + return ip->i_hash_cache; +} + +/** + * gfs2_dir_hash_inval - Invalidate dir hash + * @ip: The directory inode + * + * Must be called with an exclusive glock, or during glock invalidation. 
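+ * (The i_lock only serializes the pointer swap with the publish in
+ * gfs2_dir_get_hash_table(); it is the exclusive glock that keeps
+ * lockless readers of i_hash_cache away while the table is freed.)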
+ */ +void gfs2_dir_hash_inval(struct gfs2_inode *ip) +{ + __be64 *hc; + + spin_lock(&ip->i_inode.i_lock); + hc = ip->i_hash_cache; + ip->i_hash_cache = NULL; + spin_unlock(&ip->i_inode.i_lock); + + kvfree(hc); +} + +static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) +{ + return dent->de_inum.no_addr == 0 || dent->de_inum.no_formal_ino == 0; +} + +static inline int __gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, int ret) +{ + if (!gfs2_dirent_sentinel(dent) && + be32_to_cpu(dent->de_hash) == name->hash && + be16_to_cpu(dent->de_name_len) == name->len && + memcmp(dent+1, name->name, name->len) == 0) + return ret; + return 0; +} + +static int gfs2_dirent_find(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 1); +} + +static int gfs2_dirent_prev(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + return __gfs2_dirent_find(dent, name, 2); +} + +/* + * name->name holds ptr to start of block. + * name->len holds size of block. + */ +static int gfs2_dirent_last(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + const char *start = name->name; + const char *end = (const char *)dent + be16_to_cpu(dent->de_rec_len); + if (name->len == (end - start)) + return 1; + return 0; +} + +/* Look for the dirent that contains the offset specified in data. Once we + * find that dirent, there must be space available there for the new dirent */ +static int gfs2_dirent_find_offset(const struct gfs2_dirent *dent, + const struct qstr *name, + void *ptr) +{ + unsigned required = GFS2_DIRENT_SIZE(name->len); + unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + unsigned totlen = be16_to_cpu(dent->de_rec_len); + + if (ptr < (void *)dent || ptr >= (void *)dent + totlen) + return 0; + if (gfs2_dirent_sentinel(dent)) + actual = 0; + if (ptr < (void *)dent + actual) + return -1; + if ((void *)dent + totlen >= ptr + required) + return 1; + return -1; +} + +static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + unsigned required = GFS2_DIRENT_SIZE(name->len); + unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + unsigned totlen = be16_to_cpu(dent->de_rec_len); + + if (gfs2_dirent_sentinel(dent)) + actual = 0; + if (totlen - actual >= required) + return 1; + return 0; +} + +struct dirent_gather { + const struct gfs2_dirent **pdent; + unsigned offset; +}; + +static int gfs2_dirent_gather(const struct gfs2_dirent *dent, + const struct qstr *name, + void *opaque) +{ + struct dirent_gather *g = opaque; + if (!gfs2_dirent_sentinel(dent)) { + g->pdent[g->offset++] = dent; + } + return 0; +} + +/* + * Other possible things to check: + * - Inode located within filesystem size (and on valid block) + * - Valid directory entry type + * Not sure how heavy-weight we want to make this... could also check + * hash is correct for example, but that would take a lot of extra time. + * For now the most important thing is to check that the various sizes + * are correct. 
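+ *
+ * For example, a dirent whose offset is not 8-byte aligned, or whose
+ * de_rec_len runs past the end of the block, is rejected below; a
+ * sentinel entry (zero inode number) is only legal as the first
+ * dirent in a block.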
+ */ +static int gfs2_check_dirent(struct gfs2_sbd *sdp, + struct gfs2_dirent *dent, unsigned int offset, + unsigned int size, unsigned int len, int first) +{ + const char *msg = "gfs2_dirent too small"; + if (unlikely(size < sizeof(struct gfs2_dirent))) + goto error; + msg = "gfs2_dirent misaligned"; + if (unlikely(offset & 0x7)) + goto error; + msg = "gfs2_dirent points beyond end of block"; + if (unlikely(offset + size > len)) + goto error; + msg = "zero inode number"; + if (unlikely(!first && gfs2_dirent_sentinel(dent))) + goto error; + msg = "name length is greater than space in dirent"; + if (!gfs2_dirent_sentinel(dent) && + unlikely(sizeof(struct gfs2_dirent)+be16_to_cpu(dent->de_name_len) > + size)) + goto error; + return 0; +error: + fs_warn(sdp, "%s: %s (%s)\n", + __func__, msg, first ? "first in block" : "not first in block"); + return -EIO; +} + +static int gfs2_dirent_offset(struct gfs2_sbd *sdp, const void *buf) +{ + const struct gfs2_meta_header *h = buf; + int offset; + + BUG_ON(buf == NULL); + + switch(be32_to_cpu(h->mh_type)) { + case GFS2_METATYPE_LF: + offset = sizeof(struct gfs2_leaf); + break; + case GFS2_METATYPE_DI: + offset = sizeof(struct gfs2_dinode); + break; + default: + goto wrong_type; + } + return offset; +wrong_type: + fs_warn(sdp, "%s: wrong block type %u\n", __func__, + be32_to_cpu(h->mh_type)); + return -1; +} + +static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf, + unsigned int len, gfs2_dscan_t scan, + const struct qstr *name, + void *opaque) +{ + struct gfs2_dirent *dent, *prev; + unsigned offset; + unsigned size; + int ret = 0; + + ret = gfs2_dirent_offset(GFS2_SB(inode), buf); + if (ret < 0) + goto consist_inode; + + offset = ret; + prev = NULL; + dent = buf + offset; + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(GFS2_SB(inode), dent, offset, size, len, 1)) + goto consist_inode; + do { + ret = scan(dent, name, opaque); + if (ret) + break; + offset += size; + if (offset == len) + break; + prev = dent; + dent = buf + offset; + size = be16_to_cpu(dent->de_rec_len); + if (gfs2_check_dirent(GFS2_SB(inode), dent, offset, size, + len, 0)) + goto consist_inode; + } while(1); + + switch(ret) { + case 0: + return NULL; + case 1: + return dent; + case 2: + return prev ? 
prev : dent; + default: + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + +consist_inode: + gfs2_consist_inode(GFS2_I(inode)); + return ERR_PTR(-EIO); +} + +static int dirent_check_reclen(struct gfs2_inode *dip, + const struct gfs2_dirent *d, const void *end_p) +{ + const void *ptr = d; + u16 rec_len = be16_to_cpu(d->de_rec_len); + + if (unlikely(rec_len < sizeof(struct gfs2_dirent))) + goto broken; + ptr += rec_len; + if (ptr < end_p) + return rec_len; + if (ptr == end_p) + return -ENOENT; +broken: + gfs2_consist_inode(dip); + return -EIO; +} + +/** + * dirent_next - Next dirent + * @dip: the directory + * @bh: The buffer + * @dent: Pointer to list of dirents + * + * Returns: 0 on success, error code otherwise + */ + +static int dirent_next(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent **dent) +{ + struct gfs2_dirent *cur = *dent, *tmp; + char *bh_end = bh->b_data + bh->b_size; + int ret; + + ret = dirent_check_reclen(dip, cur, bh_end); + if (ret < 0) + return ret; + + tmp = (void *)cur + ret; + ret = dirent_check_reclen(dip, tmp, bh_end); + if (ret == -EIO) + return ret; + + /* Only the first dent could ever have de_inum.no_addr == 0 */ + if (gfs2_dirent_sentinel(tmp)) { + gfs2_consist_inode(dip); + return -EIO; + } + + *dent = tmp; + return 0; +} + +/** + * dirent_del - Delete a dirent + * @dip: The GFS2 inode + * @bh: The buffer + * @prev: The previous dirent + * @cur: The current dirent + * + */ + +static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, + struct gfs2_dirent *prev, struct gfs2_dirent *cur) +{ + u16 cur_rec_len, prev_rec_len; + + if (gfs2_dirent_sentinel(cur)) { + gfs2_consist_inode(dip); + return; + } + + gfs2_trans_add_meta(dip->i_gl, bh); + + /* If there is no prev entry, this is the first entry in the block. + The de_rec_len is already as big as it needs to be. Just zero + out the inode number and return. */ + + if (!prev) { + cur->de_inum.no_addr = 0; + cur->de_inum.no_formal_ino = 0; + return; + } + + /* Combine this dentry with the previous one. */ + + prev_rec_len = be16_to_cpu(prev->de_rec_len); + cur_rec_len = be16_to_cpu(cur->de_rec_len); + + if ((char *)prev + prev_rec_len != (char *)cur) + gfs2_consist_inode(dip); + if ((char *)cur + cur_rec_len > bh->b_data + bh->b_size) + gfs2_consist_inode(dip); + + prev_rec_len += cur_rec_len; + prev->de_rec_len = cpu_to_be16(prev_rec_len); +} + + +static struct gfs2_dirent *do_init_dirent(struct inode *inode, + struct gfs2_dirent *dent, + const struct qstr *name, + struct buffer_head *bh, + unsigned offset) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_dirent *ndent; + unsigned totlen; + + totlen = be16_to_cpu(dent->de_rec_len); + BUG_ON(offset + name->len > totlen); + gfs2_trans_add_meta(ip->i_gl, bh); + ndent = (struct gfs2_dirent *)((char *)dent + offset); + dent->de_rec_len = cpu_to_be16(offset); + gfs2_qstr2dirent(name, totlen - offset, ndent); + return ndent; +} + + +/* + * Takes a dent from which to grab space as an argument. Returns the + * newly created dent. 
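+ *
+ * For example, when carving space out of a used entry whose name is
+ * 3 bytes long, the new dirent starts at offset GFS2_DIRENT_SIZE(3)
+ * inside the old record, and do_init_dirent() shrinks the old
+ * entry's de_rec_len to exactly that offset.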
+ */ +static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, + struct gfs2_dirent *dent, + const struct qstr *name, + struct buffer_head *bh) +{ + unsigned offset = 0; + + if (!gfs2_dirent_sentinel(dent)) + offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + return do_init_dirent(inode, dent, name, bh, offset); +} + +static struct gfs2_dirent *gfs2_dirent_split_alloc(struct inode *inode, + struct buffer_head *bh, + const struct qstr *name, + void *ptr) +{ + struct gfs2_dirent *dent; + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_find_offset, name, ptr); + if (IS_ERR_OR_NULL(dent)) + return dent; + return do_init_dirent(inode, dent, name, bh, + (unsigned)(ptr - (void *)dent)); +} + +static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, + struct buffer_head **bhp) +{ + int error; + + error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp); + if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { + /* pr_info("block num=%llu\n", leaf_no); */ + error = -EIO; + } + + return error; +} + +/** + * get_leaf_nr - Get a leaf number associated with the index + * @dip: The GFS2 inode + * @index: hash table index of the targeted leaf + * @leaf_out: Resulting leaf block number + * + * Returns: 0 on success, error code otherwise + */ + +static int get_leaf_nr(struct gfs2_inode *dip, u32 index, u64 *leaf_out) +{ + __be64 *hash; + int error; + + hash = gfs2_dir_get_hash_table(dip); + error = PTR_ERR_OR_ZERO(hash); + + if (!error) + *leaf_out = be64_to_cpu(*(hash + index)); + + return error; +} + +static int get_first_leaf(struct gfs2_inode *dip, u32 index, + struct buffer_head **bh_out) +{ + u64 leaf_no; + int error; + + error = get_leaf_nr(dip, index, &leaf_no); + if (!error) + error = get_leaf(dip, leaf_no, bh_out); + + return error; +} + +static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, + const struct qstr *name, + gfs2_dscan_t scan, + struct buffer_head **pbh) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + if (ip->i_diskflags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf; + unsigned int hsize = BIT(ip->i_depth); + unsigned int index; + u64 ln; + if (hsize * sizeof(u64) != i_size_read(inode)) { + gfs2_consist_inode(ip); + return ERR_PTR(-EIO); + } + + index = name->hash >> (32 - ip->i_depth); + error = get_first_leaf(ip, index, &bh); + if (error) + return ERR_PTR(error); + do { + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + scan, name, NULL); + if (dent) + goto got_dent; + leaf = (struct gfs2_leaf *)bh->b_data; + ln = be64_to_cpu(leaf->lf_next); + brelse(bh); + if (!ln) + break; + + error = get_leaf(ip, ln, &bh); + } while(!error); + + return error ? 
ERR_PTR(error) : NULL; + } + + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return ERR_PTR(error); + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, scan, name, NULL); +got_dent: + if (IS_ERR_OR_NULL(dent)) { + brelse(bh); + bh = NULL; + } + *pbh = bh; + return dent; +} + +static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int n = 1; + u64 bn; + int error; + struct buffer_head *bh; + struct gfs2_leaf *leaf; + struct gfs2_dirent *dent; + struct timespec64 tv = current_time(inode); + + error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL); + if (error) + return NULL; + bh = gfs2_meta_new(ip->i_gl, bn); + if (!bh) + return NULL; + + gfs2_trans_remove_revoke(GFS2_SB(inode), bn, 1); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); + leaf = (struct gfs2_leaf *)bh->b_data; + leaf->lf_depth = cpu_to_be16(depth); + leaf->lf_entries = 0; + leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE); + leaf->lf_next = 0; + leaf->lf_inode = cpu_to_be64(ip->i_no_addr); + leaf->lf_dist = cpu_to_be32(1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2)); + dent = (struct gfs2_dirent *)(leaf+1); + gfs2_qstr2dirent(&empty_name, bh->b_size - sizeof(struct gfs2_leaf), dent); + *pbh = bh; + return leaf; +} + +/** + * dir_make_exhash - Convert a stuffed directory into an ExHash directory + * @inode: The directory inode to be converted to exhash + * + * Returns: 0 on success, error code otherwise + */ + +static int dir_make_exhash(struct inode *inode) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_dirent *dent; + struct qstr args; + struct buffer_head *bh, *dibh; + struct gfs2_leaf *leaf; + int y; + u32 x; + __be64 *lp; + u64 bn; + int error; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + /* Turn over a new leaf */ + + leaf = new_leaf(inode, &bh, 0); + if (!leaf) + return -ENOSPC; + bn = bh->b_blocknr; + + gfs2_assert(sdp, dip->i_entries < BIT(16)); + leaf->lf_entries = cpu_to_be16(dip->i_entries); + + /* Copy dirents */ + + gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_leaf), dibh, + sizeof(struct gfs2_dinode)); + + /* Find last entry */ + + x = 0; + args.len = bh->b_size - sizeof(struct gfs2_dinode) + + sizeof(struct gfs2_leaf); + args.name = bh->b_data; + dent = gfs2_dirent_scan(&dip->i_inode, bh->b_data, bh->b_size, + gfs2_dirent_last, &args, NULL); + if (!dent) { + brelse(bh); + brelse(dibh); + return -EIO; + } + if (IS_ERR(dent)) { + brelse(bh); + brelse(dibh); + return PTR_ERR(dent); + } + + /* Adjust the last dirent's record length + (Remember that dent still points to the last entry.) */ + + dent->de_rec_len = cpu_to_be16(be16_to_cpu(dent->de_rec_len) + + sizeof(struct gfs2_dinode) - + sizeof(struct gfs2_leaf)); + + brelse(bh); + + /* We're done with the new leaf block, now setup the new + hash table. 
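+ Each of the sdp->sd_hash_ptrs slots written below starts out pointing at + this one leaf, so every hash value resolves to it until the first split. + (Illustrative figures: with a 4KiB block the half-block table holds 256 + pointers and the for loop below computes an initial i_depth of 8.)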
*/ + + gfs2_trans_add_meta(dip->i_gl, dibh); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); + + for (x = sdp->sd_hash_ptrs; x--; lp++) + *lp = cpu_to_be64(bn); + + i_size_write(inode, sdp->sd_sb.sb_bsize / 2); + gfs2_add_inode_blocks(&dip->i_inode, 1); + dip->i_diskflags |= GFS2_DIF_EXHASH; + + for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; + dip->i_depth = y; + + gfs2_dinode_out(dip, dibh->b_data); + + brelse(dibh); + + return 0; +} + +/** + * dir_split_leaf - Split a leaf block into two + * @inode: The directory inode to be split + * @name: name of the dirent we're trying to insert + * + * Returns: 0 on success, error code on failure + */ + +static int dir_split_leaf(struct inode *inode, const struct qstr *name) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct buffer_head *nbh, *obh, *dibh; + struct gfs2_leaf *nleaf, *oleaf; + struct gfs2_dirent *dent = NULL, *prev = NULL, *next = NULL, *new; + u32 start, len, half_len, divider; + u64 bn, leaf_no; + __be64 *lp; + u32 index; + int x; + int error; + + index = name->hash >> (32 - dip->i_depth); + error = get_leaf_nr(dip, index, &leaf_no); + if (error) + return error; + + /* Get the old leaf block */ + error = get_leaf(dip, leaf_no, &obh); + if (error) + return error; + + oleaf = (struct gfs2_leaf *)obh->b_data; + if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) { + brelse(obh); + return 1; /* can't split */ + } + + gfs2_trans_add_meta(dip->i_gl, obh); + + nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); + if (!nleaf) { + brelse(obh); + return -ENOSPC; + } + bn = nbh->b_blocknr; + + /* Compute the start and len of leaf pointers in the hash table. */ + len = BIT(dip->i_depth - be16_to_cpu(oleaf->lf_depth)); + half_len = len >> 1; + if (!half_len) { + fs_warn(GFS2_SB(inode), "i_depth %u lf_depth %u index %u\n", + dip->i_depth, be16_to_cpu(oleaf->lf_depth), index); + gfs2_consist_inode(dip); + error = -EIO; + goto fail_brelse; + } + + start = (index & ~(len - 1)); + + /* Change the pointers. + Don't bother distinguishing stuffed from non-stuffed. + This code is complicated enough already. 
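+ (Illustrative figures: with i_depth == 10 and lf_depth == 7 the old leaf + is covered by len == 8 hash slots; the lower half_len == 4 of them are + repointed at the new leaf, and entries hashing below the divider move + across.)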
*/ + lp = kmalloc_array(half_len, sizeof(__be64), GFP_NOFS); + if (!lp) { + error = -ENOMEM; + goto fail_brelse; + } + + /* Change the pointers */ + for (x = 0; x < half_len; x++) + lp[x] = cpu_to_be64(bn); + + gfs2_dir_hash_inval(dip); + + error = gfs2_dir_write_data(dip, (char *)lp, start * sizeof(u64), + half_len * sizeof(u64)); + if (error != half_len * sizeof(u64)) { + if (error >= 0) + error = -EIO; + goto fail_lpfree; + } + + kfree(lp); + + /* Compute the divider */ + divider = (start + half_len) << (32 - dip->i_depth); + + /* Copy the entries */ + dent = (struct gfs2_dirent *)(obh->b_data + sizeof(struct gfs2_leaf)); + + do { + next = dent; + if (dirent_next(dip, obh, &next)) + next = NULL; + + if (!gfs2_dirent_sentinel(dent) && + be32_to_cpu(dent->de_hash) < divider) { + struct qstr str; + void *ptr = ((char *)dent - obh->b_data) + nbh->b_data; + str.name = (char*)(dent+1); + str.len = be16_to_cpu(dent->de_name_len); + str.hash = be32_to_cpu(dent->de_hash); + new = gfs2_dirent_split_alloc(inode, nbh, &str, ptr); + if (IS_ERR(new)) { + error = PTR_ERR(new); + break; + } + + new->de_inum = dent->de_inum; /* No endian worries */ + new->de_type = dent->de_type; /* No endian worries */ + be16_add_cpu(&nleaf->lf_entries, 1); + + dirent_del(dip, obh, prev, dent); + + if (!oleaf->lf_entries) + gfs2_consist_inode(dip); + be16_add_cpu(&oleaf->lf_entries, -1); + + if (!prev) + prev = dent; + } else { + prev = dent; + } + dent = next; + } while (dent); + + oleaf->lf_depth = nleaf->lf_depth; + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { + gfs2_trans_add_meta(dip->i_gl, dibh); + gfs2_add_inode_blocks(&dip->i_inode, 1); + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + } + + brelse(obh); + brelse(nbh); + + return error; + +fail_lpfree: + kfree(lp); + +fail_brelse: + brelse(obh); + brelse(nbh); + return error; +} + +/** + * dir_double_exhash - Double size of ExHash table + * @dip: The GFS2 dinode + * + * Returns: 0 on success, error code on failure + */ + +static int dir_double_exhash(struct gfs2_inode *dip) +{ + struct buffer_head *dibh; + u32 hsize; + u32 hsize_bytes; + __be64 *hc; + __be64 *hc2, *h; + int x; + int error = 0; + + hsize = BIT(dip->i_depth); + hsize_bytes = hsize * sizeof(__be64); + + hc = gfs2_dir_get_hash_table(dip); + if (IS_ERR(hc)) + return PTR_ERR(hc); + + hc2 = kmalloc_array(hsize_bytes, 2, GFP_NOFS | __GFP_NOWARN); + if (hc2 == NULL) + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS); + + if (!hc2) + return -ENOMEM; + + h = hc2; + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_kfree; + + for (x = 0; x < hsize; x++) { + *h++ = *hc; + *h++ = *hc; + hc++; + } + + error = gfs2_dir_write_data(dip, (char *)hc2, 0, hsize_bytes * 2); + if (error != (hsize_bytes * 2)) + goto fail; + + gfs2_dir_hash_inval(dip); + dip->i_hash_cache = hc2; + dip->i_depth++; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + return 0; + +fail: + /* Replace original hash table & size */ + gfs2_dir_write_data(dip, (char *)hc, 0, hsize_bytes); + i_size_write(&dip->i_inode, hsize_bytes); + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); +out_kfree: + kvfree(hc2); + return error; +} + +/** + * compare_dents - compare directory entries by hash value + * @a: first dent + * @b: second dent + * + * When comparing the hash entries of @a to @b: + * gt: returns 1 + * lt: returns -1 + * eq: returns 0 + */ + +static int compare_dents(const void *a, const void *b) +{ + const struct gfs2_dirent *dent_a, *dent_b; + u32 
hash_a, hash_b; + int ret = 0; + + dent_a = *(const struct gfs2_dirent **)a; + hash_a = dent_a->de_cookie; + + dent_b = *(const struct gfs2_dirent **)b; + hash_b = dent_b->de_cookie; + + if (hash_a > hash_b) + ret = 1; + else if (hash_a < hash_b) + ret = -1; + else { + unsigned int len_a = be16_to_cpu(dent_a->de_name_len); + unsigned int len_b = be16_to_cpu(dent_b->de_name_len); + + if (len_a > len_b) + ret = 1; + else if (len_a < len_b) + ret = -1; + else + ret = memcmp(dent_a + 1, dent_b + 1, len_a); + } + + return ret; +} + +/** + * do_filldir_main - read out directory entries + * @dip: The GFS2 inode + * @ctx: what to feed the entries to + * @darr: an array of struct gfs2_dirent pointers to read + * @entries: the number of entries in darr + * @sort_start: index of the directory array to start our sort + * @copied: pointer to int that's non-zero if an entry has been copied out + * + * Jump through some hoops to make sure that if there are hash collisions, + * they are read out at the beginning of a buffer. We want to minimize + * the possibility that they will fall into different readdir buffers or + * that someone will want to seek to that location. + * + * Returns: errno, >0 if the actor tells you to stop + */ + +static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, + struct gfs2_dirent **darr, u32 entries, + u32 sort_start, int *copied) +{ + const struct gfs2_dirent *dent, *dent_next; + u64 off, off_next; + unsigned int x, y; + int run = 0; + + if (sort_start < entries) + sort(&darr[sort_start], entries - sort_start, + sizeof(struct gfs2_dirent *), compare_dents, NULL); + + dent_next = darr[0]; + off_next = dent_next->de_cookie; + + for (x = 0, y = 1; x < entries; x++, y++) { + dent = dent_next; + off = off_next; + + if (y < entries) { + dent_next = darr[y]; + off_next = dent_next->de_cookie; + + if (off < ctx->pos) + continue; + ctx->pos = off; + + if (off_next == off) { + if (*copied && !run) + return 1; + run = 1; + } else + run = 0; + } else { + if (off < ctx->pos) + continue; + ctx->pos = off; + } + + if (!dir_emit(ctx, (const char *)(dent + 1), + be16_to_cpu(dent->de_name_len), + be64_to_cpu(dent->de_inum.no_addr), + be16_to_cpu(dent->de_type))) + return 1; + + *copied = 1; + } + + /* Increment ctx->pos by one, so the next time we come into + do_filldir_main() we get the next entry instead of the last one in the + current leaf */ + + ctx->pos++; + + return 0; +} + +static void *gfs2_alloc_sort_buffer(unsigned size) +{ + void *ptr = NULL; + + if (size < KMALLOC_MAX_SIZE) + ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); + if (!ptr) + ptr = __vmalloc(size, GFP_NOFS); + return ptr; +} + + +static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh, + unsigned leaf_nr, struct gfs2_dirent **darr, + unsigned entries) +{ + int sort_id = -1; + int i; + + for (i = 0; i < entries; i++) { + unsigned offset; + + darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash); + darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie); + + if (!sdp->sd_args.ar_loccookie) + continue; + offset = (char *)(darr[i]) - + (bh->b_data + gfs2_dirent_offset(sdp, bh->b_data)); + offset /= GFS2_MIN_DIRENT_SIZE; + offset += leaf_nr * sdp->sd_max_dents_per_leaf; + if (offset >= GFS2_USE_HASH_FLAG || + leaf_nr >= GFS2_USE_HASH_FLAG) { + darr[i]->de_cookie |= GFS2_USE_HASH_FLAG; + if (sort_id < 0) + sort_id = i; + continue; + } + darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK; + darr[i]->de_cookie |= offset; + } + return sort_id; +} + + +static int gfs2_dir_read_leaf(struct inode 
*inode, struct dir_context *ctx, + int *copied, unsigned *depth, + u64 leaf_no) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_leaf *lf; + unsigned entries = 0, entries2 = 0; + unsigned leaves = 0, leaf = 0, offset, sort_offset; + struct gfs2_dirent **darr, *dent; + struct dirent_gather g; + struct buffer_head **larr; + int error, i, need_sort = 0, sort_id; + u64 lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out; + lf = (struct gfs2_leaf *)bh->b_data; + if (leaves == 0) + *depth = be16_to_cpu(lf->lf_depth); + entries += be16_to_cpu(lf->lf_entries); + leaves++; + lfn = be64_to_cpu(lf->lf_next); + brelse(bh); + } while(lfn); + + if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) { + need_sort = 1; + sort_offset = 0; + } + + if (!entries) + return 0; + + error = -ENOMEM; + /* + * The extra 99 entries are not normally used, but are a buffer + * zone in case the number of entries in the leaf is corrupt. + * 99 is the maximum number of entries that can fit in a single + * leaf block. + */ + larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); + if (!larr) + goto out; + darr = (struct gfs2_dirent **)(larr + leaves); + g.pdent = (const struct gfs2_dirent **)darr; + g.offset = 0; + lfn = leaf_no; + + do { + error = get_leaf(ip, lfn, &bh); + if (error) + goto out_free; + lf = (struct gfs2_leaf *)bh->b_data; + lfn = be64_to_cpu(lf->lf_next); + if (lf->lf_entries) { + offset = g.offset; + entries2 += be16_to_cpu(lf->lf_entries); + dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, + gfs2_dirent_gather, NULL, &g); + error = PTR_ERR(dent); + if (IS_ERR(dent)) + goto out_free; + if (entries2 != g.offset) { + fs_warn(sdp, "Number of entries corrupt in dir " + "leaf %llu, entries2 (%u) != " + "g.offset (%u)\n", + (unsigned long long)bh->b_blocknr, + entries2, g.offset); + gfs2_consist_inode(ip); + error = -EIO; + goto out_free; + } + error = 0; + sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset], + be16_to_cpu(lf->lf_entries)); + if (!need_sort && sort_id >= 0) { + need_sort = 1; + sort_offset = offset + sort_id; + } + larr[leaf++] = bh; + } else { + larr[leaf++] = NULL; + brelse(bh); + } + } while(lfn); + + BUG_ON(entries2 != entries); + error = do_filldir_main(ip, ctx, darr, entries, need_sort ? + sort_offset : entries, copied); +out_free: + for(i = 0; i < leaf; i++) + brelse(larr[i]); + kvfree(larr); +out: + return error; +} + +/** + * gfs2_dir_readahead - Issue read-ahead requests for leaf blocks. + * @inode: the directory inode + * @hsize: hash table size + * @index: index into the hash table + * @f_ra: read-ahead parameters + * + * Note: we can't calculate each index like dir_e_read can because we don't + * have the leaf, and therefore we don't have the depth, and therefore we + * don't have the length. So we have to just read enough ahead to make up + * for the loss of information. + */ +static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, + struct file_ra_state *f_ra) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head *bh; + u64 blocknr = 0, last; + unsigned count; + + /* First check if we've already read-ahead for the whole range. 
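+ Note that f_ra->start is reused here as the next hash-table slot to read + ahead, not as a page index.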
*/ + if (index + MAX_RA_BLOCKS < f_ra->start) + return; + + f_ra->start = max((pgoff_t)index, f_ra->start); + for (count = 0; count < MAX_RA_BLOCKS; count++) { + if (f_ra->start >= hsize) /* if exceeded the hash table */ + break; + + last = blocknr; + blocknr = be64_to_cpu(ip->i_hash_cache[f_ra->start]); + f_ra->start++; + if (blocknr == last) + continue; + + bh = gfs2_getbuf(gl, blocknr, 1); + if (trylock_buffer(bh)) { + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + continue; + } + bh->b_end_io = end_buffer_read_sync; + submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META | + REQ_PRIO, bh); + continue; + } + brelse(bh); + } +} + +/** + * dir_e_read - Reads the entries from a directory into a filldir buffer + * @inode: the directory inode + * @ctx: actor to feed the entries to + * @f_ra: read-ahead parameters + * + * Returns: errno + */ + +static int dir_e_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra) +{ + struct gfs2_inode *dip = GFS2_I(inode); + u32 hsize, len = 0; + u32 hash, index; + __be64 *lp; + int copied = 0; + int error = 0; + unsigned depth = 0; + + hsize = BIT(dip->i_depth); + hash = gfs2_dir_offset2hash(ctx->pos); + index = hash >> (32 - dip->i_depth); + + if (dip->i_hash_cache == NULL) + f_ra->start = 0; + lp = gfs2_dir_get_hash_table(dip); + if (IS_ERR(lp)) + return PTR_ERR(lp); + + gfs2_dir_readahead(inode, hsize, index, f_ra); + + while (index < hsize) { + error = gfs2_dir_read_leaf(inode, ctx, + &copied, &depth, + be64_to_cpu(lp[index])); + if (error) + break; + + len = BIT(dip->i_depth - depth); + index = (index & ~(len - 1)) + len; + } + + if (error > 0) + error = 0; + return error; +} + +int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra) +{ + struct gfs2_inode *dip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct dirent_gather g; + struct gfs2_dirent **darr, *dent; + struct buffer_head *dibh; + int copied = 0; + int error; + + if (!dip->i_entries) + return 0; + + if (dip->i_diskflags & GFS2_DIF_EXHASH) + return dir_e_read(inode, ctx, f_ra); + + if (!gfs2_is_stuffed(dip)) { + gfs2_consist_inode(dip); + return -EIO; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + return error; + + error = -ENOMEM; + /* 96 is max number of dirents which can be stuffed into an inode */ + darr = kmalloc_array(96, sizeof(struct gfs2_dirent *), GFP_NOFS); + if (darr) { + g.pdent = (const struct gfs2_dirent **)darr; + g.offset = 0; + dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size, + gfs2_dirent_gather, NULL, &g); + if (IS_ERR(dent)) { + error = PTR_ERR(dent); + goto out; + } + if (dip->i_entries != g.offset) { + fs_warn(sdp, "Number of entries corrupt in dir %llu, " + "ip->i_entries (%u) != g.offset (%u)\n", + (unsigned long long)dip->i_no_addr, + dip->i_entries, + g.offset); + gfs2_consist_inode(dip); + error = -EIO; + goto out; + } + gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries); + error = do_filldir_main(dip, ctx, darr, + dip->i_entries, 0, &copied); +out: + kfree(darr); + } + + if (error > 0) + error = 0; + + brelse(dibh); + + return error; +} + +/** + * gfs2_dir_search - Search a directory + * @dir: The GFS2 directory inode + * @name: The name we are looking up + * @fail_on_exist: Fail if the name exists rather than looking it up + * + * This routine searches a directory for a file or another directory. + * Assumes a glock is held on dip. 
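+ * A typical caller (lookup, for example) already holds the directory glock + * and passes a qstr whose hash was computed with gfs2_disk_hash(); see + * gfs2_str2qstr() in dir.h.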
+ * + * Returns: errno + */ + +struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, + bool fail_on_exist) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + u64 addr, formal_ino; + u16 dtype; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + struct inode *inode; + u16 rahead; + + if (IS_ERR(dent)) + return ERR_CAST(dent); + dtype = be16_to_cpu(dent->de_type); + rahead = be16_to_cpu(dent->de_rahead); + addr = be64_to_cpu(dent->de_inum.no_addr); + formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino); + brelse(bh); + if (fail_on_exist) + return ERR_PTR(-EEXIST); + inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, + GFS2_BLKST_FREE /* ignore */); + if (!IS_ERR(inode)) + GFS2_I(inode)->i_rahead = rahead; + return inode; + } + return ERR_PTR(-ENOENT); +} + +int gfs2_dir_check(struct inode *dir, const struct qstr *name, + const struct gfs2_inode *ip) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + int ret = -ENOENT; + + dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + if (ip) { + if (be64_to_cpu(dent->de_inum.no_addr) != ip->i_no_addr) + goto out; + if (be64_to_cpu(dent->de_inum.no_formal_ino) != + ip->i_no_formal_ino) + goto out; + if (unlikely(IF2DT(ip->i_inode.i_mode) != + be16_to_cpu(dent->de_type))) { + gfs2_consist_inode(GFS2_I(dir)); + ret = -EIO; + goto out; + } + } + ret = 0; +out: + brelse(bh); + } + return ret; +} + +/** + * dir_new_leaf - Add a new leaf onto hash chain + * @inode: The directory + * @name: The name we are adding + * + * This adds a new dir leaf onto an existing leaf when there is not + * enough space to add a new dir entry. This is a last resort after + * we've expanded the hash table to max size and also split existing + * leaf blocks, so it will only occur for very large directories. + * + * The dist parameter is set to 1 for leaf blocks directly attached + * to the hash table, 2 for one layer of indirection, 3 for two layers + * etc. We are thus able to tell the difference between an old leaf + * with dist set to zero (i.e. "don't know") and a new one where we + * set this information for debug/fsck purposes. 
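+ * (Worked example: appending to a chain that currently holds only the + * leaf directly attached to the hash table gives the new block + * lf_dist == 2, as computed by the dist++ walk below.)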
+ * + * Returns: 0 on success, or -ve on error + */ + +static int dir_new_leaf(struct inode *inode, const struct qstr *name) +{ + struct buffer_head *bh, *obh; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_leaf *leaf, *oleaf; + u32 dist = 1; + int error; + u32 index; + u64 bn; + + index = name->hash >> (32 - ip->i_depth); + error = get_first_leaf(ip, index, &obh); + if (error) + return error; + do { + dist++; + oleaf = (struct gfs2_leaf *)obh->b_data; + bn = be64_to_cpu(oleaf->lf_next); + if (!bn) + break; + brelse(obh); + error = get_leaf(ip, bn, &obh); + if (error) + return error; + } while(1); + + gfs2_trans_add_meta(ip->i_gl, obh); + + leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); + if (!leaf) { + brelse(obh); + return -ENOSPC; + } + leaf->lf_dist = cpu_to_be32(dist); + oleaf->lf_next = cpu_to_be64(bh->b_blocknr); + brelse(bh); + brelse(obh); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + return error; + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_add_inode_blocks(&ip->i_inode, 1); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + return 0; +} + +static u16 gfs2_inode_ra_len(const struct gfs2_inode *ip) +{ + u64 where = ip->i_no_addr + 1; + if (ip->i_eattr == where) + return 1; + return 0; +} + +/** + * gfs2_dir_add - Add new filename into directory + * @inode: The directory inode + * @name: The new name + * @nip: The GFS2 inode to be linked in to the directory + * @da: The directory addition info + * + * If the call to gfs2_diradd_alloc_required resulted in there being + * no need to allocate any new directory blocks, then it will contain + * a pointer to the directory entry and the bh in which it resides. We + * can use that without having to repeat the search. If there was no + * free space, then we must now create more space. 
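+ * The retry loop below escalates in order: convert a stuffed directory to + * exhash (dir_make_exhash), split the overfull leaf (dir_split_leaf), + * double the hash table and retry the split (dir_double_exhash), and + * finally chain on a new leaf block (dir_new_leaf).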
+ * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_add(struct inode *inode, const struct qstr *name, + const struct gfs2_inode *nip, struct gfs2_diradd *da) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct buffer_head *bh = da->bh; + struct gfs2_dirent *dent = da->dent; + struct timespec64 tv; + struct gfs2_leaf *leaf; + int error; + + while(1) { + if (da->bh == NULL) { + dent = gfs2_dirent_search(inode, name, + gfs2_dirent_find_space, &bh); + } + if (dent) { + if (IS_ERR(dent)) + return PTR_ERR(dent); + dent = gfs2_init_dirent(inode, dent, name, bh); + gfs2_inum_out(nip, dent); + dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); + dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip)); + tv = inode_set_ctime_current(&ip->i_inode); + if (ip->i_diskflags & GFS2_DIF_EXHASH) { + leaf = (struct gfs2_leaf *)bh->b_data; + be16_add_cpu(&leaf->lf_entries, 1); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + } + da->dent = NULL; + da->bh = NULL; + brelse(bh); + ip->i_entries++; + ip->i_inode.i_mtime = tv; + if (S_ISDIR(nip->i_inode.i_mode)) + inc_nlink(&ip->i_inode); + mark_inode_dirty(inode); + error = 0; + break; + } + if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = dir_make_exhash(inode); + if (error) + break; + continue; + } + error = dir_split_leaf(inode, name); + if (error == 0) + continue; + if (error < 0) + break; + if (ip->i_depth < GFS2_DIR_MAX_DEPTH) { + error = dir_double_exhash(ip); + if (error) + break; + error = dir_split_leaf(inode, name); + if (error < 0) + break; + if (error == 0) + continue; + } + error = dir_new_leaf(inode, name); + if (!error) + continue; + error = -ENOSPC; + break; + } + return error; +} + + +/** + * gfs2_dir_del - Delete a directory entry + * @dip: The GFS2 inode + * @dentry: The directory entry we want to delete + * + * Returns: 0 on success, error code on failure + */ + +int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) +{ + const struct qstr *name = &dentry->d_name; + struct gfs2_dirent *dent, *prev = NULL; + struct buffer_head *bh; + struct timespec64 tv; + + /* Returns _either_ the entry (if it's first in block) or the + previous entry otherwise */ + dent = gfs2_dirent_search(&dip->i_inode, name, gfs2_dirent_prev, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) { + gfs2_consist_inode(dip); + return PTR_ERR(dent); + } + /* If not first in block, adjust pointers accordingly */ + if (gfs2_dirent_find(dent, name, NULL) == 0) { + prev = dent; + dent = (struct gfs2_dirent *)((char *)dent + be16_to_cpu(prev->de_rec_len)); + } + + dirent_del(dip, bh, prev, dent); + tv = inode_set_ctime_current(&dip->i_inode); + if (dip->i_diskflags & GFS2_DIF_EXHASH) { + struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; + u16 entries = be16_to_cpu(leaf->lf_entries); + if (!entries) + gfs2_consist_inode(dip); + leaf->lf_entries = cpu_to_be16(--entries); + leaf->lf_nsec = cpu_to_be32(tv.tv_nsec); + leaf->lf_sec = cpu_to_be64(tv.tv_sec); + } + brelse(bh); + + if (!dip->i_entries) + gfs2_consist_inode(dip); + dip->i_entries--; + dip->i_inode.i_mtime = tv; + if (d_is_dir(dentry)) + drop_nlink(&dip->i_inode); + mark_inode_dirty(&dip->i_inode); + + return 0; +} + +/** + * gfs2_dir_mvino - Change inode number of directory entry + * @dip: The GFS2 directory inode + * @filename: the filename to be moved + * @nip: the new GFS2 inode + * @new_type: the de_type of the new dirent + * + * This routine changes the inode number of a directory entry. 
It's used + * by rename to change ".." when a directory is moved. + * Assumes a glock is held on dvp. + * + * Returns: errno + */ + +int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type) +{ + struct buffer_head *bh; + struct gfs2_dirent *dent; + + dent = gfs2_dirent_search(&dip->i_inode, filename, gfs2_dirent_find, &bh); + if (!dent) { + gfs2_consist_inode(dip); + return -EIO; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + + gfs2_trans_add_meta(dip->i_gl, bh); + gfs2_inum_out(nip, dent); + dent->de_type = cpu_to_be16(new_type); + brelse(bh); + + dip->i_inode.i_mtime = inode_set_ctime_current(&dip->i_inode); + mark_inode_dirty_sync(&dip->i_inode); + return 0; +} + +/** + * leaf_dealloc - Deallocate a directory leaf + * @dip: the directory + * @index: the hash table offset in the directory + * @len: the number of pointers to this leaf + * @leaf_no: the leaf number + * @leaf_bh: buffer_head for the starting leaf + * @last_dealloc: 1 if this is the final dealloc for the leaf, else 0 + * + * Returns: errno + */ + +static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, + u64 leaf_no, struct buffer_head *leaf_bh, + int last_dealloc) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_leaf *tmp_leaf; + struct gfs2_rgrp_list rlist; + struct buffer_head *bh, *dibh; + u64 blk, nblk; + unsigned int rg_blocks = 0, l_blocks = 0; + char *ht; + unsigned int x, size = len * sizeof(u64); + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); + + ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); + if (ht == NULL) + ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO); + if (!ht) + return -ENOMEM; + + error = gfs2_quota_hold(dip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (error) + goto out; + + /* Count the number of leaves */ + bh = leaf_bh; + + for (blk = leaf_no; blk; blk = nblk) { + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_rlist; + } + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + if (blk != leaf_no) + brelse(bh); + + gfs2_rlist_add(dip, &rlist, blk); + l_blocks++; + } + + gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE); + + for (x = 0; x < rlist.rl_rgrps; x++) { + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl); + + rg_blocks += rgd->rd_length; + } + + error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); + if (error) + goto out_rlist; + + error = gfs2_trans_begin(sdp, + rg_blocks + (DIV_ROUND_UP(size, sdp->sd_jbsize) + 1) + + RES_DINODE + RES_STATFS + RES_QUOTA, RES_DINODE + + l_blocks); + if (error) + goto out_rg_gunlock; + + bh = leaf_bh; + + for (blk = leaf_no; blk; blk = nblk) { + struct gfs2_rgrpd *rgd; + + if (blk != leaf_no) { + error = get_leaf(dip, blk, &bh); + if (error) + goto out_end_trans; + } + tmp_leaf = (struct gfs2_leaf *)bh->b_data; + nblk = be64_to_cpu(tmp_leaf->lf_next); + if (blk != leaf_no) + brelse(bh); + + rgd = gfs2_blk2rgrpd(sdp, blk, true); + gfs2_free_meta(dip, rgd, blk, 1); + gfs2_add_inode_blocks(&dip->i_inode, -1); + } + + error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size); + if (error != size) { + if (error >= 0) + error = -EIO; + goto out_end_trans; + } + + error = gfs2_meta_inode_buffer(dip, &dibh); + if (error) + goto out_end_trans; + + gfs2_trans_add_meta(dip->i_gl, dibh); + /* On the last dealloc, make this a regular file in case we crash. 
+ (We don't want to free these blocks a second time.) */ + if (last_dealloc) + dip->i_inode.i_mode = S_IFREG; + gfs2_dinode_out(dip, dibh->b_data); + brelse(dibh); + +out_end_trans: + gfs2_trans_end(sdp); +out_rg_gunlock: + gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); +out_rlist: + gfs2_rlist_free(&rlist); + gfs2_quota_unhold(dip); +out: + kvfree(ht); + return error; +} + +/** + * gfs2_dir_exhash_dealloc - free all the leaf blocks in a directory + * @dip: the directory + * + * Dealloc all on-disk directory leaves to FREEMETA state + * Change on-disk inode type to "regular file" + * + * Returns: errno + */ + +int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip) +{ + struct buffer_head *bh; + struct gfs2_leaf *leaf; + u32 hsize, len; + u32 index = 0, next_index; + __be64 *lp; + u64 leaf_no; + int error = 0, last; + + hsize = BIT(dip->i_depth); + + lp = gfs2_dir_get_hash_table(dip); + if (IS_ERR(lp)) + return PTR_ERR(lp); + + while (index < hsize) { + leaf_no = be64_to_cpu(lp[index]); + if (leaf_no) { + error = get_leaf(dip, leaf_no, &bh); + if (error) + goto out; + leaf = (struct gfs2_leaf *)bh->b_data; + len = BIT(dip->i_depth - be16_to_cpu(leaf->lf_depth)); + + next_index = (index & ~(len - 1)) + len; + last = ((next_index >= hsize) ? 1 : 0); + error = leaf_dealloc(dip, index, len, leaf_no, bh, + last); + brelse(bh); + if (error) + goto out; + index = next_index; + } else + index++; + } + + if (index != hsize) { + gfs2_consist_inode(dip); + error = -EIO; + } + +out: + + return error; +} + +/** + * gfs2_diradd_alloc_required - find if adding entry will require an allocation + * @inode: the directory inode being written to + * @name: the filename that's going to be added + * @da: The structure to return dir alloc info + * + * Returns: 0 if ok, -ve on error + */ + +int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, + struct gfs2_diradd *da) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf); + struct gfs2_dirent *dent; + struct buffer_head *bh; + + da->nr_blocks = 0; + da->bh = NULL; + da->dent = NULL; + + dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh); + if (!dent) { + da->nr_blocks = sdp->sd_max_dirres; + if (!(ip->i_diskflags & GFS2_DIF_EXHASH) && + (GFS2_DIRENT_SIZE(name->len) < extra)) + da->nr_blocks = 1; + return 0; + } + if (IS_ERR(dent)) + return PTR_ERR(dent); + + if (da->save_loc) { + da->bh = bh; + da->dent = dent; + } else { + brelse(bh); + } + return 0; +} + diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h new file mode 100644 index 0000000000..5b76480c17 --- /dev/null +++ b/fs/gfs2/dir.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#ifndef __DIR_DOT_H__ +#define __DIR_DOT_H__ + +#include <linux/dcache.h> +#include <linux/crc32.h> + +struct inode; +struct gfs2_inode; +struct gfs2_inum; +struct buffer_head; +struct gfs2_dirent; + +struct gfs2_diradd { + unsigned nr_blocks; + struct gfs2_dirent *dent; + struct buffer_head *bh; + int save_loc; +}; + +extern struct inode *gfs2_dir_search(struct inode *dir, + const struct qstr *filename, + bool fail_on_exist); +extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename, + const struct gfs2_inode *ip); +extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename, + const struct gfs2_inode *ip, struct gfs2_diradd *da); +static inline void gfs2_dir_no_add(struct gfs2_diradd *da) +{ + brelse(da->bh); + da->bh = NULL; +} +extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry); +extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, + struct file_ra_state *f_ra); +extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, + const struct gfs2_inode *nip, unsigned int new_type); + +extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); + +extern int gfs2_diradd_alloc_required(struct inode *dir, + const struct qstr *filename, + struct gfs2_diradd *da); +extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, + struct buffer_head **bhp); +extern void gfs2_dir_hash_inval(struct gfs2_inode *ip); + +static inline u32 gfs2_disk_hash(const char *data, int len) +{ + return crc32_le((u32)~0, data, len) ^ (u32)~0; +} + + +static inline void gfs2_str2qstr(struct qstr *name, const char *fname) +{ + name->name = fname; + name->len = strlen(fname); + name->hash = gfs2_disk_hash(name->name, name->len); +} + +/* N.B. This probably ought to take inum & type as args as well */ +static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct gfs2_dirent *dent) +{ + dent->de_inum.no_addr = cpu_to_be64(0); + dent->de_inum.no_formal_ino = cpu_to_be64(0); + dent->de_hash = cpu_to_be32(name->hash); + dent->de_rec_len = cpu_to_be16(reclen); + dent->de_name_len = cpu_to_be16(name->len); + dent->de_type = cpu_to_be16(0); + memset(dent->__pad, 0, sizeof(dent->__pad)); + memcpy(dent + 1, name->name, name->len); +} + +extern struct qstr gfs2_qdot; +extern struct qstr gfs2_qdotdot; + +#endif /* __DIR_DOT_H__ */ diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c new file mode 100644 index 0000000000..cf40895233 --- /dev/null +++ b/fs/gfs2/export.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/exportfs.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> + +#include "gfs2.h" +#include "incore.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "super.h" +#include "rgrp.h" +#include "util.h" + +#define GFS2_SMALL_FH_SIZE 4 +#define GFS2_LARGE_FH_SIZE 8 +#define GFS2_OLD_FH_SIZE 10 + +static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, + struct inode *parent) +{ + __be32 *fh = (__force __be32 *)p; + struct super_block *sb = inode->i_sb; + struct gfs2_inode *ip = GFS2_I(inode); + + if (parent && (*len < GFS2_LARGE_FH_SIZE)) { + *len = GFS2_LARGE_FH_SIZE; + return FILEID_INVALID; + } else if (*len < GFS2_SMALL_FH_SIZE) { + *len = GFS2_SMALL_FH_SIZE; + return FILEID_INVALID; + } + + fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[2] = cpu_to_be32(ip->i_no_addr >> 32); + fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_SMALL_FH_SIZE; + + if (!parent || inode == d_inode(sb->s_root)) + return *len; + + ip = GFS2_I(parent); + + fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32); + fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF); + fh[6] = cpu_to_be32(ip->i_no_addr >> 32); + fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); + *len = GFS2_LARGE_FH_SIZE; + + return *len; +} + +struct get_name_filldir { + struct dir_context ctx; + struct gfs2_inum_host inum; + char *name; +}; + +static bool get_name_filldir(struct dir_context *ctx, const char *name, + int length, loff_t offset, u64 inum, + unsigned int type) +{ + struct get_name_filldir *gnfd = + container_of(ctx, struct get_name_filldir, ctx); + + if (inum != gnfd->inum.no_addr) + return true; + + memcpy(gnfd->name, name, length); + gnfd->name[length] = 0; + + return false; +} + +static int gfs2_get_name(struct dentry *parent, char *name, + struct dentry *child) +{ + struct inode *dir = d_inode(parent); + struct inode *inode = d_inode(child); + struct gfs2_inode *dip, *ip; + struct get_name_filldir gnfd = { + .ctx.actor = get_name_filldir, + .name = name + }; + struct gfs2_holder gh; + int error; + struct file_ra_state f_ra = { .start = 0 }; + + if (!dir) + return -EINVAL; + + if (!S_ISDIR(dir->i_mode) || !inode) + return -EINVAL; + + dip = GFS2_I(dir); + ip = GFS2_I(inode); + + *name = 0; + gnfd.inum.no_addr = ip->i_no_addr; + gnfd.inum.no_formal_ino = ip->i_no_formal_ino; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &gh); + if (error) + return error; + + error = gfs2_dir_read(dir, &gnfd.ctx, &f_ra); + + gfs2_glock_dq_uninit(&gh); + + if (!error && !*name) + error = -ENOENT; + + return error; +} + +static struct dentry *gfs2_get_parent(struct dentry *child) +{ + return d_obtain_alias(gfs2_lookupi(d_inode(child), &gfs2_qdotdot, 1)); +} + +static struct dentry *gfs2_get_dentry(struct super_block *sb, + struct gfs2_inum_host *inum) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct inode *inode; + + if (!inum->no_formal_ino) + return ERR_PTR(-ESTALE); + inode = gfs2_lookup_by_inum(sdp, inum->no_addr, inum->no_formal_ino, + GFS2_BLKST_DINODE); + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_obtain_alias(inode); +} + +static struct dentry *gfs2_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host this; + __be32 *fh = (__force __be32 *)fid->raw; + + switch (fh_type) { + case GFS2_SMALL_FH_SIZE: + case 
GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + if (fh_len < GFS2_SMALL_FH_SIZE) + return NULL; + this.no_formal_ino = ((u64)be32_to_cpu(fh[0])) << 32; + this.no_formal_ino |= be32_to_cpu(fh[1]); + this.no_addr = ((u64)be32_to_cpu(fh[2])) << 32; + this.no_addr |= be32_to_cpu(fh[3]); + return gfs2_get_dentry(sb, &this); + default: + return NULL; + } +} + +static struct dentry *gfs2_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct gfs2_inum_host parent; + __be32 *fh = (__force __be32 *)fid->raw; + + switch (fh_type) { + case GFS2_LARGE_FH_SIZE: + case GFS2_OLD_FH_SIZE: + if (fh_len < GFS2_LARGE_FH_SIZE) + return NULL; + parent.no_formal_ino = ((u64)be32_to_cpu(fh[4])) << 32; + parent.no_formal_ino |= be32_to_cpu(fh[5]); + parent.no_addr = ((u64)be32_to_cpu(fh[6])) << 32; + parent.no_addr |= be32_to_cpu(fh[7]); + return gfs2_get_dentry(sb, &parent); + default: + return NULL; + } +} + +const struct export_operations gfs2_export_ops = { + .encode_fh = gfs2_encode_fh, + .fh_to_dentry = gfs2_fh_to_dentry, + .fh_to_parent = gfs2_fh_to_parent, + .get_name = gfs2_get_name, + .get_parent = gfs2_get_parent, +}; + diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c new file mode 100644 index 0000000000..f2700477a3 --- /dev/null +++ b/fs/gfs2/file.c @@ -0,0 +1,1624 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/compat.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/blkdev.h> +#include <linux/mm.h> +#include <linux/mount.h> +#include <linux/fs.h> +#include <linux/filelock.h> +#include <linux/gfs2_ondisk.h> +#include <linux/falloc.h> +#include <linux/swap.h> +#include <linux/crc32.h> +#include <linux/writeback.h> +#include <linux/uaccess.h> +#include <linux/dlm.h> +#include <linux/dlm_plock.h> +#include <linux/delay.h> +#include <linux/backing-dev.h> +#include <linux/fileattr.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "aops.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" + +/** + * gfs2_llseek - seek to a location in a file + * @file: the file + * @offset: the offset + * @whence: Where to seek from (SEEK_SET, SEEK_CUR, or SEEK_END) + * + * SEEK_END requires the glock for the file because it references the + * file's size. + * + * Returns: The new offset, or errno + */ + +static loff_t gfs2_llseek(struct file *file, loff_t offset, int whence) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_holder i_gh; + loff_t error; + + switch (whence) { + case SEEK_END: + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (!error) { + error = generic_file_llseek(file, offset, whence); + gfs2_glock_dq_uninit(&i_gh); + } + break; + + case SEEK_DATA: + error = gfs2_seek_data(file, offset); + break; + + case SEEK_HOLE: + error = gfs2_seek_hole(file, offset); + break; + + case SEEK_CUR: + case SEEK_SET: + /* + * These don't reference inode->i_size and don't depend on the + * block mapping, so we don't need the glock. 
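+ * (SEEK_SET only validates the new offset and SEEK_CUR only needs the + * file's current position, which is purely local state.)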
+ */ + error = generic_file_llseek(file, offset, whence); + break; + default: + error = -EINVAL; + } + + return error; +} + +/** + * gfs2_readdir - Iterator for a directory + * @file: The directory to read from + * @ctx: What to feed directory entries to + * + * Returns: errno + */ + +static int gfs2_readdir(struct file *file, struct dir_context *ctx) +{ + struct inode *dir = file->f_mapping->host; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + int error; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return error; + + error = gfs2_dir_read(dir, ctx, &file->f_ra); + + gfs2_glock_dq_uninit(&d_gh); + + return error; +} + +/* + * struct fsflag_gfs2flag + * + * The FS_JOURNAL_DATA_FL flag maps to GFS2_DIF_INHERIT_JDATA for directories, + * and to GFS2_DIF_JDATA for non-directories. + */ +static struct { + u32 fsflag; + u32 gfsflag; +} fsflag_gfs2flag[] = { + {FS_SYNC_FL, GFS2_DIF_SYNC}, + {FS_IMMUTABLE_FL, GFS2_DIF_IMMUTABLE}, + {FS_APPEND_FL, GFS2_DIF_APPENDONLY}, + {FS_NOATIME_FL, GFS2_DIF_NOATIME}, + {FS_INDEX_FL, GFS2_DIF_EXHASH}, + {FS_TOPDIR_FL, GFS2_DIF_TOPDIR}, + {FS_JOURNAL_DATA_FL, GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA}, +}; + +static inline u32 gfs2_gfsflags_to_fsflags(struct inode *inode, u32 gfsflags) +{ + int i; + u32 fsflags = 0; + + if (S_ISDIR(inode->i_mode)) + gfsflags &= ~GFS2_DIF_JDATA; + else + gfsflags &= ~GFS2_DIF_INHERIT_JDATA; + + for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) + if (gfsflags & fsflag_gfs2flag[i].gfsflag) + fsflags |= fsflag_gfs2flag[i].fsflag; + return fsflags; +} + +int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + u32 fsflags; + + if (d_is_special(dentry)) + return -ENOTTY; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); + if (error) + goto out_uninit; + + fsflags = gfs2_gfsflags_to_fsflags(inode, ip->i_diskflags); + + fileattr_fill_flags(fa, fsflags); + + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + return error; +} + +void gfs2_set_inode_flags(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int flags = inode->i_flags; + + flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); + if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) + flags |= S_NOSEC; + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) + flags |= S_IMMUTABLE; + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) + flags |= S_APPEND; + if (ip->i_diskflags & GFS2_DIF_NOATIME) + flags |= S_NOATIME; + if (ip->i_diskflags & GFS2_DIF_SYNC) + flags |= S_SYNC; + inode->i_flags = flags; +} + +/* Flags that can be set by user space */ +#define GFS2_FLAGS_USER_SET (GFS2_DIF_JDATA| \ + GFS2_DIF_IMMUTABLE| \ + GFS2_DIF_APPENDONLY| \ + GFS2_DIF_NOATIME| \ + GFS2_DIF_SYNC| \ + GFS2_DIF_TOPDIR| \ + GFS2_DIF_INHERIT_JDATA) + +/** + * do_gfs2_set_flags - set flags on an inode + * @inode: The inode + * @reqflags: The flags to set + * @mask: Indicates which flags are valid + * + */ +static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int error; + u32 new_flags, flags; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (error) + return error; + + error = 0; + flags = ip->i_diskflags; + new_flags = (flags & ~mask) | (reqflags & mask); + if ((new_flags ^ 
flags) == 0) + goto out; + + if (!IS_IMMUTABLE(inode)) { + error = gfs2_permission(&nop_mnt_idmap, inode, MAY_WRITE); + if (error) + goto out; + } + if ((flags ^ new_flags) & GFS2_DIF_JDATA) { + if (new_flags & GFS2_DIF_JDATA) + gfs2_log_flush(sdp, ip->i_gl, + GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_SET_FLAGS); + error = filemap_fdatawrite(inode->i_mapping); + if (error) + goto out; + error = filemap_fdatawait(inode->i_mapping); + if (error) + goto out; + if (new_flags & GFS2_DIF_JDATA) + gfs2_ordered_del_inode(ip); + } + error = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (error) + goto out; + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out_trans_end; + inode_set_ctime_current(inode); + gfs2_trans_add_meta(ip->i_gl, bh); + ip->i_diskflags = new_flags; + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + gfs2_set_inode_flags(inode); + gfs2_set_aops(inode); +out_trans_end: + gfs2_trans_end(sdp); +out: + gfs2_glock_dq_uninit(&gh); + return error; +} + +int gfs2_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + u32 fsflags = fa->flags, gfsflags = 0; + u32 mask; + int i; + + if (d_is_special(dentry)) + return -ENOTTY; + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + + for (i = 0; i < ARRAY_SIZE(fsflag_gfs2flag); i++) { + if (fsflags & fsflag_gfs2flag[i].fsflag) { + fsflags &= ~fsflag_gfs2flag[i].fsflag; + gfsflags |= fsflag_gfs2flag[i].gfsflag; + } + } + if (fsflags || gfsflags & ~GFS2_FLAGS_USER_SET) + return -EINVAL; + + mask = GFS2_FLAGS_USER_SET; + if (S_ISDIR(inode->i_mode)) { + mask &= ~GFS2_DIF_JDATA; + } else { + /* The GFS2_DIF_TOPDIR flag is only valid for directories. */ + if (gfsflags & GFS2_DIF_TOPDIR) + return -EINVAL; + mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA); + } + + return do_gfs2_set_flags(inode, gfsflags, mask); +} + +static int gfs2_getlabel(struct file *filp, char __user *label) +{ + struct inode *inode = file_inode(filp); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (copy_to_user(label, sdp->sd_sb.sb_locktable, GFS2_LOCKNAME_LEN)) + return -EFAULT; + + return 0; +} + +static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + case FITRIM: + return gfs2_fitrim(filp, (void __user *)arg); + case FS_IOC_GETFSLABEL: + return gfs2_getlabel(filp, (char __user *)arg); + } + + return -ENOTTY; +} + +#ifdef CONFIG_COMPAT +static long gfs2_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch(cmd) { + /* Keep this list in sync with gfs2_ioctl */ + case FITRIM: + case FS_IOC_GETFSLABEL: + break; + default: + return -ENOIOCTLCMD; + } + + return gfs2_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#else +#define gfs2_compat_ioctl NULL +#endif + +/** + * gfs2_size_hint - Give a hint to the size of a write request + * @filep: The struct file + * @offset: The file offset of the write + * @size: The length of the write + * + * When we are about to do a write, this function records the total + * write size in order to provide a suitable hint to the lower layers + * about how many blocks will be required. 
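+ * (Worked example, illustrative numbers: with a 4KiB block size, a 1MiB + * write gives blks == 256 below, so i_sizehint is raised to at least 256.)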
+ * + */ + +static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) +{ + struct inode *inode = file_inode(filep); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; + int hint = min_t(size_t, INT_MAX, blks); + + if (hint > atomic_read(&ip->i_sizehint)) + atomic_set(&ip->i_sizehint, hint); +} + +/** + * gfs2_allocate_page_backing - Allocate blocks for a write fault + * @page: The (locked) page to allocate backing for + * @length: Size of the allocation + * + * We try to allocate all the blocks required for the page in one go. This + * might fail for various reasons, so we keep trying until all the blocks to + * back this page are allocated. If some of the blocks are already allocated, + * that is ok too. + */ +static int gfs2_allocate_page_backing(struct page *page, unsigned int length) +{ + u64 pos = page_offset(page); + + do { + struct iomap iomap = { }; + + if (gfs2_iomap_alloc(page->mapping->host, pos, length, &iomap)) + return -EIO; + + if (length < iomap.length) + iomap.length = length; + length -= iomap.length; + pos += iomap.length; + } while (length > 0); + + return 0; +} + +/** + * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable + * @vmf: The virtual memory fault containing the page to become writable + * + * When the page becomes writable, we need to ensure that we have + * blocks allocated on disk to back that page. + */ + +static vm_fault_t gfs2_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; + u64 offset = page_offset(page); + unsigned int data_blocks, ind_blocks, rblocks; + vm_fault_t ret = VM_FAULT_LOCKED; + struct gfs2_holder gh; + unsigned int length; + loff_t size; + int err; + + sb_start_pagefault(inode->i_sb); + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + err = gfs2_glock_nq(&gh); + if (err) { + ret = vmf_fs_error(err); + goto out_uninit; + } + + /* Check page index against inode size */ + size = i_size_read(inode); + if (offset >= size) { + ret = VM_FAULT_SIGBUS; + goto out_unlock; + } + + /* Update file times before taking page lock */ + file_update_time(vmf->vma->vm_file); + + /* page is wholly or partially inside EOF */ + if (size - offset < PAGE_SIZE) + length = size - offset; + else + length = PAGE_SIZE; + + gfs2_size_hint(vmf->vma->vm_file, offset, length); + + set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); + set_bit(GIF_SW_PAGED, &ip->i_flags); + + /* + * iomap_writepage / iomap_writepages currently don't support inline + * files, so always unstuff here. + */ + + if (!gfs2_is_stuffed(ip) && + !gfs2_write_alloc_required(ip, offset, length)) { + lock_page(page); + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = VM_FAULT_NOPAGE; + unlock_page(page); + } + goto out_unlock; + } + + err = gfs2_rindex_update(sdp); + if (err) { + ret = vmf_fs_error(err); + goto out_unlock; + } + + gfs2_write_calc_reserv(ip, length, &data_blocks, &ind_blocks); + ap.target = data_blocks + ind_blocks; + err = gfs2_quota_lock_check(ip, &ap); + if (err) { + ret = vmf_fs_error(err); + goto out_unlock; + } + err = gfs2_inplace_reserve(ip, &ap); + if (err) { + ret = vmf_fs_error(err); + goto out_quota_unlock; + } + + rblocks = RES_DINODE + ind_blocks; + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? 
data_blocks : 1; + if (ind_blocks || data_blocks) { + rblocks += RES_STATFS + RES_QUOTA; + rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks); + } + err = gfs2_trans_begin(sdp, rblocks, 0); + if (err) { + ret = vmf_fs_error(err); + goto out_trans_fail; + } + + /* Unstuff, if required, and allocate backing blocks for page */ + if (gfs2_is_stuffed(ip)) { + err = gfs2_unstuff_dinode(ip); + if (err) { + ret = vmf_fs_error(err); + goto out_trans_end; + } + } + + lock_page(page); + /* If truncated, we must retry the operation, we may have raced + * with the glock demotion code. + */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = VM_FAULT_NOPAGE; + goto out_page_locked; + } + + err = gfs2_allocate_page_backing(page, length); + if (err) + ret = vmf_fs_error(err); + +out_page_locked: + if (ret != VM_FAULT_LOCKED) + unlock_page(page); +out_trans_end: + gfs2_trans_end(sdp); +out_trans_fail: + gfs2_inplace_release(ip); +out_quota_unlock: + gfs2_quota_unlock(ip); +out_unlock: + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + if (ret == VM_FAULT_LOCKED) { + set_page_dirty(page); + wait_for_stable_page(page); + } + sb_end_pagefault(inode->i_sb); + return ret; +} + +static vm_fault_t gfs2_fault(struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vmf->vma->vm_file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + vm_fault_t ret; + int err; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + err = gfs2_glock_nq(&gh); + if (err) { + ret = vmf_fs_error(err); + goto out_uninit; + } + ret = filemap_fault(vmf); + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + return ret; +} + +static const struct vm_operations_struct gfs2_vm_ops = { + .fault = gfs2_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = gfs2_page_mkwrite, +}; + +/** + * gfs2_mmap + * @file: The file to map + * @vma: The VMA which describes the mapping + * + * There is no need to get a lock here unless we should be updating + * atime. We ignore any locking errors since the only consequence is + * a missed atime update (which will just be deferred until later). + * + * Returns: 0 + */ + +static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + + if (!(file->f_flags & O_NOATIME) && + !IS_NOATIME(&ip->i_inode)) { + struct gfs2_holder i_gh; + int error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + return error; + /* grab lock to update inode */ + gfs2_glock_dq_uninit(&i_gh); + file_accessed(file); + } + vma->vm_ops = &gfs2_vm_ops; + + return 0; +} + +/** + * gfs2_open_common - This is common to open and atomic_open + * @inode: The inode being opened + * @file: The file being opened + * + * This may be called under a glock or not, depending upon how it has + * been called. We must always be called under a glock for regular + * files, however. For other file types, it does not matter whether + * we hold the glock or not. 
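+ * (Regular files need the glock so that i_size is up to date for the + * large-file check inside generic_file_open(); see the gfs2_open() + * comment below.)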
+ * + * Returns: Error code or 0 for success + */ + +int gfs2_open_common(struct inode *inode, struct file *file) +{ + struct gfs2_file *fp; + int ret; + + if (S_ISREG(inode->i_mode)) { + ret = generic_file_open(inode, file); + if (ret) + return ret; + + if (!gfs2_is_jdata(GFS2_I(inode))) + file->f_mode |= FMODE_CAN_ODIRECT; + } + + fp = kzalloc(sizeof(struct gfs2_file), GFP_NOFS); + if (!fp) + return -ENOMEM; + + mutex_init(&fp->f_fl_mutex); + + gfs2_assert_warn(GFS2_SB(inode), !file->private_data); + file->private_data = fp; + if (file->f_mode & FMODE_WRITE) { + ret = gfs2_qa_get(GFS2_I(inode)); + if (ret) + goto fail; + } + return 0; + +fail: + kfree(file->private_data); + file->private_data = NULL; + return ret; +} + +/** + * gfs2_open - open a file + * @inode: the inode to open + * @file: the struct file for this opening + * + * After atomic_open, this function is only used for opening files + * which are already cached. We must still get the glock for regular + * files to ensure that we have the file size uptodate for the large + * file check which is in the common code. That is only an issue for + * regular files though. + * + * Returns: errno + */ + +static int gfs2_open(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + bool need_unlock = false; + + if (S_ISREG(ip->i_inode.i_mode)) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, + &i_gh); + if (error) + return error; + need_unlock = true; + } + + error = gfs2_open_common(inode, file); + + if (need_unlock) + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +/** + * gfs2_release - called to close a struct file + * @inode: the inode the struct file belongs to + * @file: the struct file being closed + * + * Returns: errno + */ + +static int gfs2_release(struct inode *inode, struct file *file) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + kfree(file->private_data); + file->private_data = NULL; + + if (file->f_mode & FMODE_WRITE) { + if (gfs2_rs_active(&ip->i_res)) + gfs2_rs_delete(ip); + gfs2_qa_put(ip); + } + return 0; +} + +/** + * gfs2_fsync - sync the dirty data for a file (across the cluster) + * @file: the file that points to the dentry + * @start: the start position in the file to sync + * @end: the end position in the file to sync + * @datasync: set if we can ignore timestamp changes + * + * We split the data flushing here so that we don't wait for the data + * until after we've also sent the metadata to disk. Note that for + * data=ordered, we will write & wait for the data at the log flush + * stage anyway, so this is unlikely to make much of a difference + * except in the data=writeback case. + * + * If the fdatawrite fails due to any reason except -EIO, we will + * continue the remainder of the fsync, although we'll still report + * the error at the end. This is to match filemap_write_and_wait_range() + * behaviour. 
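+ * + * In outline (ignoring the jdata special cases), the sequence + * implemented below is: + * + *   filemap_fdatawrite_range()  - start data writeback, don't wait + *   sync_inode_metadata()       - write the dirty metadata + *   file_fdatawait_range()      - only now wait for the data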
+ * + * Returns: errno + */ + +static int gfs2_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + int sync_state = inode->i_state & I_DIRTY; + struct gfs2_inode *ip = GFS2_I(inode); + int ret = 0, ret1 = 0; + + if (mapping->nrpages) { + ret1 = filemap_fdatawrite_range(mapping, start, end); + if (ret1 == -EIO) + return ret1; + } + + if (!gfs2_is_jdata(ip)) + sync_state &= ~I_DIRTY_PAGES; + if (datasync) + sync_state &= ~I_DIRTY_SYNC; + + if (sync_state) { + ret = sync_inode_metadata(inode, 1); + if (ret) + return ret; + if (gfs2_is_jdata(ip)) + ret = file_write_and_wait(file); + if (ret) + return ret; + gfs2_ail_flush(ip->i_gl, 1); + } + + if (mapping->nrpages) + ret = file_fdatawait_range(file, start, end); + + return ret ? ret : ret1; +} + +static inline bool should_fault_in_pages(struct iov_iter *i, + struct kiocb *iocb, + size_t *prev_count, + size_t *window_size) +{ + size_t count = iov_iter_count(i); + size_t size, offs; + + if (!count) + return false; + if (!user_backed_iter(i)) + return false; + + /* + * Try to fault in multiple pages initially. When that doesn't result + * in any progress, fall back to a single page. + */ + size = PAGE_SIZE; + offs = offset_in_page(iocb->ki_pos); + if (*prev_count != count) { + size_t nr_dirtied; + + nr_dirtied = max(current->nr_dirtied_pause - + current->nr_dirtied, 8); + size = min_t(size_t, SZ_1M, nr_dirtied << PAGE_SHIFT); + } + + *prev_count = count; + *window_size = size - offs; + return true; +} + +static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, + struct gfs2_holder *gh) +{ + struct file *file = iocb->ki_filp; + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + size_t prev_count = 0, window_size = 0; + size_t read = 0; + ssize_t ret; + + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + * + * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger + * physical as well as manual page faults, and we need to disable both + * kinds. + * + * For direct I/O, gfs2 takes the inode glock in deferred mode. This + * locking mode is compatible with other deferred holders, so multiple + * processes and nodes can do direct I/O to a file at the same time. + * There's no guarantee that reads or writes will be atomic. Any + * coordination among readers and writers needs to happen externally. + */ + + if (!iov_iter_count(to)) + return 0; /* skip atime */ + + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); +retry: + ret = gfs2_glock_nq(gh); + if (ret) + goto out_uninit; + pagefault_disable(); + to->nofault = true; + ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, + IOMAP_DIO_PARTIAL, NULL, read); + to->nofault = false; + pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; + /* No increment (+=) because iomap_dio_rw returns a cumulative value. */ + if (ret > 0) + read = ret; + + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + window_size -= fault_in_iov_iter_writeable(to, window_size); + if (window_size) + goto retry; + } +out_unlock: + if (gfs2_holder_queued(gh)) + gfs2_glock_dq(gh); +out_uninit: + gfs2_holder_uninit(gh); + /* User space doesn't expect partial success. 
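+ * A failed final attempt reports the error itself; otherwise the + * cumulative byte count from iomap_dio_rw is returned.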
*/ + if (ret < 0) + return ret; + return read; +} + +static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, + struct gfs2_holder *gh) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + size_t prev_count = 0, window_size = 0; + size_t written = 0; + bool enough_retries; + ssize_t ret; + + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + * + * For writes, iomap_dio_rw only triggers manual page faults, so we + * don't need to disable physical ones. + */ + + /* + * Deferred lock, even if it's a write, since we do no allocation on + * this path. All we need to change is the atime, and this lock mode + * ensures that other nodes have flushed their buffered read caches + * (i.e. their page cache entries for this inode). We do not, + * unfortunately, have the option of only flushing a range like the + * VFS does. + */ + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); +retry: + ret = gfs2_glock_nq(gh); + if (ret) + goto out_uninit; + /* Silently fall back to buffered I/O when writing beyond EOF */ + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) + goto out_unlock; + + from->nofault = true; + ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, + IOMAP_DIO_PARTIAL, NULL, written); + from->nofault = false; + if (ret <= 0) { + if (ret == -ENOTBLK) + ret = 0; + if (ret != -EFAULT) + goto out_unlock; + } + /* No increment (+=) because iomap_dio_rw returns a cumulative value. */ + if (ret > 0) + written = ret; + + enough_retries = prev_count == iov_iter_count(from) && + window_size <= PAGE_SIZE; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + window_size -= fault_in_iov_iter_readable(from, window_size); + if (window_size) { + if (!enough_retries) + goto retry; + /* fall back to buffered I/O */ + ret = 0; + } + } +out_unlock: + if (gfs2_holder_queued(gh)) + gfs2_glock_dq(gh); +out_uninit: + gfs2_holder_uninit(gh); + /* User space doesn't expect partial success. */ + if (ret < 0) + return ret; + return written; +} + +static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct gfs2_inode *ip; + struct gfs2_holder gh; + size_t prev_count = 0, window_size = 0; + size_t read = 0; + ssize_t ret; + + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry.
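+ * + * The first pass below additionally sets IOCB_NOIO, so it can only be + * served from the page cache; actual I/O is started once the glock is + * held.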
+ */ + + if (iocb->ki_flags & IOCB_DIRECT) + return gfs2_file_direct_read(iocb, to, &gh); + + pagefault_disable(); + iocb->ki_flags |= IOCB_NOIO; + ret = generic_file_read_iter(iocb, to); + iocb->ki_flags &= ~IOCB_NOIO; + pagefault_enable(); + if (ret >= 0) { + if (!iov_iter_count(to)) + return ret; + read = ret; + } else if (ret != -EFAULT) { + if (ret != -EAGAIN) + return ret; + if (iocb->ki_flags & IOCB_NOWAIT) + return ret; + } + ip = GFS2_I(iocb->ki_filp->f_mapping->host); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); +retry: + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + pagefault_disable(); + ret = generic_file_read_iter(iocb, to); + pagefault_enable(); + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; + if (ret > 0) + read += ret; + + if (should_fault_in_pages(to, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(&gh); + window_size -= fault_in_iov_iter_writeable(to, window_size); + if (window_size) + goto retry; + } +out_unlock: + if (gfs2_holder_queued(&gh)) + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + return read ? read : ret; +} + +static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, + struct iov_iter *from, + struct gfs2_holder *gh) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder *statfs_gh = NULL; + size_t prev_count = 0, window_size = 0; + size_t orig_count = iov_iter_count(from); + size_t written = 0; + ssize_t ret; + + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + */ + + if (inode == sdp->sd_rindex) { + statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS); + if (!statfs_gh) + return -ENOMEM; + } + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { +retry: + window_size -= fault_in_iov_iter_readable(from, window_size); + if (!window_size) { + ret = -EFAULT; + goto out_uninit; + } + from->count = min(from->count, window_size); + } + ret = gfs2_glock_nq(gh); + if (ret) + goto out_uninit; + + if (inode == sdp->sd_rindex) { + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + + ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, statfs_gh); + if (ret) + goto out_unlock; + } + + pagefault_disable(); + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + pagefault_enable(); + if (ret > 0) + written += ret; + + if (inode == sdp->sd_rindex) + gfs2_glock_dq_uninit(statfs_gh); + + if (ret <= 0 && ret != -EFAULT) + goto out_unlock; + + from->count = orig_count - written; + if (should_fault_in_pages(from, iocb, &prev_count, &window_size)) { + gfs2_glock_dq(gh); + goto retry; + } +out_unlock: + if (gfs2_holder_queued(gh)) + gfs2_glock_dq(gh); +out_uninit: + gfs2_holder_uninit(gh); + kfree(statfs_gh); + from->count = orig_count - written; + return written ? written : ret; +} + +/** + * gfs2_file_write_iter - Perform a write to a file + * @iocb: The io context + * @from: The data to write + * + * We have to do a lock/unlock here to refresh the inode size for + * O_APPEND writes, otherwise we can land up writing at the wrong + * offset. There is still a race, but provided the app is using its + * own file locking, this will make O_APPEND work as expected. 
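+ * + * Concretely, for IOCB_APPEND we briefly take and drop the inode glock + * in the shared state below so that generic_write_checks() sees an + * up-to-date i_size.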
+ * + */ + +static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + ssize_t ret; + + gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from)); + + if (iocb->ki_flags & IOCB_APPEND) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + return ret; + gfs2_glock_dq_uninit(&gh); + } + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; + + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + + ret = file_update_time(file); + if (ret) + goto out_unlock; + + if (iocb->ki_flags & IOCB_DIRECT) { + struct address_space *mapping = file->f_mapping; + ssize_t buffered, ret2; + + ret = gfs2_file_direct_write(iocb, from, &gh); + if (ret < 0 || !iov_iter_count(from)) + goto out_unlock; + + iocb->ki_flags |= IOCB_DSYNC; + buffered = gfs2_file_buffered_write(iocb, from, &gh); + if (unlikely(buffered <= 0)) { + if (!ret) + ret = buffered; + goto out_unlock; + } + + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. If the writeback or invalidate fails, only report + * the direct I/O range as we don't know if the buffered pages + * made it to disk. + */ + ret2 = generic_write_sync(iocb, buffered); + invalidate_mapping_pages(mapping, + (iocb->ki_pos - buffered) >> PAGE_SHIFT, + (iocb->ki_pos - 1) >> PAGE_SHIFT); + if (!ret || ret2 > 0) + ret += ret2; + } else { + ret = gfs2_file_buffered_write(iocb, from, &gh); + if (likely(ret > 0)) + ret = generic_write_sync(iocb, ret); + } + +out_unlock: + inode_unlock(inode); + return ret; +} + +static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct super_block *sb = inode->i_sb; + struct gfs2_inode *ip = GFS2_I(inode); + loff_t end = offset + len; + struct buffer_head *dibh; + int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(error)) + return error; + + gfs2_trans_add_meta(ip->i_gl, dibh); + + if (gfs2_is_stuffed(ip)) { + error = gfs2_unstuff_dinode(ip); + if (unlikely(error)) + goto out; + } + + while (offset < end) { + struct iomap iomap = { }; + + error = gfs2_iomap_alloc(inode, offset, end - offset, &iomap); + if (error) + goto out; + offset = iomap.offset + iomap.length; + if (!(iomap.flags & IOMAP_F_NEW)) + continue; + error = sb_issue_zeroout(sb, iomap.addr >> inode->i_blkbits, + iomap.length >> inode->i_blkbits, + GFP_NOFS); + if (error) { + fs_err(GFS2_SB(inode), "Failed to zero data buffers\n"); + goto out; + } + } +out: + brelse(dibh); + return error; +} + +/** + * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of + * blocks, determine how many bytes can be written. + * @ip: The inode in question. + * @len: Max cap of bytes. What we return in *len must be <= this. + * @data_blocks: Compute and return the number of data blocks needed + * @ind_blocks: Compute and return the number of indirect blocks needed + * @max_blocks: The total blocks available to work with. + * + * Returns: void, but @len, @data_blocks and @ind_blocks are filled in. 
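+ * + * Sketch of the computation: starting from @max_blocks, three blocks per + * level of the metadata tree are set aside, then the indirect blocks + * needed at each level are subtracted until the remainder can be mapped + * from the dinode's direct pointers; what is left is the data budget.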
+ */ +static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks, + unsigned int max_blocks) +{ + loff_t max = *len; + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); + + for (tmp = max_data; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + max_data -= tmp; + } + + *data_blocks = max_data; + *ind_blocks = max_blocks - max_data; + *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; + if (*len > max) { + *len = max; + gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks); + } +} + +static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; + loff_t bytes, max_bytes, max_blks; + int error; + const loff_t pos = offset; + const loff_t count = len; + loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1); + loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift; + loff_t max_chunk_size = UINT_MAX & bsize_mask; + + next = (next + 1) << sdp->sd_sb.sb_bsize_shift; + + offset &= bsize_mask; + + len = next - offset; + bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2; + if (!bytes) + bytes = UINT_MAX; + bytes &= bsize_mask; + if (bytes == 0) + bytes = sdp->sd_sb.sb_bsize; + + gfs2_size_hint(file, offset, len); + + gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); + ap.min_target = data_blocks + ind_blocks; + + while (len > 0) { + if (len < bytes) + bytes = len; + if (!gfs2_write_alloc_required(ip, offset, bytes)) { + len -= bytes; + offset += bytes; + continue; + } + + /* We need to determine how many bytes we can actually + * fallocate without exceeding quota or going over the + * end of the fs. We start off optimistically by assuming + * we can write max_bytes */ + max_bytes = (len > max_chunk_size) ? max_chunk_size : len; + + /* Since max_bytes is most likely a theoretical max, we + * calculate a more realistic 'bytes' to serve as a good + * starting point for the number of bytes we may be able + * to write */ + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + ap.target = data_blocks + ind_blocks; + + error = gfs2_quota_lock_check(ip, &ap); + if (error) + return error; + /* ap.allowed tells us how many blocks quota will allow + * us to write. Check if this reduces max_blks */ + max_blks = UINT_MAX; + if (ap.allowed) + max_blks = ap.allowed; + + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_qunlock; + + /* check if the selected rgrp limits our max_blks further */ + if (ip->i_res.rs_reserved < max_blks) + max_blks = ip->i_res.rs_reserved; + + /* Almost done. Calculate bytes that can be written using + * max_blks. We also recompute max_bytes, data_blocks and + * ind_blocks */ + calc_max_reserv(ip, &max_bytes, &data_blocks, + &ind_blocks, max_blks); + + rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + + RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks); + if (gfs2_is_jdata(ip)) + rblocks += data_blocks ? 
data_blocks : 1; + + error = gfs2_trans_begin(sdp, rblocks, + PAGE_SIZE >> inode->i_blkbits); + if (error) + goto out_trans_fail; + + error = fallocate_chunk(inode, offset, max_bytes, mode); + gfs2_trans_end(sdp); + + if (error) + goto out_trans_fail; + + len -= max_bytes; + offset += max_bytes; + gfs2_inplace_release(ip); + gfs2_quota_unlock(ip); + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) + i_size_write(inode, pos + count); + file_update_time(file); + mark_inode_dirty(inode); + + if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) + return vfs_fsync_range(file, pos, pos + count - 1, + (file->f_flags & __O_SYNC) ? 0 : 1); + return 0; + +out_trans_fail: + gfs2_inplace_release(ip); +out_qunlock: + gfs2_quota_unlock(ip); + return error; +} + +static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) +{ + struct inode *inode = file_inode(file); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + if (mode & ~(FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) + return -EOPNOTSUPP; + /* fallocate is needed by gfs2_grow to reserve space in the rindex */ + if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex) + return -EOPNOTSUPP; + + inode_lock(inode); + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret) + goto out_uninit; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (offset + len) > inode->i_size) { + ret = inode_newsize_ok(inode, offset + len); + if (ret) + goto out_unlock; + } + + ret = get_write_access(inode); + if (ret) + goto out_unlock; + + if (mode & FALLOC_FL_PUNCH_HOLE) { + ret = __gfs2_punch_hole(file, offset, len); + } else { + ret = __gfs2_fallocate(file, mode, offset, len); + if (ret) + gfs2_rs_deltree(&ip->i_res); + } + + put_write_access(inode); +out_unlock: + gfs2_glock_dq(&gh); +out_uninit: + gfs2_holder_uninit(&gh); + inode_unlock(inode); + return ret; +} + +static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t ret; + + gfs2_size_hint(out, *ppos, len); + + ret = iter_file_splice_write(pipe, out, ppos, len, flags); + return ret; +} + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + +/** + * gfs2_lock - acquire/release a posix lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (unlikely(gfs2_withdrawn(sdp))) { + if (fl->fl_type == F_UNLCK) + locks_lock_file_wait(file, fl); + return -EIO; + } + if (cmd == F_CANCELLK) + return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (IS_GETLK(cmd)) + return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl); + else + return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl); +} + +static void __flock_holder_uninit(struct file *file, struct gfs2_holder *fl_gh) +{ + struct gfs2_glock *gl = gfs2_glock_hold(fl_gh->gh_gl); + + /* + * Make sure gfs2_glock_put() won't sleep under the file->f_lock + * spinlock. 
+ */ + + spin_lock(&file->f_lock); + gfs2_holder_uninit(fl_gh); + spin_unlock(&file->f_lock); + gfs2_glock_put(gl); +} + +static int do_flock(struct file *file, int cmd, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct gfs2_inode *ip = GFS2_I(file_inode(file)); + struct gfs2_glock *gl; + unsigned int state; + u16 flags; + int error = 0; + int sleeptime; + + state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; + flags = GL_EXACT | GL_NOPID; + if (!IS_SETLKW(cmd)) + flags |= LM_FLAG_TRY_1CB; + + mutex_lock(&fp->f_fl_mutex); + + if (gfs2_holder_initialized(fl_gh)) { + struct file_lock request; + if (fl_gh->gh_state == state) + goto out; + locks_init_lock(&request); + request.fl_type = F_UNLCK; + request.fl_flags = FL_FLOCK; + locks_lock_file_wait(file, &request); + gfs2_glock_dq(fl_gh); + gfs2_holder_reinit(state, flags, fl_gh); + } else { + error = gfs2_glock_get(GFS2_SB(&ip->i_inode), ip->i_no_addr, + &gfs2_flock_glops, CREATE, &gl); + if (error) + goto out; + spin_lock(&file->f_lock); + gfs2_holder_init(gl, state, flags, fl_gh); + spin_unlock(&file->f_lock); + gfs2_glock_put(gl); + } + for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) { + error = gfs2_glock_nq(fl_gh); + if (error != GLR_TRYFAILED) + break; + fl_gh->gh_flags &= ~LM_FLAG_TRY_1CB; + fl_gh->gh_flags |= LM_FLAG_TRY; + msleep(sleeptime); + } + if (error) { + __flock_holder_uninit(file, fl_gh); + if (error == GLR_TRYFAILED) + error = -EAGAIN; + } else { + error = locks_lock_file_wait(file, fl); + gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error); + } + +out: + mutex_unlock(&fp->f_fl_mutex); + return error; +} + +static void do_unflock(struct file *file, struct file_lock *fl) +{ + struct gfs2_file *fp = file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + + mutex_lock(&fp->f_fl_mutex); + locks_lock_file_wait(file, fl); + if (gfs2_holder_initialized(fl_gh)) { + gfs2_glock_dq(fl_gh); + __flock_holder_uninit(file, fl_gh); + } + mutex_unlock(&fp->f_fl_mutex); +} + +/** + * gfs2_flock - acquire/release a flock lock on a file + * @file: the file pointer + * @cmd: either modify or retrieve lock state, possibly wait + * @fl: type and range of lock + * + * Returns: errno + */ + +static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) +{ + if (!(fl->fl_flags & FL_FLOCK)) + return -ENOLCK; + + if (fl->fl_type == F_UNLCK) { + do_unflock(file, fl); + return 0; + } else { + return do_flock(file, cmd, fl); + } +} + +const struct file_operations gfs2_file_fops = { + .llseek = gfs2_llseek, + .read_iter = gfs2_file_read_iter, + .write_iter = gfs2_file_write_iter, + .iopoll = iocb_bio_iopoll, + .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .splice_read = copy_splice_read, + .splice_write = gfs2_file_splice_write, + .setlease = simple_nosetlease, + .fallocate = gfs2_fallocate, +}; + +const struct file_operations gfs2_dir_fops = { + .iterate_shared = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .lock = gfs2_lock, + .flock = gfs2_flock, + .llseek = default_llseek, +}; + +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +const struct file_operations gfs2_file_fops_nolock = { + .llseek = gfs2_llseek, + .read_iter = gfs2_file_read_iter, + .write_iter = gfs2_file_write_iter, + 
.iopoll = iocb_bio_iopoll, + .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, + .mmap = gfs2_mmap, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .splice_read = copy_splice_read, + .splice_write = gfs2_file_splice_write, + .setlease = generic_setlease, + .fallocate = gfs2_fallocate, +}; + +const struct file_operations gfs2_dir_fops_nolock = { + .iterate_shared = gfs2_readdir, + .unlocked_ioctl = gfs2_ioctl, + .compat_ioctl = gfs2_compat_ioctl, + .open = gfs2_open, + .release = gfs2_release, + .fsync = gfs2_fsync, + .llseek = default_llseek, +}; + diff --git a/fs/gfs2/gfs2.h b/fs/gfs2/gfs2.h new file mode 100644 index 0000000000..ed78e5f20f --- /dev/null +++ b/fs/gfs2/gfs2.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +#ifndef __GFS2_DOT_H__ +#define __GFS2_DOT_H__ + +enum { + NO_CREATE = 0, + CREATE = 1, +}; + +enum { + NO_FORCE = 0, + FORCE = 1, +}; + +#define GFS2_FAST_NAME_SIZE 8 + +#endif /* __GFS2_DOT_H__ */ + diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c new file mode 100644 index 0000000000..4a280be229 --- /dev/null +++ b/fs/gfs2/glock.c @@ -0,0 +1,2878 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/buffer_head.h> +#include <linux/delay.h> +#include <linux/sort.h> +#include <linux/hash.h> +#include <linux/jhash.h> +#include <linux/kallsyms.h> +#include <linux/gfs2_ondisk.h> +#include <linux/list.h> +#include <linux/wait.h> +#include <linux/module.h> +#include <linux/uaccess.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/workqueue.h> +#include <linux/jiffies.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/bit_spinlock.h> +#include <linux/percpu.h> +#include <linux/list_sort.h> +#include <linux/lockref.h> +#include <linux/rhashtable.h> +#include <linux/pid_namespace.h> +#include <linux/fdtable.h> +#include <linux/file.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "super.h" +#include "util.h" +#include "bmap.h" +#define CREATE_TRACE_POINTS +#include "trace_gfs2.h" + +struct gfs2_glock_iter { + struct gfs2_sbd *sdp; /* incore superblock */ + struct rhashtable_iter hti; /* rhashtable iterator */ + struct gfs2_glock *gl; /* current glock struct */ + loff_t last_pos; /* last position */ +}; + +typedef void (*glock_examiner) (struct gfs2_glock * gl); + +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); +static void __gfs2_glock_dq(struct gfs2_holder *gh); +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + unsigned long delay, bool remote); + +static struct dentry *gfs2_root; +static struct workqueue_struct *glock_workqueue; +static LIST_HEAD(lru_list); +static atomic_t lru_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(lru_lock); + +#define GFS2_GL_HASH_SHIFT 15 +#define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) + +static const struct rhashtable_params ht_parms = { + .nelem_hint = 
GFS2_GL_HASH_SIZE * 3 / 4, + .key_len = offsetofend(struct lm_lockname, ln_type), + .key_offset = offsetof(struct gfs2_glock, gl_name), + .head_offset = offsetof(struct gfs2_glock, gl_node), +}; + +static struct rhashtable gl_hash_table; + +#define GLOCK_WAIT_TABLE_BITS 12 +#define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS) +static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned; + +struct wait_glock_queue { + struct lm_lockname *name; + wait_queue_entry_t wait; +}; + +static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct wait_glock_queue *wait_glock = + container_of(wait, struct wait_glock_queue, wait); + struct lm_lockname *wait_name = wait_glock->name; + struct lm_lockname *wake_name = key; + + if (wake_name->ln_sbd != wait_name->ln_sbd || + wake_name->ln_number != wait_name->ln_number || + wake_name->ln_type != wait_name->ln_type) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) +{ + u32 hash = jhash2((u32 *)name, ht_parms.key_len / 4, 0); + + return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); +} + +/** + * wake_up_glock - Wake up waiters on a glock + * @gl: the glock + */ +static void wake_up_glock(struct gfs2_glock *gl) +{ + wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name); + + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &gl->gl_name); +} + +static void gfs2_glock_dealloc(struct rcu_head *rcu) +{ + struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); + + kfree(gl->gl_lksb.sb_lvbptr); + if (gl->gl_ops->go_flags & GLOF_ASPACE) { + struct gfs2_glock_aspace *gla = + container_of(gl, struct gfs2_glock_aspace, glock); + kmem_cache_free(gfs2_glock_aspace_cachep, gla); + } else + kmem_cache_free(gfs2_glock_cachep, gl); +} + +/** + * glock_blocked_by_withdraw - determine if we can still use a glock + * @gl: the glock + * + * We need to allow some glocks to be enqueued, dequeued, promoted, and demoted + * when we're withdrawn. For example, to maintain metadata integrity, we should + * disallow the use of inode and rgrp glocks when withdrawn. Other glocks like + * the iopen or freeze glock may be safely used because none of their + * metadata goes through the journal. So in general, we should disallow all + * glocks that are journaled, and allow all the others. One exception is: + * we need to allow our active journal to be promoted and demoted so others + * may recover it and we can reacquire it when they're done. 
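+ * + * In terms of the checks below: glocks with the GLOF_NONDISK flag and + * the glock of our own journal pass through; every other glock is + * blocked once the filesystem is withdrawn.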
+ */ +static bool glock_blocked_by_withdraw(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if (likely(!gfs2_withdrawn(sdp))) + return false; + if (gl->gl_ops->go_flags & GLOF_NONDISK) + return false; + if (!sdp->sd_jdesc || + gl->gl_name.ln_number == sdp->sd_jdesc->jd_no_addr) + return false; + return true; +} + +void gfs2_glock_free(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + gfs2_glock_assert_withdraw(gl, atomic_read(&gl->gl_revokes) == 0); + rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); + smp_mb(); + wake_up_glock(gl); + call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_kill_wait); +} + +/** + * gfs2_glock_hold() - increment reference count on glock + * @gl: The glock to hold + * + */ + +struct gfs2_glock *gfs2_glock_hold(struct gfs2_glock *gl) +{ + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); + lockref_get(&gl->gl_lockref); + return gl; +} + +/** + * demote_ok - Check to see if it's ok to unlock a glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int demote_ok(const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) + return glops->go_demote_ok(gl); + return 1; +} + + +void gfs2_glock_add_to_lru(struct gfs2_glock *gl) +{ + if (!(gl->gl_ops->go_flags & GLOF_LRU)) + return; + + spin_lock(&lru_lock); + + list_move_tail(&gl->gl_lru, &lru_list); + + if (!test_bit(GLF_LRU, &gl->gl_flags)) { + set_bit(GLF_LRU, &gl->gl_flags); + atomic_inc(&lru_count); + } + + spin_unlock(&lru_lock); +} + +static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) +{ + if (!(gl->gl_ops->go_flags & GLOF_LRU)) + return; + + spin_lock(&lru_lock); + if (test_bit(GLF_LRU, &gl->gl_flags)) { + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); + clear_bit(GLF_LRU, &gl->gl_flags); + } + spin_unlock(&lru_lock); +} + +/* + * Enqueue the glock on the work queue. Passes one glock reference on to the + * work queue. + */ +static void __gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { + if (!queue_delayed_work(glock_workqueue, &gl->gl_work, delay)) { + /* + * We are holding the lockref spinlock, and the work was still + * queued above. The queued work (glock_work_func) takes that + * spinlock before dropping its glock reference(s), so it + * cannot have dropped them in the meantime. + */ + GLOCK_BUG_ON(gl, gl->gl_lockref.count < 2); + gl->gl_lockref.count--; + } +} + +static void gfs2_glock_queue_work(struct gfs2_glock *gl, unsigned long delay) { + spin_lock(&gl->gl_lockref.lock); + __gfs2_glock_queue_work(gl, delay); + spin_unlock(&gl->gl_lockref.lock); +} + +static void __gfs2_glock_put(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct address_space *mapping = gfs2_glock2aspace(gl); + + lockref_mark_dead(&gl->gl_lockref); + spin_unlock(&gl->gl_lockref.lock); + gfs2_glock_remove_from_lru(gl); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); + if (mapping) { + truncate_inode_pages_final(mapping); + if (!gfs2_withdrawn(sdp)) + GLOCK_BUG_ON(gl, !mapping_empty(mapping)); + } + trace_gfs2_glock_put(gl); + sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); +} + +/* + * Cause the glock to be put in work queue context. 
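+ * The caller's reference is handed over to the work queue and is + * dropped from glock_work_func instead of the caller's context.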
+ */ +void gfs2_glock_queue_put(struct gfs2_glock *gl) +{ + gfs2_glock_queue_work(gl, 0); +} + +/** + * gfs2_glock_put() - Decrement reference count on glock + * @gl: The glock to put + * + */ + +void gfs2_glock_put(struct gfs2_glock *gl) +{ + if (lockref_put_or_lock(&gl->gl_lockref)) + return; + + __gfs2_glock_put(gl); +} + +/** + * may_grant - check if it's ok to grant a new lock + * @gl: The glock + * @current_gh: One of the current holders of @gl + * @gh: The lock request which we wish to grant + * + * With our current compatibility rules, if a glock has one or more active + * holders (HIF_HOLDER flag set), any of those holders can be passed in as + * @current_gh; they are all the same as far as compatibility with the new @gh + * goes. + * + * Returns true if it's ok to grant the lock. + */ + +static inline bool may_grant(struct gfs2_glock *gl, + struct gfs2_holder *current_gh, + struct gfs2_holder *gh) +{ + if (current_gh) { + GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags)); + + switch(current_gh->gh_state) { + case LM_ST_EXCLUSIVE: + /* + * Here we make a special exception to grant holders + * who agree to share the EX lock with other holders + * who also have the bit set. If the original holder + * has the LM_FLAG_NODE_SCOPE bit set, we grant more + * holders with the bit set. + */ + return gh->gh_state == LM_ST_EXCLUSIVE && + (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) && + (gh->gh_flags & LM_FLAG_NODE_SCOPE); + + case LM_ST_SHARED: + case LM_ST_DEFERRED: + return gh->gh_state == current_gh->gh_state; + + default: + return false; + } + } + + if (gl->gl_state == gh->gh_state) + return true; + if (gh->gh_flags & GL_EXACT) + return false; + if (gl->gl_state == LM_ST_EXCLUSIVE) { + return gh->gh_state == LM_ST_SHARED || + gh->gh_state == LM_ST_DEFERRED; + } + if (gh->gh_flags & LM_FLAG_ANY) + return gl->gl_state != LM_ST_UNLOCKED; + return false; +} + +static void gfs2_holder_wake(struct gfs2_holder *gh) +{ + clear_bit(HIF_WAIT, &gh->gh_iflags); + smp_mb__after_atomic(); + wake_up_bit(&gh->gh_iflags, HIF_WAIT); + if (gh->gh_flags & GL_ASYNC) { + struct gfs2_sbd *sdp = gh->gh_gl->gl_name.ln_sbd; + + wake_up(&sdp->sd_async_glock_wait); + } +} + +/** + * do_error - Something unexpected has happened during a lock request + * @gl: The glock + * @ret: The status from the DLM + */ + +static void do_error(struct gfs2_glock *gl, const int ret) +{ + struct gfs2_holder *gh, *tmp; + + list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (ret & LM_OUT_ERROR) + gh->gh_error = -EIO; + else if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) + gh->gh_error = GLR_TRYFAILED; + else + continue; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gfs2_holder_wake(gh); + } +} + +/** + * find_first_holder - find the first "holder" gh + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + if (!list_empty(&gl->gl_holders)) { + gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, + gh_list); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/* + * gfs2_instantiate - Call the glops instantiate function + * @gh: The glock holder + * + * Returns: 0 if instantiate was successful, or error.
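+ * + * Only one process may instantiate at a time: GLF_INSTANTIATE_IN_PROG is + * the interlock below, and waiters re-check GLF_INSTANTIATE_NEEDED once + * it clears.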
+ */ +int gfs2_instantiate(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + int ret; + +again: + if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags)) + goto done; + + /* + * Since we unlock the lockref lock, we set a flag to indicate + * instantiate is in progress. + */ + if (test_and_set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) { + wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG, + TASK_UNINTERRUPTIBLE); + /* + * Here we just waited for a different instantiate to finish. + * But that may not have been successful, as when a process + * locks an inode glock _before_ it has an actual inode to + * instantiate into. So we check again. This process might + * have an inode to instantiate, so might be successful. + */ + goto again; + } + + ret = glops->go_instantiate(gl); + if (!ret) + clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); + clear_and_wake_up_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); + if (ret) + return ret; + +done: + if (glops->go_held) + return glops->go_held(gh); + return 0; +} + +/** + * do_promote - promote as many requests as possible on the current queue + * @gl: The glock + * + * Returns true on success (i.e., progress was made or there are no waiters). + */ + +static bool do_promote(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh, *current_gh; + + current_gh = find_first_holder(gl); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (!may_grant(gl, current_gh, gh)) { + /* + * If we get here, it means we may not grant this + * holder for some reason. If this holder is at the + * head of the list, it means we have a blocked holder + * at the head, so return false. + */ + if (list_is_first(&gh->gh_list, &gl->gl_holders)) + return false; + do_error(gl, 0); + break; + } + set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh); + gfs2_holder_wake(gh); + if (!current_gh) + current_gh = gh; + } + return true; +} + +/** + * find_first_waiter - find the first gh that's waiting for the glock + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * state_change - record that the glock is now in a different state + * @gl: the glock + * @new_state: the new state + */ + +static void state_change(struct gfs2_glock *gl, unsigned int new_state) +{ + int held1, held2; + + held1 = (gl->gl_state != LM_ST_UNLOCKED); + held2 = (new_state != LM_ST_UNLOCKED); + + if (held1 != held2) { + GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); + if (held2) + gl->gl_lockref.count++; + else + gl->gl_lockref.count--; + } + if (new_state != gl->gl_target) + /* shorten our minimum hold time */ + gl->gl_hold_time = max(gl->gl_hold_time - GL_GLOCK_HOLD_DECR, + GL_GLOCK_MIN_HOLD); + gl->gl_state = new_state; + gl->gl_tchange = jiffies; +} + +static void gfs2_set_demote(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + set_bit(GLF_DEMOTE, &gl->gl_flags); + smp_mb(); + wake_up(&sdp->sd_async_glock_wait); +} + +static void gfs2_demote_wake(struct gfs2_glock *gl) +{ + gl->gl_demote_state = LM_ST_EXCLUSIVE; + clear_bit(GLF_DEMOTE, &gl->gl_flags); + smp_mb__after_atomic(); + wake_up_bit(&gl->gl_flags, GLF_DEMOTE); +} + +/** + * finish_xmote - The DLM has replied to one of our lock requests + * @gl: The glock + * 
@ret: The status from the DLM + * + */ + +static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_holder *gh; + unsigned state = ret & LM_OUT_ST_MASK; + + spin_lock(&gl->gl_lockref.lock); + trace_gfs2_glock_state_change(gl, state); + state_change(gl, state); + gh = find_first_waiter(gl); + + /* Demote to UN request arrived during demote to SH or DF */ + if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + state != LM_ST_UNLOCKED && gl->gl_demote_state == LM_ST_UNLOCKED) + gl->gl_target = LM_ST_UNLOCKED; + + /* Check for state != intended state */ + if (unlikely(state != gl->gl_target)) { + if (gh && (ret & LM_OUT_CANCELED)) + gfs2_holder_wake(gh); + if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { + /* move to back of queue and try next entry */ + if (ret & LM_OUT_CANCELED) { + list_move_tail(&gh->gh_list, &gl->gl_holders); + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + if (do_promote(gl)) + goto out; + goto retry; + } + /* Some error or failed "try lock" - report it */ + if ((ret & LM_OUT_ERROR) || + (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { + gl->gl_target = gl->gl_state; + do_error(gl, ret); + goto out; + } + } + switch(state) { + /* Unlocked due to conversion deadlock, try again */ + case LM_ST_UNLOCKED: +retry: + do_xmote(gl, gh, gl->gl_target); + break; + /* Conversion fails, unlock and try again */ + case LM_ST_SHARED: + case LM_ST_DEFERRED: + do_xmote(gl, gh, LM_ST_UNLOCKED); + break; + default: /* Everything else */ + fs_err(gl->gl_name.ln_sbd, "wanted %u got %u\n", + gl->gl_target, state); + GLOCK_BUG_ON(gl, 1); + } + spin_unlock(&gl->gl_lockref.lock); + return; + } + + /* Fast path - we got what we asked for */ + if (test_and_clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) + gfs2_demote_wake(gl); + if (state != LM_ST_UNLOCKED) { + if (glops->go_xmote_bh) { + int rv; + + spin_unlock(&gl->gl_lockref.lock); + rv = glops->go_xmote_bh(gl); + spin_lock(&gl->gl_lockref.lock); + if (rv) { + do_error(gl, rv); + goto out; + } + } + do_promote(gl); + } +out: + clear_bit(GLF_LOCK, &gl->gl_flags); + spin_unlock(&gl->gl_lockref.lock); +} + +static bool is_system_glock(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + + if (gl == m_ip->i_gl) + return true; + return false; +} + +/** + * do_xmote - Calls the DLM to change the state of a lock + * @gl: The lock state + * @gh: The holder (only for promotes) + * @target: The target lock state + * + */ + +static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, + unsigned int target) +__releases(&gl->gl_lockref.lock) +__acquires(&gl->gl_lockref.lock) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0); + int ret; + + if (target != LM_ST_UNLOCKED && glock_blocked_by_withdraw(gl) && + gh && !(gh->gh_flags & LM_FLAG_NOEXP)) + goto skip_inval; + + lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP); + GLOCK_BUG_ON(gl, gl->gl_state == target); + GLOCK_BUG_ON(gl, gl->gl_state == gl->gl_target); + if ((target == LM_ST_UNLOCKED || target == LM_ST_DEFERRED) && + glops->go_inval) { + /* + * If another process is already doing the invalidate, let that + * finish first. The glock state machine will get back to this + * holder again later. 
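+ * (GLF_INVALIDATE_IN_PROGRESS is the bit that provides this interlock.)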
+ */ + if (test_and_set_bit(GLF_INVALIDATE_IN_PROGRESS, + &gl->gl_flags)) + return; + do_error(gl, 0); /* Fail queued try locks */ + } + gl->gl_req = target; + set_bit(GLF_BLOCKING, &gl->gl_flags); + if ((gl->gl_req == LM_ST_UNLOCKED) || + (gl->gl_state == LM_ST_EXCLUSIVE) || + (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) + clear_bit(GLF_BLOCKING, &gl->gl_flags); + spin_unlock(&gl->gl_lockref.lock); + if (glops->go_sync) { + ret = glops->go_sync(gl); + /* If we had a problem syncing (due to I/O errors or whatever), + * we should not invalidate the metadata or tell dlm to + * release the glock to other nodes. + */ + if (ret) { + if (cmpxchg(&sdp->sd_log_error, 0, ret)) { + fs_err(sdp, "Error %d syncing glock\n", ret); + gfs2_dump_glock(NULL, gl, true); + } + goto skip_inval; + } + } + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) { + /* + * The call to go_sync should have cleared out the ail list. + * If there are still items, we have a problem. We ought to + * withdraw, but we can't because the withdraw code also uses + * glocks. Warn about the error, dump the glock, then fall + * through and wait for logd to do the withdraw for us. + */ + if ((atomic_read(&gl->gl_ail_count) != 0) && + (!cmpxchg(&sdp->sd_log_error, 0, -EIO))) { + gfs2_glock_assert_warn(gl, + !atomic_read(&gl->gl_ail_count)); + gfs2_dump_glock(NULL, gl, true); + } + glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + } + +skip_inval: + gfs2_glock_hold(gl); + /* + * Check for an error encountered since we called go_sync and go_inval. + * If so, we can't withdraw from the glock code because the withdraw + * code itself uses glocks (see function signal_our_withdraw) to + * change the mount to read-only. Most importantly, we must not call + * dlm to unlock the glock until the journal is in a known good state + * (after journal replay) otherwise other nodes may use the object + * (rgrp or dinode) and then later, journal replay will corrupt the + * file system. The best we can do here is wait for the logd daemon + * to see sd_log_error and withdraw, and in the meantime, requeue the + * work for later. + * + * We make a special exception for some system glocks, such as the + * system statfs inode glock, which needs to be granted before the + * gfs2_quotad daemon can exit, and that exit needs to finish before + * we can unmount the withdrawn file system. + * + * However, if we're just unlocking the lock (say, for unmount, when + * gfs2_gl_hash_clear calls clear_glock) and recovery is complete + * then it's okay to tell dlm to unlock it. + */ + if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp))) + gfs2_withdraw_delayed(sdp); + if (glock_blocked_by_withdraw(gl) && + (target != LM_ST_UNLOCKED || + test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) { + if (!is_system_glock(gl)) { + handle_callback(gl, LM_ST_UNLOCKED, 0, false); /* sets demote */ + /* + * Ordinarily, we would call dlm and its callback would call + * finish_xmote, which would call state_change() to the new state. + * Since we withdrew, we won't call dlm, so call state_change + * manually, but to the UNLOCKED state we desire. + */ + state_change(gl, LM_ST_UNLOCKED); + /* + * We skip telling dlm to do the locking, so we won't get a + * reply that would otherwise clear GLF_LOCK. So we clear it here.
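+ * GLF_DEMOTE_IN_PROGRESS is cleared for the same reason, and the work is + * requeued so the rest of the teardown happens later.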
+ */ + clear_bit(GLF_LOCK, &gl->gl_flags); + clear_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); + goto out; + } else { + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); + } + } + + if (sdp->sd_lockstruct.ls_ops->lm_lock) { + /* lock_dlm */ + ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); + if (ret == -EINVAL && gl->gl_target == LM_ST_UNLOCKED && + target == LM_ST_UNLOCKED && + test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags)) { + finish_xmote(gl, target); + gfs2_glock_queue_work(gl, 0); + } else if (ret) { + fs_err(sdp, "lm_lock ret %d\n", ret); + GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp)); + } + } else { /* lock_nolock */ + finish_xmote(gl, target); + gfs2_glock_queue_work(gl, 0); + } +out: + spin_lock(&gl->gl_lockref.lock); +} + +/** + * run_queue - do all outstanding tasks related to a glock + * @gl: The glock in question + * @nonblock: True if we must not block in run_queue + * + */ + +static void run_queue(struct gfs2_glock *gl, const int nonblock) +__releases(&gl->gl_lockref.lock) +__acquires(&gl->gl_lockref.lock) +{ + struct gfs2_holder *gh = NULL; + + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) + return; + + GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); + + if (test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_demote_state != gl->gl_state) { + if (find_first_holder(gl)) + goto out_unlock; + if (nonblock) + goto out_sched; + set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); + GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); + gl->gl_target = gl->gl_demote_state; + } else { + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) + gfs2_demote_wake(gl); + if (do_promote(gl)) + goto out_unlock; + gh = find_first_waiter(gl); + gl->gl_target = gh->gh_state; + if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) + do_error(gl, 0); /* Fail queued try locks */ + } + do_xmote(gl, gh, gl->gl_target); + return; + +out_sched: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_atomic(); + gl->gl_lockref.count++; + __gfs2_glock_queue_work(gl, 0); + return; + +out_unlock: + clear_bit(GLF_LOCK, &gl->gl_flags); + smp_mb__after_atomic(); + return; +} + +/** + * glock_set_object - set the gl_object field of a glock + * @gl: the glock + * @object: the object + */ +void glock_set_object(struct gfs2_glock *gl, void *object) +{ + void *prev_object; + + spin_lock(&gl->gl_lockref.lock); + prev_object = gl->gl_object; + gl->gl_object = object; + spin_unlock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) { + pr_warn("glock=%u/%llx\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + gfs2_dump_glock(NULL, gl, true); + } +} + +/** + * glock_clear_object - clear the gl_object field of a glock + * @gl: the glock + * @object: object the glock currently points at + */ +void glock_clear_object(struct gfs2_glock *gl, void *object) +{ + void *prev_object; + + spin_lock(&gl->gl_lockref.lock); + prev_object = gl->gl_object; + gl->gl_object = NULL; + spin_unlock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) { + pr_warn("glock=%u/%llx\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + gfs2_dump_glock(NULL, gl, true); + } +} + +void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) +{ + struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; + + if (ri->ri_magic == 0) + ri->ri_magic = cpu_to_be32(GFS2_MAGIC); + if (ri->ri_magic == cpu_to_be32(GFS2_MAGIC)) + ri->ri_generation_deleted = 
cpu_to_be64(generation); +} + +bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation) +{ + struct gfs2_inode_lvb *ri = (void *)gl->gl_lksb.sb_lvbptr; + + if (ri->ri_magic != cpu_to_be32(GFS2_MAGIC)) + return false; + return generation <= be64_to_cpu(ri->ri_generation_deleted); +} + +static void gfs2_glock_poke(struct gfs2_glock *gl) +{ + int flags = LM_FLAG_TRY_1CB | LM_FLAG_ANY | GL_SKIP; + struct gfs2_holder gh; + int error; + + __gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh, _RET_IP_); + error = gfs2_glock_nq(&gh); + if (!error) + gfs2_glock_dq(&gh); + gfs2_holder_uninit(&gh); +} + +static bool gfs2_try_evict(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + bool evicted = false; + + /* + * If there is contention on the iopen glock and we have an inode, try + * to grab and release the inode so that it can be evicted. This will + * allow the remote node to go ahead and delete the inode without us + * having to do it, which will avoid rgrp glock thrashing. + * + * The remote node is likely still holding the corresponding inode + * glock, so it will run before we get to verify that the delete has + * happened below. + */ + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip && !igrab(&ip->i_inode)) + ip = NULL; + spin_unlock(&gl->gl_lockref.lock); + if (ip) { + gl->gl_no_formal_ino = ip->i_no_formal_ino; + set_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + d_prune_aliases(&ip->i_inode); + iput(&ip->i_inode); + + /* If the inode was evicted, gl->gl_object will now be NULL. */ + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip) { + clear_bit(GIF_DEFERRED_DELETE, &ip->i_flags); + if (!igrab(&ip->i_inode)) + ip = NULL; + } + spin_unlock(&gl->gl_lockref.lock); + if (ip) { + gfs2_glock_poke(ip->i_gl); + iput(&ip->i_inode); + } + evicted = !ip; + } + return evicted; +} + +bool gfs2_queue_try_to_evict(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if (test_and_set_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) + return false; + return queue_delayed_work(sdp->sd_delete_wq, + &gl->gl_delete, 0); +} + +static bool gfs2_queue_verify_evict(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if (test_and_set_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) + return false; + return queue_delayed_work(sdp->sd_delete_wq, + &gl->gl_delete, 5 * HZ); +} + +static void delete_work_func(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct gfs2_glock *gl = container_of(dwork, struct gfs2_glock, gl_delete); + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct inode *inode; + u64 no_addr = gl->gl_name.ln_number; + + if (test_and_clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags)) { + /* + * If we can evict the inode, give the remote node trying to + * delete the inode some time before verifying that the delete + * has happened. Otherwise, if we cause contention on the inode glock + * immediately, the remote node will think that we still have + * the inode in use, and so it will give up waiting. + * + * If we can't evict the inode, signal to the remote node that + * the inode is still in use. We'll later try to delete the + * inode locally in gfs2_evict_inode. + * + * FIXME: We only need to verify that the remote node has + * deleted the inode because nodes before this remote delete + * rework won't cooperate. At a later time, when we no longer + * care about compatibility with such nodes, we can skip this + * step entirely. 
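+ * The verification pass is queued with a five second delay (see + * gfs2_queue_verify_evict above) to give the remote node time to carry + * out the delete.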
+ */ + if (gfs2_try_evict(gl)) { + if (test_bit(SDF_KILL, &sdp->sd_flags)) + goto out; + if (gfs2_queue_verify_evict(gl)) + return; + } + goto out; + } + + if (test_and_clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags)) { + inode = gfs2_lookup_by_inum(sdp, no_addr, gl->gl_no_formal_ino, + GFS2_BLKST_UNLINKED); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -EAGAIN && + !test_bit(SDF_KILL, &sdp->sd_flags) && + gfs2_queue_verify_evict(gl)) + return; + } else { + d_prune_aliases(inode); + iput(inode); + } + } + +out: + gfs2_glock_put(gl); +} + +static void glock_work_func(struct work_struct *work) +{ + unsigned long delay = 0; + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work); + unsigned int drop_refs = 1; + + if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) { + finish_xmote(gl, gl->gl_reply); + drop_refs++; + } + spin_lock(&gl->gl_lockref.lock); + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + gl->gl_state != LM_ST_UNLOCKED && + gl->gl_demote_state != LM_ST_EXCLUSIVE) { + unsigned long holdtime, now = jiffies; + + holdtime = gl->gl_tchange + gl->gl_hold_time; + if (time_before(now, holdtime)) + delay = holdtime - now; + + if (!delay) { + clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + gfs2_set_demote(gl); + } + } + run_queue(gl, 0); + if (delay) { + /* Keep one glock reference for the work we requeue. */ + drop_refs--; + if (gl->gl_name.ln_type != LM_TYPE_INODE) + delay = 0; + __gfs2_glock_queue_work(gl, delay); + } + + /* + * Drop the remaining glock references manually here. (Mind that + * __gfs2_glock_queue_work depends on the lockref spinlock being held + * here as well.) + */ + gl->gl_lockref.count -= drop_refs; + if (!gl->gl_lockref.count) { + __gfs2_glock_put(gl); + return; + } + spin_unlock(&gl->gl_lockref.lock); +} + +static struct gfs2_glock *find_insert_glock(struct lm_lockname *name, + struct gfs2_glock *new) +{ + struct wait_glock_queue wait; + wait_queue_head_t *wq = glock_waitqueue(name); + struct gfs2_glock *gl; + + wait.name = name; + init_wait(&wait.wait); + wait.wait.func = glock_wake_function; + +again: + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + rcu_read_lock(); + if (new) { + gl = rhashtable_lookup_get_insert_fast(&gl_hash_table, + &new->gl_node, ht_parms); + if (IS_ERR(gl)) + goto out; + } else { + gl = rhashtable_lookup_fast(&gl_hash_table, + name, ht_parms); + } + if (gl && !lockref_get_not_dead(&gl->gl_lockref)) { + rcu_read_unlock(); + schedule(); + goto again; + } +out: + rcu_read_unlock(); + finish_wait(wq, &wait.wait); + return gl; +} + +/** + * gfs2_glock_get() - Get a glock, or create one if one doesn't exist + * @sdp: The GFS2 superblock + * @number: the lock number + * @glops: The glock_operations to use + * @create: If 0, don't create the glock if it doesn't exist + * @glp: the glock is returned here + * + * This does not lock a glock, just finds/creates structures for one.
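+ * + * Typical usage is roughly (an illustrative sketch, here with the inode + * glops as an example): + * + *   error = gfs2_glock_get(sdp, number, &gfs2_inode_glops, CREATE, &gl); + *   if (error) + *           return error; + *   ... enqueue and dequeue holders on gl ... + *   gfs2_glock_put(gl);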
+ * + * Returns: errno + */ + +int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, int create, + struct gfs2_glock **glp) +{ + struct super_block *s = sdp->sd_vfs; + struct lm_lockname name = { .ln_number = number, + .ln_type = glops->go_type, + .ln_sbd = sdp }; + struct gfs2_glock *gl, *tmp; + struct address_space *mapping; + int ret = 0; + + gl = find_insert_glock(&name, NULL); + if (gl) { + *glp = gl; + return 0; + } + if (!create) + return -ENOENT; + + if (glops->go_flags & GLOF_ASPACE) { + struct gfs2_glock_aspace *gla = + kmem_cache_alloc(gfs2_glock_aspace_cachep, GFP_NOFS); + if (!gla) + return -ENOMEM; + gl = &gla->glock; + } else { + gl = kmem_cache_alloc(gfs2_glock_cachep, GFP_NOFS); + if (!gl) + return -ENOMEM; + } + memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); + gl->gl_ops = glops; + + if (glops->go_flags & GLOF_LVB) { + gl->gl_lksb.sb_lvbptr = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); + if (!gl->gl_lksb.sb_lvbptr) { + gfs2_glock_dealloc(&gl->gl_rcu); + return -ENOMEM; + } + } + + atomic_inc(&sdp->sd_glock_disposal); + gl->gl_node.next = NULL; + gl->gl_flags = glops->go_instantiate ? BIT(GLF_INSTANTIATE_NEEDED) : 0; + gl->gl_name = name; + lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass); + gl->gl_lockref.count = 1; + gl->gl_state = LM_ST_UNLOCKED; + gl->gl_target = LM_ST_UNLOCKED; + gl->gl_demote_state = LM_ST_EXCLUSIVE; + gl->gl_dstamp = 0; + preempt_disable(); + /* We use the global stats to estimate the initial per-glock stats */ + gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type]; + preempt_enable(); + gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; + gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; + gl->gl_tchange = jiffies; + gl->gl_object = NULL; + gl->gl_hold_time = GL_GLOCK_DFT_HOLD; + INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); + if (gl->gl_name.ln_type == LM_TYPE_IOPEN) + INIT_DELAYED_WORK(&gl->gl_delete, delete_work_func); + + mapping = gfs2_glock2aspace(gl); + if (mapping) { + mapping->a_ops = &gfs2_meta_aops; + mapping->host = s->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->writeback_index = 0; + } + + tmp = find_insert_glock(&name, gl); + if (!tmp) { + *glp = gl; + goto out; + } + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto out_free; + } + *glp = tmp; + +out_free: + gfs2_glock_dealloc(&gl->gl_rcu); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_kill_wait); + +out: + return ret; +} + +/** + * __gfs2_holder_init - initialize a struct gfs2_holder in the default way + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + */ + +void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, + struct gfs2_holder *gh, unsigned long ip) +{ + INIT_LIST_HEAD(&gh->gh_list); + gh->gh_gl = gfs2_glock_hold(gl); + gh->gh_ip = ip; + gh->gh_owner_pid = get_pid(task_pid(current)); + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_iflags = 0; +} + +/** + * gfs2_holder_reinit - reinitialize a struct gfs2_holder so we can requeue it + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Don't mess with the glock. 
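+ *
+ * A typical pattern (sketch): dequeue, then requeue the same holder
+ * in a different mode without dropping the glock reference:
+ *
+ *	gfs2_glock_dq(gh);
+ *	gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, gh);
+ *	error = gfs2_glock_nq(gh);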
+ * + */ + +void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh) +{ + gh->gh_state = state; + gh->gh_flags = flags; + gh->gh_iflags = 0; + gh->gh_ip = _RET_IP_; + put_pid(gh->gh_owner_pid); + gh->gh_owner_pid = get_pid(task_pid(current)); +} + +/** + * gfs2_holder_uninit - uninitialize a holder structure (drop glock reference) + * @gh: the holder structure + * + */ + +void gfs2_holder_uninit(struct gfs2_holder *gh) +{ + put_pid(gh->gh_owner_pid); + gfs2_glock_put(gh->gh_gl); + gfs2_holder_mark_uninitialized(gh); + gh->gh_ip = 0; +} + +static void gfs2_glock_update_hold_time(struct gfs2_glock *gl, + unsigned long start_time) +{ + /* Have we waited longer that a second? */ + if (time_after(jiffies, start_time + HZ)) { + /* Lengthen the minimum hold time. */ + gl->gl_hold_time = min(gl->gl_hold_time + GL_GLOCK_HOLD_INCR, + GL_GLOCK_MAX_HOLD); + } +} + +/** + * gfs2_glock_holder_ready - holder is ready and its error code can be collected + * @gh: the glock holder + * + * Called when a glock holder no longer needs to be waited for because it is + * now either held (HIF_HOLDER set; gh_error == 0), or acquiring the lock has + * failed (gh_error != 0). + */ + +int gfs2_glock_holder_ready(struct gfs2_holder *gh) +{ + if (gh->gh_error || (gh->gh_flags & GL_SKIP)) + return gh->gh_error; + gh->gh_error = gfs2_instantiate(gh); + if (gh->gh_error) + gfs2_glock_dq(gh); + return gh->gh_error; +} + +/** + * gfs2_glock_wait - wait on a glock acquisition + * @gh: the glock holder + * + * Returns: 0 on success + */ + +int gfs2_glock_wait(struct gfs2_holder *gh) +{ + unsigned long start_time = jiffies; + + might_sleep(); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); + gfs2_glock_update_hold_time(gh->gh_gl, start_time); + return gfs2_glock_holder_ready(gh); +} + +static int glocks_pending(unsigned int num_gh, struct gfs2_holder *ghs) +{ + int i; + + for (i = 0; i < num_gh; i++) + if (test_bit(HIF_WAIT, &ghs[i].gh_iflags)) + return 1; + return 0; +} + +/** + * gfs2_glock_async_wait - wait on multiple asynchronous glock acquisitions + * @num_gh: the number of holders in the array + * @ghs: the glock holder array + * + * Returns: 0 on success, meaning all glocks have been granted and are held. + * -ESTALE if the request timed out, meaning all glocks were released, + * and the caller should retry the operation. + */ + +int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs) +{ + struct gfs2_sbd *sdp = ghs[0].gh_gl->gl_name.ln_sbd; + int i, ret = 0, timeout = 0; + unsigned long start_time = jiffies; + + might_sleep(); + /* + * Total up the (minimum hold time * 2) of all glocks and use that to + * determine the max amount of time we should wait. + */ + for (i = 0; i < num_gh; i++) + timeout += ghs[i].gh_gl->gl_hold_time << 1; + + if (!wait_event_timeout(sdp->sd_async_glock_wait, + !glocks_pending(num_gh, ghs), timeout)) { + ret = -ESTALE; /* request timed out. 
*/ + goto out; + } + + for (i = 0; i < num_gh; i++) { + struct gfs2_holder *gh = &ghs[i]; + int ret2; + + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) { + gfs2_glock_update_hold_time(gh->gh_gl, + start_time); + } + ret2 = gfs2_glock_holder_ready(gh); + if (!ret) + ret = ret2; + } + +out: + if (ret) { + for (i = 0; i < num_gh; i++) { + struct gfs2_holder *gh = &ghs[i]; + + gfs2_glock_dq(gh); + } + } + return ret; +} + +/** + * handle_callback - process a demote request + * @gl: the glock + * @state: the state the caller wants us to change to + * @delay: zero to demote immediately; otherwise pending demote + * @remote: true if this came from a different cluster node + * + * There are only two requests that we are going to see in actual + * practise: LM_ST_SHARED and LM_ST_UNLOCKED + */ + +static void handle_callback(struct gfs2_glock *gl, unsigned int state, + unsigned long delay, bool remote) +{ + if (delay) + set_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + else + gfs2_set_demote(gl); + if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { + gl->gl_demote_state = state; + gl->gl_demote_time = jiffies; + } else if (gl->gl_demote_state != LM_ST_UNLOCKED && + gl->gl_demote_state != state) { + gl->gl_demote_state = LM_ST_UNLOCKED; + } + if (gl->gl_ops->go_callback) + gl->gl_ops->go_callback(gl, remote); + trace_gfs2_demote_rq(gl, remote); +} + +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + if (seq) { + seq_vprintf(seq, fmt, args); + } else { + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("%pV", &vaf); + } + + va_end(args); +} + +static inline bool pid_is_meaningful(const struct gfs2_holder *gh) +{ + if (!(gh->gh_flags & GL_NOPID)) + return true; + if (gh->gh_state == LM_ST_UNLOCKED) + return true; + return false; +} + +/** + * add_to_queue - Add a holder to the wait queue (but look for recursion) + * @gh: the holder structure to add + * + * Eventually we should move the recursive locking trap to a + * debugging option or something like that. This is the fast + * path and needs to have the minimum number of distractions. 
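+ *
+ * The trap catches a task queueing a second holder for a glock it has
+ * already queued one for, e.g. (sketch of the bug being detected):
+ *
+ *	gfs2_holder_init(gl, LM_ST_SHARED, 0, &gh_a);
+ *	gfs2_glock_nq(&gh_a);
+ *	gfs2_holder_init(gl, LM_ST_EXCLUSIVE, 0, &gh_b);
+ *	gfs2_glock_nq(&gh_b);	-- same owner pid: dump the glock, BUG()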
+ * + */ + +static inline void add_to_queue(struct gfs2_holder *gh) +__releases(&gl->gl_lockref.lock) +__acquires(&gl->gl_lockref.lock) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct list_head *insert_pt = NULL; + struct gfs2_holder *gh2; + int try_futile = 0; + + GLOCK_BUG_ON(gl, gh->gh_owner_pid == NULL); + if (test_and_set_bit(HIF_WAIT, &gh->gh_iflags)) + GLOCK_BUG_ON(gl, true); + + if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { + if (test_bit(GLF_LOCK, &gl->gl_flags)) { + struct gfs2_holder *current_gh; + + current_gh = find_first_holder(gl); + try_futile = !may_grant(gl, current_gh, gh); + } + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) + goto fail; + } + + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (likely(gh2->gh_owner_pid != gh->gh_owner_pid)) + continue; + if (gh->gh_gl->gl_ops->go_type == LM_TYPE_FLOCK) + continue; + if (!pid_is_meaningful(gh2)) + continue; + goto trap_recursive; + } + list_for_each_entry(gh2, &gl->gl_holders, gh_list) { + if (try_futile && + !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { +fail: + gh->gh_error = GLR_TRYFAILED; + gfs2_holder_wake(gh); + return; + } + if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) + continue; + } + trace_gfs2_glock_queue(gh, 1); + gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); + if (likely(insert_pt == NULL)) { + list_add_tail(&gh->gh_list, &gl->gl_holders); + return; + } + list_add_tail(&gh->gh_list, insert_pt); + gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); + spin_unlock(&gl->gl_lockref.lock); + if (sdp->sd_lockstruct.ls_ops->lm_cancel) + sdp->sd_lockstruct.ls_ops->lm_cancel(gl); + spin_lock(&gl->gl_lockref.lock); + return; + +trap_recursive: + fs_err(sdp, "original: %pSR\n", (void *)gh2->gh_ip); + fs_err(sdp, "pid: %d\n", pid_nr(gh2->gh_owner_pid)); + fs_err(sdp, "lock type: %d req lock state : %d\n", + gh2->gh_gl->gl_name.ln_type, gh2->gh_state); + fs_err(sdp, "new: %pSR\n", (void *)gh->gh_ip); + fs_err(sdp, "pid: %d\n", pid_nr(gh->gh_owner_pid)); + fs_err(sdp, "lock type: %d req lock state : %d\n", + gh->gh_gl->gl_name.ln_type, gh->gh_state); + gfs2_dump_glock(NULL, gl, true); + BUG(); +} + +/** + * gfs2_glock_nq - enqueue a struct gfs2_holder onto a glock (acquire a glock) + * @gh: the holder structure + * + * if (gh->gh_flags & GL_ASYNC), this never returns an error + * + * Returns: 0, GLR_TRYFAILED, or errno on failure + */ + +int gfs2_glock_nq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + int error = 0; + + if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP)) + return -EIO; + + if (test_bit(GLF_LRU, &gl->gl_flags)) + gfs2_glock_remove_from_lru(gl); + + gh->gh_error = 0; + spin_lock(&gl->gl_lockref.lock); + add_to_queue(gh); + if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) && + test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) { + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gl->gl_lockref.count++; + __gfs2_glock_queue_work(gl, 0); + } + run_queue(gl, 1); + spin_unlock(&gl->gl_lockref.lock); + + if (!(gh->gh_flags & GL_ASYNC)) + error = gfs2_glock_wait(gh); + + return error; +} + +/** + * gfs2_glock_poll - poll to see if an async request has been completed + * @gh: the holder + * + * Returns: 1 if the request is ready to be gfs2_glock_wait()ed on + */ + +int gfs2_glock_poll(struct gfs2_holder *gh) +{ + return test_bit(HIF_WAIT, &gh->gh_iflags) ? 
0 : 1; +} + +static inline bool needs_demote(struct gfs2_glock *gl) +{ + return (test_bit(GLF_DEMOTE, &gl->gl_flags) || + test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)); +} + +static void __gfs2_glock_dq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + unsigned delay = 0; + int fast_path = 0; + + /* + * This holder should not be cached, so mark it for demote. + * Note: this should be done before the check for needs_demote + * below. + */ + if (gh->gh_flags & GL_NOCACHE) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + + list_del_init(&gh->gh_list); + clear_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_glock_queue(gh, 0); + + /* + * If there hasn't been a demote request we are done. + * (Let the remaining holders, if any, keep holding it.) + */ + if (!needs_demote(gl)) { + if (list_empty(&gl->gl_holders)) + fast_path = 1; + } + + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) + gfs2_glock_add_to_lru(gl); + + if (unlikely(!fast_path)) { + gl->gl_lockref.count++; + if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags) && + gl->gl_name.ln_type == LM_TYPE_INODE) + delay = gl->gl_hold_time; + __gfs2_glock_queue_work(gl, delay); + } +} + +/** + * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) + * @gh: the glock holder + * + */ +void gfs2_glock_dq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + spin_lock(&gl->gl_lockref.lock); + if (!gfs2_holder_queued(gh)) { + /* + * May have already been dequeued because the locking request + * was GL_ASYNC and it has failed in the meantime. + */ + goto out; + } + + if (list_is_first(&gh->gh_list, &gl->gl_holders) && + !test_bit(HIF_HOLDER, &gh->gh_iflags)) { + spin_unlock(&gl->gl_lockref.lock); + gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); + } + + /* + * If we're in the process of file system withdraw, we cannot just + * dequeue any glocks until our journal is recovered, lest we introduce + * file system corruption. We need two exceptions to this rule: We need + * to allow unlocking of nondisk glocks and the glock for our own + * journal that needs recovery. 
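+	 *
+	 * That is exactly the condition tested below: wait only while
+	 * SDF_WITHDRAW_RECOVERY is set, the glock is blocked by the
+	 * withdraw, and the holder is not for our own journal inode
+	 * glock (sdp->sd_jinode_gl).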
+ */ + if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && + glock_blocked_by_withdraw(gl) && + gh->gh_gl != sdp->sd_jinode_gl) { + sdp->sd_glock_dqs_held++; + spin_unlock(&gl->gl_lockref.lock); + might_sleep(); + wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, + TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); + } + + __gfs2_glock_dq(gh); +out: + spin_unlock(&gl->gl_lockref.lock); +} + +void gfs2_glock_dq_wait(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + gfs2_glock_dq(gh); + might_sleep(); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); +} + +/** + * gfs2_glock_dq_uninit - dequeue a holder from a glock and initialize it + * @gh: the holder structure + * + */ + +void gfs2_glock_dq_uninit(struct gfs2_holder *gh) +{ + gfs2_glock_dq(gh); + gfs2_holder_uninit(gh); +} + +/** + * gfs2_glock_nq_num - acquire a glock based on lock number + * @sdp: the filesystem + * @number: the lock number + * @glops: the glock operations for the type of glock + * @state: the state to acquire the glock in + * @flags: modifier flags for the acquisition + * @gh: the struct gfs2_holder + * + * Returns: errno + */ + +int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, u16 flags, struct gfs2_holder *gh) +{ + struct gfs2_glock *gl; + int error; + + error = gfs2_glock_get(sdp, number, glops, CREATE, &gl); + if (!error) { + error = gfs2_glock_nq_init(gl, state, flags, gh); + gfs2_glock_put(gl); + } + + return error; +} + +/** + * glock_compare - Compare two struct gfs2_glock structures for sorting + * @arg_a: the first structure + * @arg_b: the second structure + * + */ + +static int glock_compare(const void *arg_a, const void *arg_b) +{ + const struct gfs2_holder *gh_a = *(const struct gfs2_holder **)arg_a; + const struct gfs2_holder *gh_b = *(const struct gfs2_holder **)arg_b; + const struct lm_lockname *a = &gh_a->gh_gl->gl_name; + const struct lm_lockname *b = &gh_b->gh_gl->gl_name; + + if (a->ln_number > b->ln_number) + return 1; + if (a->ln_number < b->ln_number) + return -1; + BUG_ON(gh_a->gh_gl->gl_ops->go_type == gh_b->gh_gl->gl_ops->go_type); + return 0; +} + +/** + * nq_m_sync - synchronously acquire more than one glock in deadlock free order + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * @p: placeholder for the holder structure to pass back + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +static int nq_m_sync(unsigned int num_gh, struct gfs2_holder *ghs, + struct gfs2_holder **p) +{ + unsigned int x; + int error = 0; + + for (x = 0; x < num_gh; x++) + p[x] = &ghs[x]; + + sort(p, num_gh, sizeof(struct gfs2_holder *), glock_compare, NULL); + + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(p[x]); + if (error) { + while (x--) + gfs2_glock_dq(p[x]); + break; + } + } + + return error; +} + +/** + * gfs2_glock_nq_m - acquire multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + * Returns: 0 on success (all glocks acquired), + * errno on failure (no glocks acquired) + */ + +int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + struct gfs2_holder *tmp[4]; + struct gfs2_holder **pph = tmp; + int error = 0; + + switch(num_gh) { + case 0: + return 0; + case 1: + return gfs2_glock_nq(ghs); + default: + if (num_gh <= 4) + break; + pph = kmalloc_array(num_gh, sizeof(struct gfs2_holder *), + GFP_NOFS); + if (!pph) + return 
-ENOMEM; + } + + error = nq_m_sync(num_gh, ghs, pph); + + if (pph != tmp) + kfree(pph); + + return error; +} + +/** + * gfs2_glock_dq_m - release multiple glocks + * @num_gh: the number of structures + * @ghs: an array of struct gfs2_holder structures + * + */ + +void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) +{ + while (num_gh--) + gfs2_glock_dq(&ghs[num_gh]); +} + +void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) +{ + unsigned long delay = 0; + unsigned long holdtime; + unsigned long now = jiffies; + + gfs2_glock_hold(gl); + spin_lock(&gl->gl_lockref.lock); + holdtime = gl->gl_tchange + gl->gl_hold_time; + if (!list_empty(&gl->gl_holders) && + gl->gl_name.ln_type == LM_TYPE_INODE) { + if (time_before(now, holdtime)) + delay = holdtime - now; + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + delay = gl->gl_hold_time; + } + handle_callback(gl, state, delay, true); + __gfs2_glock_queue_work(gl, delay); + spin_unlock(&gl->gl_lockref.lock); +} + +/** + * gfs2_should_freeze - Figure out if glock should be frozen + * @gl: The glock in question + * + * Glocks are not frozen if (a) the result of the dlm operation is + * an error, (b) the locking operation was an unlock operation or + * (c) if there is a "noexp" flagged request anywhere in the queue + * + * Returns: 1 if freezing should occur, 0 otherwise + */ + +static int gfs2_should_freeze(const struct gfs2_glock *gl) +{ + const struct gfs2_holder *gh; + + if (gl->gl_reply & ~LM_OUT_ST_MASK) + return 0; + if (gl->gl_target == LM_ST_UNLOCKED) + return 0; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + continue; + if (LM_FLAG_NOEXP & gh->gh_flags) + return 0; + } + + return 1; +} + +/** + * gfs2_glock_complete - Callback used by locking + * @gl: Pointer to the glock + * @ret: The return value from the dlm + * + * The gl_reply field is under the gl_lockref.lock lock so that it is ok + * to use a bitfield shared with other glock state fields. + */ + +void gfs2_glock_complete(struct gfs2_glock *gl, int ret) +{ + struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; + + spin_lock(&gl->gl_lockref.lock); + gl->gl_reply = ret; + + if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { + if (gfs2_should_freeze(gl)) { + set_bit(GLF_FROZEN, &gl->gl_flags); + spin_unlock(&gl->gl_lockref.lock); + return; + } + } + + gl->gl_lockref.count++; + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + __gfs2_glock_queue_work(gl, 0); + spin_unlock(&gl->gl_lockref.lock); +} + +static int glock_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct gfs2_glock *gla, *glb; + + gla = list_entry(a, struct gfs2_glock, gl_lru); + glb = list_entry(b, struct gfs2_glock, gl_lru); + + if (gla->gl_name.ln_number > glb->gl_name.ln_number) + return 1; + if (gla->gl_name.ln_number < glb->gl_name.ln_number) + return -1; + + return 0; +} + +/** + * gfs2_dispose_glock_lru - Demote a list of glocks + * @list: The list to dispose of + * + * Disposing of glocks may involve disk accesses, so that here we sort + * the glocks by number (i.e. disk location of the inodes) so that if + * there are any such accesses, they'll be sent in order (mostly). + * + * Must be called under the lru_lock, but may drop and retake this + * lock. 
While the lru_lock is dropped, entries may vanish from the + * list, but no new entries will appear on the list (since it is + * private) + */ + +static void gfs2_dispose_glock_lru(struct list_head *list) +__releases(&lru_lock) +__acquires(&lru_lock) +{ + struct gfs2_glock *gl; + + list_sort(NULL, list, glock_cmp); + + while(!list_empty(list)) { + gl = list_first_entry(list, struct gfs2_glock, gl_lru); + list_del_init(&gl->gl_lru); + clear_bit(GLF_LRU, &gl->gl_flags); + if (!spin_trylock(&gl->gl_lockref.lock)) { +add_back_to_lru: + list_add(&gl->gl_lru, &lru_list); + set_bit(GLF_LRU, &gl->gl_flags); + atomic_inc(&lru_count); + continue; + } + if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + spin_unlock(&gl->gl_lockref.lock); + goto add_back_to_lru; + } + gl->gl_lockref.count++; + if (demote_ok(gl)) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); + __gfs2_glock_queue_work(gl, 0); + spin_unlock(&gl->gl_lockref.lock); + cond_resched_lock(&lru_lock); + } +} + +/** + * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote + * @nr: The number of entries to scan + * + * This function selects the entries on the LRU which are able to + * be demoted, and then kicks off the process by calling + * gfs2_dispose_glock_lru() above. + */ + +static long gfs2_scan_glock_lru(int nr) +{ + struct gfs2_glock *gl, *next; + LIST_HEAD(dispose); + long freed = 0; + + spin_lock(&lru_lock); + list_for_each_entry_safe(gl, next, &lru_list, gl_lru) { + if (nr-- <= 0) + break; + /* Test for being demotable */ + if (!test_bit(GLF_LOCK, &gl->gl_flags)) { + if (!spin_trylock(&gl->gl_lockref.lock)) + continue; + if (gl->gl_lockref.count <= 1 && + (gl->gl_state == LM_ST_UNLOCKED || + demote_ok(gl))) { + list_move(&gl->gl_lru, &dispose); + atomic_dec(&lru_count); + freed++; + } + spin_unlock(&gl->gl_lockref.lock); + } + } + if (!list_empty(&dispose)) + gfs2_dispose_glock_lru(&dispose); + spin_unlock(&lru_lock); + + return freed; +} + +static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + return gfs2_scan_glock_lru(sc->nr_to_scan); +} + +static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return vfs_pressure_ratio(atomic_read(&lru_count)); +} + +static struct shrinker glock_shrinker = { + .seeks = DEFAULT_SEEKS, + .count_objects = gfs2_glock_shrink_count, + .scan_objects = gfs2_glock_shrink_scan, +}; + +/** + * glock_hash_walk - Call a function for glock in a hash bucket + * @examiner: the function + * @sdp: the filesystem + * + * Note that the function can be called multiple times on the same + * object. So the user must ensure that the function can cope with + * that. 
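+ *
+ * Examiners should therefore be idempotent; thaw_glock() below is a
+ * good model (sketch):
+ *
+ *	static void examiner(struct gfs2_glock *gl)
+ *	{
+ *		if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+ *			return;		-- a repeated call is a no-op
+ *		...
+ *	}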
+ */ + +static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) +{ + struct gfs2_glock *gl; + struct rhashtable_iter iter; + + rhashtable_walk_enter(&gl_hash_table, &iter); + + do { + rhashtable_walk_start(&iter); + + while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) { + if (gl->gl_name.ln_sbd == sdp) + examiner(gl); + } + + rhashtable_walk_stop(&iter); + } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); + + rhashtable_walk_exit(&iter); +} + +void gfs2_cancel_delete_work(struct gfs2_glock *gl) +{ + clear_bit(GLF_TRY_TO_EVICT, &gl->gl_flags); + clear_bit(GLF_VERIFY_EVICT, &gl->gl_flags); + if (cancel_delayed_work(&gl->gl_delete)) + gfs2_glock_put(gl); +} + +static void flush_delete_work(struct gfs2_glock *gl) +{ + if (gl->gl_name.ln_type == LM_TYPE_IOPEN) { + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if (cancel_delayed_work(&gl->gl_delete)) { + queue_delayed_work(sdp->sd_delete_wq, + &gl->gl_delete, 0); + } + } +} + +void gfs2_flush_delete_work(struct gfs2_sbd *sdp) +{ + glock_hash_walk(flush_delete_work, sdp); + flush_workqueue(sdp->sd_delete_wq); +} + +/** + * thaw_glock - thaw out a glock which has an unprocessed reply waiting + * @gl: The glock to thaw + * + */ + +static void thaw_glock(struct gfs2_glock *gl) +{ + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + return; + if (!lockref_get_not_dead(&gl->gl_lockref)) + return; + set_bit(GLF_REPLY_PENDING, &gl->gl_flags); + gfs2_glock_queue_work(gl, 0); +} + +/** + * clear_glock - look at a glock and see if we can free it from glock cache + * @gl: the glock to look at + * + */ + +static void clear_glock(struct gfs2_glock *gl) +{ + gfs2_glock_remove_from_lru(gl); + + spin_lock(&gl->gl_lockref.lock); + if (!__lockref_is_dead(&gl->gl_lockref)) { + gl->gl_lockref.count++; + if (gl->gl_state != LM_ST_UNLOCKED) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + __gfs2_glock_queue_work(gl, 0); + } + spin_unlock(&gl->gl_lockref.lock); +} + +/** + * gfs2_glock_thaw - Thaw any frozen glocks + * @sdp: The super block + * + */ + +void gfs2_glock_thaw(struct gfs2_sbd *sdp) +{ + glock_hash_walk(thaw_glock, sdp); +} + +static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) +{ + spin_lock(&gl->gl_lockref.lock); + gfs2_dump_glock(seq, gl, fsid); + spin_unlock(&gl->gl_lockref.lock); +} + +static void dump_glock_func(struct gfs2_glock *gl) +{ + dump_glock(NULL, gl, true); +} + +static void withdraw_dq(struct gfs2_glock *gl) +{ + spin_lock(&gl->gl_lockref.lock); + if (!__lockref_is_dead(&gl->gl_lockref) && + glock_blocked_by_withdraw(gl)) + do_error(gl, LM_OUT_ERROR); /* remove pending waiters */ + spin_unlock(&gl->gl_lockref.lock); +} + +void gfs2_gl_dq_holders(struct gfs2_sbd *sdp) +{ + glock_hash_walk(withdraw_dq, sdp); +} + +/** + * gfs2_gl_hash_clear - Empty out the glock hash table + * @sdp: the filesystem + * + * Called when unmounting the filesystem. 
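+ * All glocks are demoted to LM_ST_UNLOCKED first; we then wait up to
+ * ten minutes (HZ * 600 below) for the lock module to release them,
+ * and dump any glock that still remains after that.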
+ */ + +void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) +{ + set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); + flush_workqueue(glock_workqueue); + glock_hash_walk(clear_glock, sdp); + flush_workqueue(glock_workqueue); + wait_event_timeout(sdp->sd_kill_wait, + atomic_read(&sdp->sd_glock_disposal) == 0, + HZ * 600); + glock_hash_walk(dump_glock_func, sdp); +} + +static const char *state2str(unsigned state) +{ + switch(state) { + case LM_ST_UNLOCKED: + return "UN"; + case LM_ST_SHARED: + return "SH"; + case LM_ST_DEFERRED: + return "DF"; + case LM_ST_EXCLUSIVE: + return "EX"; + } + return "??"; +} + +static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) +{ + char *p = buf; + if (flags & LM_FLAG_TRY) + *p++ = 't'; + if (flags & LM_FLAG_TRY_1CB) + *p++ = 'T'; + if (flags & LM_FLAG_NOEXP) + *p++ = 'e'; + if (flags & LM_FLAG_ANY) + *p++ = 'A'; + if (flags & LM_FLAG_NODE_SCOPE) + *p++ = 'n'; + if (flags & GL_ASYNC) + *p++ = 'a'; + if (flags & GL_EXACT) + *p++ = 'E'; + if (flags & GL_NOCACHE) + *p++ = 'c'; + if (test_bit(HIF_HOLDER, &iflags)) + *p++ = 'H'; + if (test_bit(HIF_WAIT, &iflags)) + *p++ = 'W'; + if (flags & GL_SKIP) + *p++ = 's'; + *p = 0; + return buf; +} + +/** + * dump_holder - print information about a glock holder + * @seq: the seq_file struct + * @gh: the glock holder + * @fs_id_buf: pointer to file system id (if requested) + * + */ + +static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh, + const char *fs_id_buf) +{ + const char *comm = "(none)"; + pid_t owner_pid = 0; + char flags_buf[32]; + + rcu_read_lock(); + if (pid_is_meaningful(gh)) { + struct task_struct *gh_owner; + + comm = "(ended)"; + owner_pid = pid_nr(gh->gh_owner_pid); + gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); + if (gh_owner) + comm = gh_owner->comm; + } + gfs2_print_dbg(seq, "%s H: s:%s f:%s e:%d p:%ld [%s] %pS\n", + fs_id_buf, state2str(gh->gh_state), + hflags2str(flags_buf, gh->gh_flags, gh->gh_iflags), + gh->gh_error, (long)owner_pid, comm, (void *)gh->gh_ip); + rcu_read_unlock(); +} + +static const char *gflags2str(char *buf, const struct gfs2_glock *gl) +{ + const unsigned long *gflags = &gl->gl_flags; + char *p = buf; + + if (test_bit(GLF_LOCK, gflags)) + *p++ = 'l'; + if (test_bit(GLF_DEMOTE, gflags)) + *p++ = 'D'; + if (test_bit(GLF_PENDING_DEMOTE, gflags)) + *p++ = 'd'; + if (test_bit(GLF_DEMOTE_IN_PROGRESS, gflags)) + *p++ = 'p'; + if (test_bit(GLF_DIRTY, gflags)) + *p++ = 'y'; + if (test_bit(GLF_LFLUSH, gflags)) + *p++ = 'f'; + if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) + *p++ = 'i'; + if (test_bit(GLF_REPLY_PENDING, gflags)) + *p++ = 'r'; + if (test_bit(GLF_INITIAL, gflags)) + *p++ = 'I'; + if (test_bit(GLF_FROZEN, gflags)) + *p++ = 'F'; + if (!list_empty(&gl->gl_holders)) + *p++ = 'q'; + if (test_bit(GLF_LRU, gflags)) + *p++ = 'L'; + if (gl->gl_object) + *p++ = 'o'; + if (test_bit(GLF_BLOCKING, gflags)) + *p++ = 'b'; + if (test_bit(GLF_FREEING, gflags)) + *p++ = 'x'; + if (test_bit(GLF_INSTANTIATE_NEEDED, gflags)) + *p++ = 'n'; + if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags)) + *p++ = 'N'; + if (test_bit(GLF_TRY_TO_EVICT, gflags)) + *p++ = 'e'; + if (test_bit(GLF_VERIFY_EVICT, gflags)) + *p++ = 'E'; + *p = 0; + return buf; +} + +/** + * gfs2_dump_glock - print information about a glock + * @seq: The seq_file struct + * @gl: the glock + * @fsid: If true, also dump the file system id + * + * The file format is as follows: + * One line per object, capital letters are used to indicate objects + * G = glock, I = Inode, R = rgrp, H = holder. 
Glocks are not indented, + * other objects are indented by a single space and follow the glock to + * which they are related. Fields are indicated by lower case letters + * followed by a colon and the field value, except for strings which are in + * [] so that its possible to see if they are composed of spaces for + * example. The field's are n = number (id of the object), f = flags, + * t = type, s = state, r = refcount, e = error, p = pid. + * + */ + +void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, bool fsid) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + unsigned long long dtime; + const struct gfs2_holder *gh; + char gflags_buf[32]; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + char fs_id_buf[sizeof(sdp->sd_fsname) + 7]; + unsigned long nrpages = 0; + + if (gl->gl_ops->go_flags & GLOF_ASPACE) { + struct address_space *mapping = gfs2_glock2aspace(gl); + + nrpages = mapping->nrpages; + } + memset(fs_id_buf, 0, sizeof(fs_id_buf)); + if (fsid && sdp) /* safety precaution */ + sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); + dtime = jiffies - gl->gl_demote_time; + dtime *= 1000000/HZ; /* demote time in uSec */ + if (!test_bit(GLF_DEMOTE, &gl->gl_flags)) + dtime = 0; + gfs2_print_dbg(seq, "%sG: s:%s n:%u/%llx f:%s t:%s d:%s/%llu a:%d " + "v:%d r:%d m:%ld p:%lu\n", + fs_id_buf, state2str(gl->gl_state), + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + gflags2str(gflags_buf, gl), + state2str(gl->gl_target), + state2str(gl->gl_demote_state), dtime, + atomic_read(&gl->gl_ail_count), + atomic_read(&gl->gl_revokes), + (int)gl->gl_lockref.count, gl->gl_hold_time, nrpages); + + list_for_each_entry(gh, &gl->gl_holders, gh_list) + dump_holder(seq, gh, fs_id_buf); + + if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump) + glops->go_dump(seq, gl, fs_id_buf); +} + +static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glock *gl = iter_ptr; + + seq_printf(seq, "G: n:%u/%llx rtt:%llu/%llu rttb:%llu/%llu irt:%llu/%llu dcnt: %llu qcnt: %llu\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTT], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTB], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRT], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT], + (unsigned long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]); + return 0; +} + +static const char *gfs2_gltype[] = { + "type", + "reserved", + "nondisk", + "inode", + "rgrp", + "meta", + "iopen", + "flock", + "plock", + "quota", + "journal", +}; + +static const char *gfs2_stype[] = { + [GFS2_LKS_SRTT] = "srtt", + [GFS2_LKS_SRTTVAR] = "srttvar", + [GFS2_LKS_SRTTB] = "srttb", + [GFS2_LKS_SRTTVARB] = "srttvarb", + [GFS2_LKS_SIRT] = "sirt", + [GFS2_LKS_SIRTVAR] = "sirtvar", + [GFS2_LKS_DCOUNT] = "dlm", + [GFS2_LKS_QCOUNT] = "queue", +}; + +#define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype)) + +static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_sbd *sdp = seq->private; + loff_t pos = *(loff_t *)iter_ptr; + unsigned index = pos >> 3; + unsigned subindex = pos & 0x07; + int i; + + if (index == 0 && subindex != 0) + return 0; + + seq_printf(seq, "%-10s %8s:", gfs2_gltype[index], + (index == 0) ? 
"cpu": gfs2_stype[subindex]); + + for_each_possible_cpu(i) { + const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i); + + if (index == 0) + seq_printf(seq, " %15u", i); + else + seq_printf(seq, " %15llu", (unsigned long long)lkstats-> + lkstats[index - 1].stats[subindex]); + } + seq_putc(seq, '\n'); + return 0; +} + +int __init gfs2_glock_init(void) +{ + int i, ret; + + ret = rhashtable_init(&gl_hash_table, &ht_parms); + if (ret < 0) + return ret; + + glock_workqueue = alloc_workqueue("glock_workqueue", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_FREEZABLE, 0); + if (!glock_workqueue) { + rhashtable_destroy(&gl_hash_table); + return -ENOMEM; + } + + ret = register_shrinker(&glock_shrinker, "gfs2-glock"); + if (ret) { + destroy_workqueue(glock_workqueue); + rhashtable_destroy(&gl_hash_table); + return ret; + } + + for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) + init_waitqueue_head(glock_wait_table + i); + + return 0; +} + +void gfs2_glock_exit(void) +{ + unregister_shrinker(&glock_shrinker); + rhashtable_destroy(&gl_hash_table); + destroy_workqueue(glock_workqueue); +} + +static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi, loff_t n) +{ + struct gfs2_glock *gl = gi->gl; + + if (gl) { + if (n == 0) + return; + if (!lockref_put_not_zero(&gl->gl_lockref)) + gfs2_glock_queue_put(gl); + } + for (;;) { + gl = rhashtable_walk_next(&gi->hti); + if (IS_ERR_OR_NULL(gl)) { + if (gl == ERR_PTR(-EAGAIN)) { + n = 1; + continue; + } + gl = NULL; + break; + } + if (gl->gl_name.ln_sbd != gi->sdp) + continue; + if (n <= 1) { + if (!lockref_get_not_dead(&gl->gl_lockref)) + continue; + break; + } else { + if (__lockref_is_dead(&gl->gl_lockref)) + continue; + n--; + } + } + gi->gl = gl; +} + +static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) +{ + struct gfs2_glock_iter *gi = seq->private; + loff_t n; + + /* + * We can either stay where we are, skip to the next hash table + * entry, or start from the beginning. 
+ */ + if (*pos < gi->last_pos) { + rhashtable_walk_exit(&gi->hti); + rhashtable_walk_enter(&gl_hash_table, &gi->hti); + n = *pos + 1; + } else { + n = *pos - gi->last_pos; + } + + rhashtable_walk_start(&gi->hti); + + gfs2_glock_iter_next(gi, n); + gi->last_pos = *pos; + return gi->gl; +} + +static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + + (*pos)++; + gi->last_pos = *pos; + gfs2_glock_iter_next(gi, 1); + return gi->gl; +} + +static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) + __releases(RCU) +{ + struct gfs2_glock_iter *gi = seq->private; + + rhashtable_walk_stop(&gi->hti); +} + +static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) +{ + dump_glock(seq, iter_ptr, false); + return 0; +} + +static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) +{ + preempt_disable(); + if (*pos >= GFS2_NR_SBSTATS) + return NULL; + return pos; +} + +static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + (*pos)++; + if (*pos >= GFS2_NR_SBSTATS) + return NULL; + return pos; +} + +static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + preempt_enable(); +} + +static const struct seq_operations gfs2_glock_seq_ops = { + .start = gfs2_glock_seq_start, + .next = gfs2_glock_seq_next, + .stop = gfs2_glock_seq_stop, + .show = gfs2_glock_seq_show, +}; + +static const struct seq_operations gfs2_glstats_seq_ops = { + .start = gfs2_glock_seq_start, + .next = gfs2_glock_seq_next, + .stop = gfs2_glock_seq_stop, + .show = gfs2_glstats_seq_show, +}; + +static const struct seq_operations gfs2_sbstats_sops = { + .start = gfs2_sbstats_seq_start, + .next = gfs2_sbstats_seq_next, + .stop = gfs2_sbstats_seq_stop, + .show = gfs2_sbstats_seq_show, +}; + +#define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL) + +static int __gfs2_glocks_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + + gi->sdp = inode->i_private; + seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); + if (seq->buf) + seq->size = GFS2_SEQ_GOODSIZE; + /* + * Initially, we are "before" the first hash table entry; the + * first call to rhashtable_walk_next gets us the first entry. 
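+ * That is why last_pos is initialized to -1 below: the first read has
+ * *pos == 0, so gfs2_glock_seq_start() advances the walk by exactly
+ * one entry, from "before the beginning" to the first glock.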
+ */ + gi->last_pos = -1; + gi->gl = NULL; + rhashtable_walk_enter(&gl_hash_table, &gi->hti); + } + return ret; +} + +static int gfs2_glocks_open(struct inode *inode, struct file *file) +{ + return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops); +} + +static int gfs2_glocks_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + + if (gi->gl) + gfs2_glock_put(gi->gl); + rhashtable_walk_exit(&gi->hti); + return seq_release_private(inode, file); +} + +static int gfs2_glstats_open(struct inode *inode, struct file *file) +{ + return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops); +} + +static const struct file_operations gfs2_glocks_fops = { + .owner = THIS_MODULE, + .open = gfs2_glocks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = gfs2_glocks_release, +}; + +static const struct file_operations gfs2_glstats_fops = { + .owner = THIS_MODULE, + .open = gfs2_glstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = gfs2_glocks_release, +}; + +struct gfs2_glockfd_iter { + struct super_block *sb; + unsigned int tgid; + struct task_struct *task; + unsigned int fd; + struct file *file; +}; + +static struct task_struct *gfs2_glockfd_next_task(struct gfs2_glockfd_iter *i) +{ + struct pid_namespace *ns = task_active_pid_ns(current); + struct pid *pid; + + if (i->task) + put_task_struct(i->task); + + rcu_read_lock(); +retry: + i->task = NULL; + pid = find_ge_pid(i->tgid, ns); + if (pid) { + i->tgid = pid_nr_ns(pid, ns); + i->task = pid_task(pid, PIDTYPE_TGID); + if (!i->task) { + i->tgid++; + goto retry; + } + get_task_struct(i->task); + } + rcu_read_unlock(); + return i->task; +} + +static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) +{ + if (i->file) { + fput(i->file); + i->file = NULL; + } + + rcu_read_lock(); + for(;; i->fd++) { + struct inode *inode; + + i->file = task_lookup_next_fd_rcu(i->task, &i->fd); + if (!i->file) { + i->fd = 0; + break; + } + inode = file_inode(i->file); + if (inode->i_sb != i->sb) + continue; + if (get_file_rcu(i->file)) + break; + } + rcu_read_unlock(); + return i->file; +} + +static void *gfs2_glockfd_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct gfs2_glockfd_iter *i = seq->private; + + if (*pos) + return NULL; + while (gfs2_glockfd_next_task(i)) { + if (gfs2_glockfd_next_file(i)) + return i; + i->tgid++; + } + return NULL; +} + +static void *gfs2_glockfd_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glockfd_iter *i = seq->private; + + (*pos)++; + i->fd++; + do { + if (gfs2_glockfd_next_file(i)) + return i; + i->tgid++; + } while (gfs2_glockfd_next_task(i)); + return NULL; +} + +static void gfs2_glockfd_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glockfd_iter *i = seq->private; + + if (i->file) + fput(i->file); + if (i->task) + put_task_struct(i->task); +} + +static void gfs2_glockfd_seq_show_flock(struct seq_file *seq, + struct gfs2_glockfd_iter *i) +{ + struct gfs2_file *fp = i->file->private_data; + struct gfs2_holder *fl_gh = &fp->f_fl_gh; + struct lm_lockname gl_name = { .ln_type = LM_TYPE_RESERVED }; + + if (!READ_ONCE(fl_gh->gh_gl)) + return; + + spin_lock(&i->file->f_lock); + if (gfs2_holder_initialized(fl_gh)) + gl_name = fl_gh->gh_gl->gl_name; + spin_unlock(&i->file->f_lock); + + if (gl_name.ln_type != LM_TYPE_RESERVED) { + seq_printf(seq, "%d %u %u/%llx\n", + i->tgid, i->fd, gl_name.ln_type, + (unsigned long long)gl_name.ln_number); + } +} + +static int 
gfs2_glockfd_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glockfd_iter *i = seq->private; + struct inode *inode = file_inode(i->file); + struct gfs2_glock *gl; + + inode_lock_shared(inode); + gl = GFS2_I(inode)->i_iopen_gh.gh_gl; + if (gl) { + seq_printf(seq, "%d %u %u/%llx\n", + i->tgid, i->fd, gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number); + } + gfs2_glockfd_seq_show_flock(seq, i); + inode_unlock_shared(inode); + return 0; +} + +static const struct seq_operations gfs2_glockfd_seq_ops = { + .start = gfs2_glockfd_seq_start, + .next = gfs2_glockfd_seq_next, + .stop = gfs2_glockfd_seq_stop, + .show = gfs2_glockfd_seq_show, +}; + +static int gfs2_glockfd_open(struct inode *inode, struct file *file) +{ + struct gfs2_glockfd_iter *i; + struct gfs2_sbd *sdp = inode->i_private; + + i = __seq_open_private(file, &gfs2_glockfd_seq_ops, + sizeof(struct gfs2_glockfd_iter)); + if (!i) + return -ENOMEM; + i->sb = sdp->sd_vfs; + return 0; +} + +static const struct file_operations gfs2_glockfd_fops = { + .owner = THIS_MODULE, + .open = gfs2_glockfd_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats); + +void gfs2_create_debugfs_file(struct gfs2_sbd *sdp) +{ + sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root); + + debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glocks_fops); + + debugfs_create_file("glockfd", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glockfd_fops); + + debugfs_create_file("glstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_glstats_fops); + + debugfs_create_file("sbstats", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, + &gfs2_sbstats_fops); +} + +void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) +{ + debugfs_remove_recursive(sdp->debugfs_dir); + sdp->debugfs_dir = NULL; +} + +void gfs2_register_debugfs(void) +{ + gfs2_root = debugfs_create_dir("gfs2", NULL); +} + +void gfs2_unregister_debugfs(void) +{ + debugfs_remove(gfs2_root); + gfs2_root = NULL; +} diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h new file mode 100644 index 0000000000..c8685ca7d2 --- /dev/null +++ b/fs/gfs2/glock.h @@ -0,0 +1,301 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +#ifndef __GLOCK_DOT_H__ +#define __GLOCK_DOT_H__ + +#include <linux/sched.h> +#include <linux/parser.h> +#include "incore.h" +#include "util.h" + +/* Options for hostdata parser */ + +enum { + Opt_jid, + Opt_id, + Opt_first, + Opt_nodir, + Opt_err, +}; + +/* + * lm_lockname types + */ + +#define LM_TYPE_RESERVED 0x00 +#define LM_TYPE_NONDISK 0x01 +#define LM_TYPE_INODE 0x02 +#define LM_TYPE_RGRP 0x03 +#define LM_TYPE_META 0x04 +#define LM_TYPE_IOPEN 0x05 +#define LM_TYPE_FLOCK 0x06 +#define LM_TYPE_PLOCK 0x07 +#define LM_TYPE_QUOTA 0x08 +#define LM_TYPE_JOURNAL 0x09 + +/* + * lm_lock() states + * + * SHARED is compatible with SHARED, not with DEFERRED or EX. + * DEFERRED is compatible with DEFERRED, not with SHARED or EX. + */ + +#define LM_ST_UNLOCKED 0 +#define LM_ST_EXCLUSIVE 1 +#define LM_ST_DEFERRED 2 +#define LM_ST_SHARED 3 + +/* + * lm_lock() flags + * + * LM_FLAG_TRY + * Don't wait to acquire the lock if it can't be granted immediately. + * + * LM_FLAG_TRY_1CB + * Send one blocking callback if TRY is set and the lock is not granted. 
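+ *
+ * (For an example of the intended use, see gfs2_glock_poke() in
+ * glock.c: it nudges a remote holder with
+ * LM_FLAG_TRY_1CB | LM_FLAG_ANY | GL_SKIP, without waiting for the
+ * lock to be granted.)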
+ * + * LM_FLAG_NOEXP + * GFS sets this flag on lock requests it makes while doing journal recovery. + * These special requests should not be blocked due to the recovery like + * ordinary locks would be. + * + * LM_FLAG_ANY + * A SHARED request may also be granted in DEFERRED, or a DEFERRED request may + * also be granted in SHARED. The preferred state is whichever is compatible + * with other granted locks, or the specified state if no other locks exist. + * + * LM_FLAG_NODE_SCOPE + * This holder agrees to share the lock within this node. In other words, + * the glock is held in EX mode according to DLM, but local holders on the + * same node can share it. + */ + +#define LM_FLAG_TRY 0x0001 +#define LM_FLAG_TRY_1CB 0x0002 +#define LM_FLAG_NOEXP 0x0004 +#define LM_FLAG_ANY 0x0008 +#define LM_FLAG_NODE_SCOPE 0x0020 +#define GL_ASYNC 0x0040 +#define GL_EXACT 0x0080 +#define GL_SKIP 0x0100 +#define GL_NOPID 0x0200 +#define GL_NOCACHE 0x0400 + +/* + * lm_async_cb return flags + * + * LM_OUT_ST_MASK + * Masks the lower two bits of lock state in the returned value. + * + * LM_OUT_CANCELED + * The lock request was canceled. + * + */ + +#define LM_OUT_ST_MASK 0x00000003 +#define LM_OUT_CANCELED 0x00000008 +#define LM_OUT_ERROR 0x00000004 + +/* + * lm_recovery_done() messages + */ + +#define LM_RD_GAVEUP 308 +#define LM_RD_SUCCESS 309 + +#define GLR_TRYFAILED 13 + +#define GL_GLOCK_MAX_HOLD (long)(HZ / 5) +#define GL_GLOCK_DFT_HOLD (long)(HZ / 5) +#define GL_GLOCK_MIN_HOLD (long)(10) +#define GL_GLOCK_HOLD_INCR (long)(HZ / 20) +#define GL_GLOCK_HOLD_DECR (long)(HZ / 40) + +struct lm_lockops { + const char *lm_proto_name; + int (*lm_mount) (struct gfs2_sbd *sdp, const char *table); + void (*lm_first_done) (struct gfs2_sbd *sdp); + void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result); + void (*lm_unmount) (struct gfs2_sbd *sdp); + void (*lm_withdraw) (struct gfs2_sbd *sdp); + void (*lm_put_lock) (struct gfs2_glock *gl); + int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags); + void (*lm_cancel) (struct gfs2_glock *gl); + const match_table_t *lm_tokens; +}; + +struct gfs2_glock_aspace { + struct gfs2_glock glock; + struct address_space mapping; +}; + +static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + struct pid *pid; + + /* Look in glock's list of holders for one with current task as owner */ + spin_lock(&gl->gl_lockref.lock); + pid = task_pid(current); + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + break; + if (gh->gh_owner_pid == pid) + goto out; + } + gh = NULL; +out: + spin_unlock(&gl->gl_lockref.lock); + + return gh; +} + +static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_EXCLUSIVE; +} + +static inline int gfs2_glock_is_held_dfrd(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_DEFERRED; +} + +static inline int gfs2_glock_is_held_shrd(struct gfs2_glock *gl) +{ + return gl->gl_state == LM_ST_SHARED; +} + +static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) +{ + if (gl->gl_ops->go_flags & GLOF_ASPACE) { + struct gfs2_glock_aspace *gla = + container_of(gl, struct gfs2_glock_aspace, glock); + return &gla->mapping; + } + return NULL; +} + +extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + int create, struct gfs2_glock **glp); +extern struct gfs2_glock *gfs2_glock_hold(struct 
gfs2_glock *gl); +extern void gfs2_glock_put(struct gfs2_glock *gl); +extern void gfs2_glock_queue_put(struct gfs2_glock *gl); + +extern void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, + u16 flags, struct gfs2_holder *gh, + unsigned long ip); +static inline void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, + u16 flags, struct gfs2_holder *gh) { + __gfs2_holder_init(gl, state, flags, gh, _RET_IP_); +} + +extern void gfs2_holder_reinit(unsigned int state, u16 flags, + struct gfs2_holder *gh); +extern void gfs2_holder_uninit(struct gfs2_holder *gh); +extern int gfs2_glock_nq(struct gfs2_holder *gh); +extern int gfs2_glock_poll(struct gfs2_holder *gh); +extern int gfs2_instantiate(struct gfs2_holder *gh); +extern int gfs2_glock_holder_ready(struct gfs2_holder *gh); +extern int gfs2_glock_wait(struct gfs2_holder *gh); +extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_glock_dq(struct gfs2_holder *gh); +extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); +extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); +extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, + const struct gfs2_glock_operations *glops, + unsigned int state, u16 flags, + struct gfs2_holder *gh); +extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); +extern void gfs2_dump_glock(struct seq_file *seq, struct gfs2_glock *gl, + bool fsid); +#define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { \ + gfs2_dump_glock(NULL, gl, true); \ + BUG(); } } while(0) +#define gfs2_glock_assert_warn(gl, x) do { if (unlikely(!(x))) { \ + gfs2_dump_glock(NULL, gl, true); \ + gfs2_assert_warn((gl)->gl_name.ln_sbd, (x)); } } \ + while (0) +#define gfs2_glock_assert_withdraw(gl, x) do { if (unlikely(!(x))) { \ + gfs2_dump_glock(NULL, gl, true); \ + gfs2_assert_withdraw((gl)->gl_name.ln_sbd, (x)); } } \ + while (0) + +extern __printf(2, 3) +void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); + +/** + * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock + * @gl: the glock + * @state: the state we're requesting + * @flags: the modifier flags + * @gh: the holder structure + * + * Returns: 0, GLR_*, or errno + */ + +static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, + unsigned int state, u16 flags, + struct gfs2_holder *gh) +{ + int error; + + __gfs2_holder_init(gl, state, flags, gh, _RET_IP_); + + error = gfs2_glock_nq(gh); + if (error) + gfs2_holder_uninit(gh); + + return error; +} + +extern void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state); +extern void gfs2_glock_complete(struct gfs2_glock *gl, int ret); +extern bool gfs2_queue_try_to_evict(struct gfs2_glock *gl); +extern void gfs2_cancel_delete_work(struct gfs2_glock *gl); +extern void gfs2_flush_delete_work(struct gfs2_sbd *sdp); +extern void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +extern void gfs2_gl_dq_holders(struct gfs2_sbd *sdp); +extern void gfs2_glock_thaw(struct gfs2_sbd *sdp); +extern void gfs2_glock_add_to_lru(struct gfs2_glock *gl); +extern void gfs2_glock_free(struct gfs2_glock *gl); + +extern int __init gfs2_glock_init(void); +extern void gfs2_glock_exit(void); + +extern void gfs2_create_debugfs_file(struct gfs2_sbd *sdp); +extern void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp); +extern void gfs2_register_debugfs(void); +extern void gfs2_unregister_debugfs(void); + +extern void glock_set_object(struct gfs2_glock *gl, void *object); +extern 
void glock_clear_object(struct gfs2_glock *gl, void *object); + +extern const struct lm_lockops gfs2_dlm_ops; + +static inline void gfs2_holder_mark_uninitialized(struct gfs2_holder *gh) +{ + gh->gh_gl = NULL; +} + +static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) +{ + return gh->gh_gl; +} + +static inline bool gfs2_holder_queued(struct gfs2_holder *gh) +{ + return !list_empty(&gh->gh_list); +} + +extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); +extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); + +#endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c new file mode 100644 index 0000000000..f41ca89d21 --- /dev/null +++ b/fs/gfs2/glops.c @@ -0,0 +1,792 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + */ + +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/bio.h> +#include <linux/posix_acl.h> +#include <linux/security.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "util.h" +#include "trans.h" +#include "dir.h" +#include "lops.h" + +struct workqueue_struct *gfs2_freeze_wq; + +extern struct workqueue_struct *gfs2_control_wq; + +static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + fs_err(sdp, + "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page " + "state 0x%lx\n", + bh, (unsigned long long)bh->b_blocknr, bh->b_state, + bh->b_folio->mapping, bh->b_folio->flags); + fs_err(sdp, "AIL glock %u:%llu mapping %p\n", + gl->gl_name.ln_type, gl->gl_name.ln_number, + gfs2_glock2aspace(gl)); + gfs2_lm(sdp, "AIL error\n"); + gfs2_withdraw_delayed(sdp); +} + +/** + * __gfs2_ail_flush - remove all buffers for a given lock from the AIL + * @gl: the glock + * @fsync: set when called from fsync (not all buffers will be clean) + * @nr_revokes: Number of buffers to revoke + * + * None of the buffers should be dirty, locked, or pinned. + */ + +static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync, + unsigned int nr_revokes) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct list_head *head = &gl->gl_ail_list; + struct gfs2_bufdata *bd, *tmp; + struct buffer_head *bh; + const unsigned long b_state = (1UL << BH_Dirty)|(1UL << BH_Pinned)|(1UL << BH_Lock); + + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe_reverse(bd, tmp, head, bd_ail_gl_list) { + if (nr_revokes == 0) + break; + bh = bd->bd_bh; + if (bh->b_state & b_state) { + if (fsync) + continue; + gfs2_ail_error(gl, bh); + } + gfs2_trans_add_revoke(sdp, bd); + nr_revokes--; + } + GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); +} + + +static int gfs2_ail_empty_gl(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_trans tr; + unsigned int revokes; + int ret = 0; + + revokes = atomic_read(&gl->gl_ail_count); + + if (!revokes) { + bool have_revokes; + bool log_in_flight; + + /* + * We have nothing on the ail, but there could be revokes on + * the sdp revoke queue, in which case, we still want to flush + * the log and wait for it to finish. 
+ * + * If the sdp revoke list is empty too, we might still have an + * io outstanding for writing revokes, so we should wait for + * it before returning. + * + * If none of these conditions are true, our revokes are all + * flushed and we can return. + */ + gfs2_log_lock(sdp); + have_revokes = !list_empty(&sdp->sd_log_revokes); + log_in_flight = atomic_read(&sdp->sd_log_in_flight); + gfs2_log_unlock(sdp); + if (have_revokes) + goto flush; + if (log_in_flight) + log_flush_wait(sdp); + return 0; + } + + memset(&tr, 0, sizeof(tr)); + set_bit(TR_ONSTACK, &tr.tr_flags); + ret = __gfs2_trans_begin(&tr, sdp, 0, revokes, _RET_IP_); + if (ret) { + fs_err(sdp, "Transaction error %d: Unable to write revokes.", ret); + goto flush; + } + __gfs2_ail_flush(gl, 0, revokes); + gfs2_trans_end(sdp); + +flush: + if (!ret) + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_AIL_EMPTY_GL); + return ret; +} + +void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + unsigned int revokes = atomic_read(&gl->gl_ail_count); + int ret; + + if (!revokes) + return; + + ret = gfs2_trans_begin(sdp, 0, revokes); + if (ret) + return; + __gfs2_ail_flush(gl, fsync, revokes); + gfs2_trans_end(sdp); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_AIL_FLUSH); +} + +/** + * gfs2_rgrp_metasync - sync out the metadata of a resource group + * @gl: the glock protecting the resource group + * + */ + +static int gfs2_rgrp_metasync(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct address_space *metamapping = &sdp->sd_aspace; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; + loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; + int error; + + filemap_fdatawrite_range(metamapping, start, end); + error = filemap_fdatawait_range(metamapping, start, end); + WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); + mapping_set_error(metamapping, error); + if (error) + gfs2_io_error(sdp); + return error; +} + +/** + * rgrp_go_sync - sync out the metadata for this glock + * @gl: the glock + * + * Called when demoting or unlocking an EX glock. We must flush + * to disk all dirty buffers/pages relating to this glock, and must not + * return to caller to demote/unlock the glock until I/O is complete. + */ + +static int rgrp_go_sync(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + int error; + + if (!rgd || !test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + return 0; + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); + + gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_RGRP_GO_SYNC); + error = gfs2_rgrp_metasync(gl); + if (!error) + error = gfs2_ail_empty_gl(gl); + gfs2_free_clones(rgd); + return error; +} + +/** + * rgrp_go_inval - invalidate the metadata for this glock + * @gl: the glock + * @flags: + * + * We never used LM_ST_DEFERRED with resource groups, so that we + * should always see the metadata flag set here. 
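+ *
+ * (That is what the WARN_ON_ONCE() below checks: @flags should always
+ * include DIO_METADATA, since only a transition to LM_ST_DEFERRED
+ * would leave it clear.)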
+ * + */ + +static void rgrp_go_inval(struct gfs2_glock *gl, int flags) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct address_space *mapping = &sdp->sd_aspace; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t start, end; + + if (!rgd) + return; + start = (rgd->rd_addr * bsize) & PAGE_MASK; + end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; + gfs2_rgrp_brelse(rgd); + WARN_ON_ONCE(!(flags & DIO_METADATA)); + truncate_inode_pages_range(mapping, start, end); +} + +static void gfs2_rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl, + const char *fs_id_buf) +{ + struct gfs2_rgrpd *rgd = gl->gl_object; + + if (rgd) + gfs2_rgrp_dump(seq, rgd, fs_id_buf); +} + +static struct gfs2_inode *gfs2_glock2inode(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip) + set_bit(GIF_GLOP_PENDING, &ip->i_flags); + spin_unlock(&gl->gl_lockref.lock); + return ip; +} + +struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl) +{ + struct gfs2_rgrpd *rgd; + + spin_lock(&gl->gl_lockref.lock); + rgd = gl->gl_object; + spin_unlock(&gl->gl_lockref.lock); + + return rgd; +} + +static void gfs2_clear_glop_pending(struct gfs2_inode *ip) +{ + if (!ip) + return; + + clear_bit_unlock(GIF_GLOP_PENDING, &ip->i_flags); + wake_up_bit(&ip->i_flags, GIF_GLOP_PENDING); +} + +/** + * gfs2_inode_metasync - sync out the metadata of an inode + * @gl: the glock protecting the inode + * + */ +int gfs2_inode_metasync(struct gfs2_glock *gl) +{ + struct address_space *metamapping = gfs2_glock2aspace(gl); + int error; + + filemap_fdatawrite(metamapping); + error = filemap_fdatawait(metamapping); + if (error) + gfs2_io_error(gl->gl_name.ln_sbd); + return error; +} + +/** + * inode_go_sync - Sync the dirty metadata of an inode + * @gl: the glock protecting the inode + * + */ + +static int inode_go_sync(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = gfs2_glock2inode(gl); + int isreg = ip && S_ISREG(ip->i_inode.i_mode); + struct address_space *metamapping = gfs2_glock2aspace(gl); + int error = 0, ret; + + if (isreg) { + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + inode_dio_wait(&ip->i_inode); + } + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) + goto out; + + GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); + + gfs2_log_flush(gl->gl_name.ln_sbd, gl, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_INODE_GO_SYNC); + filemap_fdatawrite(metamapping); + if (isreg) { + struct address_space *mapping = ip->i_inode.i_mapping; + filemap_fdatawrite(mapping); + error = filemap_fdatawait(mapping); + mapping_set_error(mapping, error); + } + ret = gfs2_inode_metasync(gl); + if (!error) + error = ret; + ret = gfs2_ail_empty_gl(gl); + if (!error) + error = ret; + /* + * Writeback of the data mapping may cause the dirty flag to be set + * so we have to clear it again here. + */ + smp_mb__before_atomic(); + clear_bit(GLF_DIRTY, &gl->gl_flags); + +out: + gfs2_clear_glop_pending(ip); + return error; +} + +/** + * inode_go_inval - prepare a inode glock to be released + * @gl: the glock + * @flags: + * + * Normally we invalidate everything, but if we are moving into + * LM_ST_DEFERRED from LM_ST_SHARED or LM_ST_EXCLUSIVE then we + * can keep hold of the metadata, since it won't have changed. 
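+ * The caller communicates this through @flags: the metadata mapping
+ * is only truncated when DIO_METADATA is set.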
+ * + */ + +static void inode_go_inval(struct gfs2_glock *gl, int flags) +{ + struct gfs2_inode *ip = gfs2_glock2inode(gl); + + if (flags & DIO_METADATA) { + struct address_space *mapping = gfs2_glock2aspace(gl); + truncate_inode_pages(mapping, 0); + if (ip) { + set_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); + forget_all_cached_acls(&ip->i_inode); + security_inode_invalidate_secctx(&ip->i_inode); + gfs2_dir_hash_inval(ip); + } + } + + if (ip == GFS2_I(gl->gl_name.ln_sbd->sd_rindex)) { + gfs2_log_flush(gl->gl_name.ln_sbd, NULL, + GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_INODE_GO_INVAL); + gl->gl_name.ln_sbd->sd_rindex_uptodate = 0; + } + if (ip && S_ISREG(ip->i_inode.i_mode)) + truncate_inode_pages(ip->i_inode.i_mapping, 0); + + gfs2_clear_glop_pending(ip); +} + +/** + * inode_go_demote_ok - Check to see if it's ok to unlock an inode glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int inode_go_demote_ok(const struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) + return 0; + + return 1; +} + +static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + const struct gfs2_dinode *str = buf; + struct timespec64 atime; + u16 height, depth; + umode_t mode = be32_to_cpu(str->di_mode); + struct inode *inode = &ip->i_inode; + bool is_new = inode->i_state & I_NEW; + + if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) + goto corrupt; + if (unlikely(!is_new && inode_wrong_type(inode, mode))) + goto corrupt; + ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); + inode->i_mode = mode; + if (is_new) { + inode->i_rdev = 0; + switch (mode & S_IFMT) { + case S_IFBLK: + case S_IFCHR: + inode->i_rdev = MKDEV(be32_to_cpu(str->di_major), + be32_to_cpu(str->di_minor)); + break; + } + } + + i_uid_write(inode, be32_to_cpu(str->di_uid)); + i_gid_write(inode, be32_to_cpu(str->di_gid)); + set_nlink(inode, be32_to_cpu(str->di_nlink)); + i_size_write(inode, be64_to_cpu(str->di_size)); + gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks)); + atime.tv_sec = be64_to_cpu(str->di_atime); + atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); + if (timespec64_compare(&inode->i_atime, &atime) < 0) + inode->i_atime = atime; + inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime); + inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); + inode_set_ctime(inode, be64_to_cpu(str->di_ctime), + be32_to_cpu(str->di_ctime_nsec)); + + ip->i_goal = be64_to_cpu(str->di_goal_meta); + ip->i_generation = be64_to_cpu(str->di_generation); + + ip->i_diskflags = be32_to_cpu(str->di_flags); + ip->i_eattr = be64_to_cpu(str->di_eattr); + /* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */ + gfs2_set_inode_flags(inode); + height = be16_to_cpu(str->di_height); + if (unlikely(height > sdp->sd_max_height)) + goto corrupt; + ip->i_height = (u8)height; + + depth = be16_to_cpu(str->di_depth); + if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) + goto corrupt; + ip->i_depth = (u8)depth; + ip->i_entries = be32_to_cpu(str->di_entries); + + if (gfs2_is_stuffed(ip) && inode->i_size > gfs2_max_stuffed_size(ip)) + goto corrupt; + + if (S_ISREG(inode->i_mode)) + gfs2_set_aops(inode); + + return 0; +corrupt: + gfs2_consist_inode(ip); + return -EIO; +} + +/** + * gfs2_inode_refresh - Refresh the incore copy of the dinode + * @ip: The GFS2 inode + * + * Returns: errno + */ + +int gfs2_inode_refresh(struct gfs2_inode *ip) +{ + struct buffer_head *dibh; + 
int error; + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + + error = gfs2_dinode_in(ip, dibh->b_data); + brelse(dibh); + return error; +} + +/** + * inode_go_instantiate - read in an inode if necessary + * @gh: The glock holder + * + * Returns: errno + */ + +static int inode_go_instantiate(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = gl->gl_object; + + if (!ip) /* no inode to populate - read it in later */ + return 0; + + return gfs2_inode_refresh(ip); +} + +static int inode_go_held(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_inode *ip = gl->gl_object; + int error = 0; + + if (!ip) /* no inode to populate - read it in later */ + return 0; + + if (gh->gh_state != LM_ST_DEFERRED) + inode_dio_wait(&ip->i_inode); + + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && + (gl->gl_state == LM_ST_EXCLUSIVE) && + (gh->gh_state == LM_ST_EXCLUSIVE)) + error = gfs2_truncatei_resume(ip); + + return error; +} + +/** + * inode_go_dump - print information about an inode + * @seq: The iterator + * @gl: The glock + * @fs_id_buf: file system id (may be empty) + * + */ + +static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl, + const char *fs_id_buf) +{ + struct gfs2_inode *ip = gl->gl_object; + const struct inode *inode = &ip->i_inode; + + if (ip == NULL) + return; + + gfs2_print_dbg(seq, "%s I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu " + "p:%lu\n", fs_id_buf, + (unsigned long long)ip->i_no_formal_ino, + (unsigned long long)ip->i_no_addr, + IF2DT(inode->i_mode), ip->i_flags, + (unsigned int)ip->i_diskflags, + (unsigned long long)i_size_read(inode), + inode->i_data.nrpages); +} + +/** + * freeze_go_callback - A cluster node is requesting a freeze + * @gl: the glock + * @remote: true if this came from a different cluster node + */ + +static void freeze_go_callback(struct gfs2_glock *gl, bool remote) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct super_block *sb = sdp->sd_vfs; + + if (!remote || + (gl->gl_state != LM_ST_SHARED && + gl->gl_state != LM_ST_UNLOCKED) || + gl->gl_demote_state != LM_ST_UNLOCKED) + return; + + /* + * Try to get an active super block reference to prevent racing with + * unmount (see super_trylock_shared()). But note that unmount isn't + * the only place where a write lock on s_umount is taken, and we can + * fail here because of things like remount as well. 
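+	 *
+	 * If we do get the reference, whoever runs sd_freeze_work is
+	 * expected to drop it again; if the work cannot even be queued,
+	 * it is dropped immediately below.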
+ */ + if (down_read_trylock(&sb->s_umount)) { + atomic_inc(&sb->s_active); + up_read(&sb->s_umount); + if (!queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work)) + deactivate_super(sb); + } +} + +/** + * freeze_go_xmote_bh - After promoting/demoting the freeze glock + * @gl: the glock + */ +static int freeze_go_xmote_bh(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_log_header_host head; + int error; + + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); + + error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); + if (gfs2_assert_withdraw_delayed(sdp, !error)) + return error; + if (gfs2_assert_withdraw_delayed(sdp, head.lh_flags & + GFS2_LOG_HEAD_UNMOUNT)) + return -EIO; + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + } + return 0; +} + +/** + * freeze_go_demote_ok + * @gl: the glock + * + * Always returns 0 + */ + +static int freeze_go_demote_ok(const struct gfs2_glock *gl) +{ + return 0; +} + +/** + * iopen_go_callback - schedule the dcache entry for the inode to be deleted + * @gl: the glock + * @remote: true if this came from a different cluster node + * + * gl_lockref.lock lock is held while calling this + */ +static void iopen_go_callback(struct gfs2_glock *gl, bool remote) +{ + struct gfs2_inode *ip = gl->gl_object; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + if (!remote || sb_rdonly(sdp->sd_vfs) || + test_bit(SDF_KILL, &sdp->sd_flags)) + return; + + if (gl->gl_demote_state == LM_ST_UNLOCKED && + gl->gl_state == LM_ST_SHARED && ip) { + gl->gl_lockref.count++; + if (!gfs2_queue_try_to_evict(gl)) + gl->gl_lockref.count--; + } +} + +/** + * inode_go_free - wake up anyone waiting for dlm's unlock ast to free it + * @gl: glock being freed + * + * For now, this is only used for the journal inode glock. In withdraw + * situations, we need to wait for the glock to be freed so that we know + * other nodes may proceed with recovery / journal replay. + */ +static void inode_go_free(struct gfs2_glock *gl) +{ + /* Note that we cannot reference gl_object because it's already set + * to NULL by this point in its lifecycle. */ + if (!test_bit(GLF_FREEING, &gl->gl_flags)) + return; + clear_bit_unlock(GLF_FREEING, &gl->gl_flags); + wake_up_bit(&gl->gl_flags, GLF_FREEING); +} + +/** + * nondisk_go_callback - used to signal when a node did a withdraw + * @gl: the nondisk glock + * @remote: true if this came from a different cluster node + * + */ +static void nondisk_go_callback(struct gfs2_glock *gl, bool remote) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + /* Ignore the callback unless it's from another node, and it's the + live lock. */ + if (!remote || gl->gl_name.ln_number != GFS2_LIVE_LOCK) + return; + + /* First order of business is to cancel the demote request. We don't + * really want to demote a nondisk glock. At best it's just to inform + * us of another node's withdraw. We'll keep it in SH mode. */ + clear_bit(GLF_DEMOTE, &gl->gl_flags); + clear_bit(GLF_PENDING_DEMOTE, &gl->gl_flags); + + /* Ignore the unlock if we're withdrawn, unmounting, or in recovery. */ + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || + test_bit(SDF_WITHDRAWN, &sdp->sd_flags) || + test_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags)) + return; + + /* We only care when a node wants us to unlock, because that means + * they want a journal recovered. 
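+	 * Such a request arrives as a demote to LM_ST_UNLOCKED, which is
+	 * exactly what we test for below.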
*/ + if (gl->gl_demote_state != LM_ST_UNLOCKED) + return; + + if (sdp->sd_args.ar_spectator) { + fs_warn(sdp, "Spectator node cannot recover journals.\n"); + return; + } + + fs_warn(sdp, "Some node has withdrawn; checking for recovery.\n"); + set_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags); + /* + * We can't call remote_withdraw directly here or gfs2_recover_journal + * because this is called from the glock unlock function and the + * remote_withdraw needs to enqueue and dequeue the same "live" glock + * we were called from. So we queue it to the control work queue in + * lock_dlm. + */ + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); +} + +const struct gfs2_glock_operations gfs2_meta_glops = { + .go_type = LM_TYPE_META, + .go_flags = GLOF_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_inode_glops = { + .go_sync = inode_go_sync, + .go_inval = inode_go_inval, + .go_demote_ok = inode_go_demote_ok, + .go_instantiate = inode_go_instantiate, + .go_held = inode_go_held, + .go_dump = inode_go_dump, + .go_type = LM_TYPE_INODE, + .go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB, + .go_free = inode_go_free, +}; + +const struct gfs2_glock_operations gfs2_rgrp_glops = { + .go_sync = rgrp_go_sync, + .go_inval = rgrp_go_inval, + .go_instantiate = gfs2_rgrp_go_instantiate, + .go_dump = gfs2_rgrp_go_dump, + .go_type = LM_TYPE_RGRP, + .go_flags = GLOF_LVB, +}; + +const struct gfs2_glock_operations gfs2_freeze_glops = { + .go_xmote_bh = freeze_go_xmote_bh, + .go_demote_ok = freeze_go_demote_ok, + .go_callback = freeze_go_callback, + .go_type = LM_TYPE_NONDISK, + .go_flags = GLOF_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_iopen_glops = { + .go_type = LM_TYPE_IOPEN, + .go_callback = iopen_go_callback, + .go_dump = inode_go_dump, + .go_flags = GLOF_LRU | GLOF_NONDISK, + .go_subclass = 1, +}; + +const struct gfs2_glock_operations gfs2_flock_glops = { + .go_type = LM_TYPE_FLOCK, + .go_flags = GLOF_LRU | GLOF_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_nondisk_glops = { + .go_type = LM_TYPE_NONDISK, + .go_flags = GLOF_NONDISK, + .go_callback = nondisk_go_callback, +}; + +const struct gfs2_glock_operations gfs2_quota_glops = { + .go_type = LM_TYPE_QUOTA, + .go_flags = GLOF_LVB | GLOF_LRU | GLOF_NONDISK, +}; + +const struct gfs2_glock_operations gfs2_journal_glops = { + .go_type = LM_TYPE_JOURNAL, + .go_flags = GLOF_NONDISK, +}; + +const struct gfs2_glock_operations *gfs2_glops_list[] = { + [LM_TYPE_META] = &gfs2_meta_glops, + [LM_TYPE_INODE] = &gfs2_inode_glops, + [LM_TYPE_RGRP] = &gfs2_rgrp_glops, + [LM_TYPE_IOPEN] = &gfs2_iopen_glops, + [LM_TYPE_FLOCK] = &gfs2_flock_glops, + [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, + [LM_TYPE_QUOTA] = &gfs2_quota_glops, + [LM_TYPE_JOURNAL] = &gfs2_journal_glops, +}; + diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h new file mode 100644 index 0000000000..695898afca --- /dev/null +++ b/fs/gfs2/glops.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#ifndef __GLOPS_DOT_H__ +#define __GLOPS_DOT_H__ + +#include "incore.h" + +extern struct workqueue_struct *gfs2_freeze_wq; + +extern const struct gfs2_glock_operations gfs2_meta_glops; +extern const struct gfs2_glock_operations gfs2_inode_glops; +extern const struct gfs2_glock_operations gfs2_rgrp_glops; +extern const struct gfs2_glock_operations gfs2_freeze_glops; +extern const struct gfs2_glock_operations gfs2_iopen_glops; +extern const struct gfs2_glock_operations gfs2_flock_glops; +extern const struct gfs2_glock_operations gfs2_nondisk_glops; +extern const struct gfs2_glock_operations gfs2_quota_glops; +extern const struct gfs2_glock_operations gfs2_journal_glops; +extern const struct gfs2_glock_operations *gfs2_glops_list[]; + +extern int gfs2_inode_metasync(struct gfs2_glock *gl); +extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); + +#endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h new file mode 100644 index 0000000000..a8c95c5293 --- /dev/null +++ b/fs/gfs2/incore.h @@ -0,0 +1,874 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + */ + +#ifndef __INCORE_DOT_H__ +#define __INCORE_DOT_H__ + +#include <linux/fs.h> +#include <linux/kobject.h> +#include <linux/workqueue.h> +#include <linux/dlm.h> +#include <linux/buffer_head.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/completion.h> +#include <linux/rbtree.h> +#include <linux/ktime.h> +#include <linux/percpu.h> +#include <linux/lockref.h> +#include <linux/rhashtable.h> +#include <linux/mutex.h> + +#define DIO_WAIT 0x00000010 +#define DIO_METADATA 0x00000020 + +struct gfs2_log_operations; +struct gfs2_bufdata; +struct gfs2_holder; +struct gfs2_glock; +struct gfs2_quota_data; +struct gfs2_trans; +struct gfs2_jdesc; +struct gfs2_sbd; +struct lm_lockops; + +typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); + +struct gfs2_log_header_host { + u64 lh_sequence; /* Sequence number of this transaction */ + u32 lh_flags; /* GFS2_LOG_HEAD_... */ + u32 lh_tail; /* Block number of log tail */ + u32 lh_blkno; + + s64 lh_local_total; + s64 lh_local_free; + s64 lh_local_dinodes; +}; + +/* + * Structure of operations that are associated with each + * type of element in the log. + */ + +struct gfs2_log_operations { + void (*lo_before_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); + void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr); + void (*lo_before_scan) (struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass); + int (*lo_scan_elements) (struct gfs2_jdesc *jd, unsigned int start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass); + void (*lo_after_scan) (struct gfs2_jdesc *jd, int error, int pass); + const char *lo_name; +}; + +#define GBF_FULL 1 + +/** + * Clone bitmaps (bi_clone): + * + * - When a block is freed, we remember the previous state of the block in the + * clone bitmap, and only mark the block as free in the real bitmap. + * + * - When looking for a block to allocate, we check for a free block in the + * clone bitmap, and if no clone bitmap exists, in the real bitmap. + * + * - For allocating a block, we mark it as allocated in the real bitmap, and if + * a clone bitmap exists, also in the clone bitmap. 
+ * + * - At the end of a log_flush, we copy the real bitmap into the clone bitmap + * to make the clone bitmap reflect the current allocation state. + * (Alternatively, we could remove the clone bitmap.) + * + * The clone bitmaps are in-core only, and is never written to disk. + * + * These steps ensure that blocks which have been freed in a transaction cannot + * be reallocated in that same transaction. + */ +struct gfs2_bitmap { + struct buffer_head *bi_bh; + char *bi_clone; + unsigned long bi_flags; + u32 bi_offset; + u32 bi_start; + u32 bi_bytes; + u32 bi_blocks; +}; + +struct gfs2_rgrpd { + struct rb_node rd_node; /* Link with superblock */ + struct gfs2_glock *rd_gl; /* Glock for this rgrp */ + u64 rd_addr; /* grp block disk address */ + u64 rd_data0; /* first data location */ + u32 rd_length; /* length of rgrp header in fs blocks */ + u32 rd_data; /* num of data blocks in rgrp */ + u32 rd_bitbytes; /* number of bytes in data bitmaps */ + u32 rd_free; + u32 rd_requested; /* number of blocks in rd_rstree */ + u32 rd_reserved; /* number of reserved blocks */ + u32 rd_free_clone; + u32 rd_dinodes; + u64 rd_igeneration; + struct gfs2_bitmap *rd_bits; + struct gfs2_sbd *rd_sbd; + struct gfs2_rgrp_lvb *rd_rgl; + u32 rd_last_alloc; + u32 rd_flags; + u32 rd_extfail_pt; /* extent failure point */ +#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ +#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ +#define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ +#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ + spinlock_t rd_rsspin; /* protects reservation related vars */ + struct mutex rd_mutex; + struct rb_root rd_rstree; /* multi-block reservation tree */ +}; + +enum gfs2_state_bits { + BH_Pinned = BH_PrivateStart, + BH_Escaped = BH_PrivateStart + 1, +}; + +BUFFER_FNS(Pinned, pinned) +TAS_BUFFER_FNS(Pinned, pinned) +BUFFER_FNS(Escaped, escaped) +TAS_BUFFER_FNS(Escaped, escaped) + +struct gfs2_bufdata { + struct buffer_head *bd_bh; + struct gfs2_glock *bd_gl; + u64 bd_blkno; + + struct list_head bd_list; + + struct gfs2_trans *bd_tr; + struct list_head bd_ail_st_list; + struct list_head bd_ail_gl_list; +}; + +/* + * Internally, we prefix things with gdlm_ and GDLM_ (for gfs-dlm) since a + * prefix of lock_dlm_ gets awkward. + */ + +#define GDLM_STRNAME_BYTES 25 +#define GDLM_LVB_SIZE 32 + +/* + * ls_recover_flags: + * + * DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been + * held by failed nodes whose journals need recovery. Those locks should + * only be used for journal recovery until the journal recovery is done. + * This is set by the dlm recover_prep callback and cleared by the + * gfs2_control thread when journal recovery is complete. To avoid + * races between recover_prep setting and gfs2_control clearing, recover_spin + * is held while changing this bit and reading/writing recover_block + * and recover_start. + * + * DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used. + * + * DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing + * recovery of all journals before allowing other nodes to mount the fs. + * This is cleared when FIRST_MOUNT_DONE is set. + * + * DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished + * recovery of all journals, and now allows other nodes to mount the fs. + * + * DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared + * BLOCK_LOCKS for the first time. 
The gfs2_control thread should now + * control clearing BLOCK_LOCKS for further recoveries. + * + * DFL_UNMOUNT: gdlm_unmount sets to keep sdp off gfs2_control_wq. + * + * DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep() + * and recover_done(), i.e. set while recover_block == recover_start. + */ + +enum { + DFL_BLOCK_LOCKS = 0, + DFL_NO_DLM_OPS = 1, + DFL_FIRST_MOUNT = 2, + DFL_FIRST_MOUNT_DONE = 3, + DFL_MOUNT_DONE = 4, + DFL_UNMOUNT = 5, + DFL_DLM_RECOVERY = 6, +}; + +/* + * We are using struct lm_lockname as an rhashtable key. Avoid holes within + * the struct; padding at the end is fine. + */ +struct lm_lockname { + u64 ln_number; + struct gfs2_sbd *ln_sbd; + unsigned int ln_type; +}; + +#define lm_name_equal(name1, name2) \ + (((name1)->ln_number == (name2)->ln_number) && \ + ((name1)->ln_type == (name2)->ln_type) && \ + ((name1)->ln_sbd == (name2)->ln_sbd)) + + +struct gfs2_glock_operations { + int (*go_sync) (struct gfs2_glock *gl); + int (*go_xmote_bh)(struct gfs2_glock *gl); + void (*go_inval) (struct gfs2_glock *gl, int flags); + int (*go_demote_ok) (const struct gfs2_glock *gl); + int (*go_instantiate) (struct gfs2_glock *gl); + int (*go_held)(struct gfs2_holder *gh); + void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl, + const char *fs_id_buf); + void (*go_callback)(struct gfs2_glock *gl, bool remote); + void (*go_free)(struct gfs2_glock *gl); + const int go_subclass; + const int go_type; + const unsigned long go_flags; +#define GLOF_ASPACE 1 /* address space attached */ +#define GLOF_LVB 2 /* Lock Value Block attached */ +#define GLOF_LRU 4 /* LRU managed */ +#define GLOF_NONDISK 8 /* not I/O related */ +}; + +enum { + GFS2_LKS_SRTT = 0, /* Non blocking smoothed round trip time */ + GFS2_LKS_SRTTVAR = 1, /* Non blocking smoothed variance */ + GFS2_LKS_SRTTB = 2, /* Blocking smoothed round trip time */ + GFS2_LKS_SRTTVARB = 3, /* Blocking smoothed variance */ + GFS2_LKS_SIRT = 4, /* Smoothed Inter-request time */ + GFS2_LKS_SIRTVAR = 5, /* Smoothed Inter-request variance */ + GFS2_LKS_DCOUNT = 6, /* Count of dlm requests */ + GFS2_LKS_QCOUNT = 7, /* Count of gfs2_holder queues */ + GFS2_NR_LKSTATS +}; + +struct gfs2_lkstats { + u64 stats[GFS2_NR_LKSTATS]; +}; + +enum { + /* States */ + HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ + HIF_WAIT = 10, +}; + +struct gfs2_holder { + struct list_head gh_list; + + struct gfs2_glock *gh_gl; + struct pid *gh_owner_pid; + u16 gh_flags; + u16 gh_state; + + int gh_error; + unsigned long gh_iflags; /* HIF_... */ + unsigned long gh_ip; +}; + +/* Number of quota types we support */ +#define GFS2_MAXQUOTAS 2 + +struct gfs2_qadata { /* quota allocation data */ + /* Quota stuff */ + struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS]; + struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS]; + unsigned int qa_qd_num; + int qa_ref; +}; + +/* Resource group multi-block reservation, in order of appearance: + + Step 1. Function prepares to write, allocates a mb, sets the size hint. + Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info + Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use + Step 4. Bits are assigned from the rgrp based on either the reservation + or wherever it can. 
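+
+   Steps 2-4 can be seen together in alloc_dinode() in inode.c:
+   gfs2_inplace_reserve() targets an rgrp and gfs2_alloc_blocks()
+   assigns the bits; gfs2_inplace_release() then drops whatever part
+   of the reservation was not used.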
+*/ + +struct gfs2_blkreserv { + struct rb_node rs_node; /* node within rd_rstree */ + struct gfs2_rgrpd *rs_rgd; + u64 rs_start; + u32 rs_requested; + u32 rs_reserved; /* number of reserved blocks */ +}; + +/* + * Allocation parameters + * @target: The number of blocks we'd ideally like to allocate + * @aflags: The flags (e.g. Orlov flag) + * + * The intent is to gradually expand this structure over time in + * order to give more information, e.g. alignment, min extent size + * to the allocation code. + */ +struct gfs2_alloc_parms { + u64 target; + u32 min_target; + u32 aflags; + u64 allowed; +}; + +enum { + GLF_LOCK = 1, + GLF_INSTANTIATE_NEEDED = 2, /* needs instantiate */ + GLF_DEMOTE = 3, + GLF_PENDING_DEMOTE = 4, + GLF_DEMOTE_IN_PROGRESS = 5, + GLF_DIRTY = 6, + GLF_LFLUSH = 7, + GLF_INVALIDATE_IN_PROGRESS = 8, + GLF_REPLY_PENDING = 9, + GLF_INITIAL = 10, + GLF_FROZEN = 11, + GLF_INSTANTIATE_IN_PROG = 12, /* instantiate happening now */ + GLF_LRU = 13, + GLF_OBJECT = 14, /* Used only for tracing */ + GLF_BLOCKING = 15, + GLF_FREEING = 16, /* Wait for glock to be freed */ + GLF_TRY_TO_EVICT = 17, /* iopen glocks only */ + GLF_VERIFY_EVICT = 18, /* iopen glocks only */ +}; + +struct gfs2_glock { + unsigned long gl_flags; /* GLF_... */ + struct lm_lockname gl_name; + + struct lockref gl_lockref; + + /* State fields protected by gl_lockref.lock */ + unsigned int gl_state:2, /* Current state */ + gl_target:2, /* Target state */ + gl_demote_state:2, /* State requested by remote node */ + gl_req:2, /* State in last dlm request */ + gl_reply:8; /* Last reply from the dlm */ + + unsigned long gl_demote_time; /* time of first demote request */ + long gl_hold_time; + struct list_head gl_holders; + + const struct gfs2_glock_operations *gl_ops; + ktime_t gl_dstamp; + struct gfs2_lkstats gl_stats; + struct dlm_lksb gl_lksb; + unsigned long gl_tchange; + void *gl_object; + + struct list_head gl_lru; + struct list_head gl_ail_list; + atomic_t gl_ail_count; + atomic_t gl_revokes; + struct delayed_work gl_work; + /* For iopen glocks only */ + struct { + struct delayed_work gl_delete; + u64 gl_no_formal_ino; + }; + struct rcu_head gl_rcu; + struct rhash_head gl_node; +}; + +enum { + GIF_QD_LOCKED = 1, + GIF_ALLOC_FAILED = 2, + GIF_SW_PAGED = 3, + GIF_FREE_VFS_INODE = 5, + GIF_GLOP_PENDING = 6, + GIF_DEFERRED_DELETE = 7, +}; + +struct gfs2_inode { + struct inode i_inode; + u64 i_no_addr; + u64 i_no_formal_ino; + u64 i_generation; + u64 i_eattr; + unsigned long i_flags; /* GIF_... */ + struct gfs2_glock *i_gl; + struct gfs2_holder i_iopen_gh; + struct gfs2_qadata *i_qadata; /* quota allocation data */ + struct gfs2_holder i_rgd_gh; + struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */ + u64 i_goal; /* goal block for allocations */ + atomic_t i_sizehint; /* hint of the write size */ + struct rw_semaphore i_rw_mutex; + struct list_head i_ordered; + __be64 *i_hash_cache; + u32 i_entries; + u32 i_diskflags; + u8 i_height; + u8 i_depth; + u16 i_rahead; +}; + +/* + * Since i_inode is the first element of struct gfs2_inode, + * this is effectively a cast. 
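+ * (container_of() subtracts offsetof(struct gfs2_inode, i_inode),
+ * which is zero here, so no pointer adjustment is performed.)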
+ */ +static inline struct gfs2_inode *GFS2_I(struct inode *inode) +{ + return container_of(inode, struct gfs2_inode, i_inode); +} + +static inline struct gfs2_sbd *GFS2_SB(const struct inode *inode) +{ + return inode->i_sb->s_fs_info; +} + +struct gfs2_file { + struct mutex f_fl_mutex; + struct gfs2_holder f_fl_gh; +}; + +struct gfs2_revoke_replay { + struct list_head rr_list; + u64 rr_blkno; + unsigned int rr_where; +}; + +enum { + QDF_CHANGE = 1, + QDF_LOCKED = 2, + QDF_REFRESH = 3, + QDF_QMSG_QUIET = 4, +}; + +struct gfs2_quota_data { + struct hlist_bl_node qd_hlist; + struct list_head qd_list; + struct kqid qd_id; + struct gfs2_sbd *qd_sbd; + struct lockref qd_lockref; + struct list_head qd_lru; + unsigned qd_hash; + + unsigned long qd_flags; /* QDF_... */ + + s64 qd_change; + s64 qd_change_sync; + + unsigned int qd_slot; + unsigned int qd_slot_ref; + + struct buffer_head *qd_bh; + struct gfs2_quota_change *qd_bh_qc; + unsigned int qd_bh_count; + + struct gfs2_glock *qd_gl; + struct gfs2_quota_lvb qd_qb; + + u64 qd_sync_gen; + unsigned long qd_last_warn; + struct rcu_head qd_rcu; +}; + +enum { + TR_TOUCHED = 1, + TR_ATTACHED = 2, + TR_ONSTACK = 3, +}; + +struct gfs2_trans { + unsigned long tr_ip; + + unsigned int tr_blocks; + unsigned int tr_revokes; + unsigned int tr_reserved; + unsigned long tr_flags; + + unsigned int tr_num_buf_new; + unsigned int tr_num_databuf_new; + unsigned int tr_num_buf_rm; + unsigned int tr_num_databuf_rm; + unsigned int tr_num_revoke; + + struct list_head tr_list; + struct list_head tr_databuf; + struct list_head tr_buf; + + unsigned int tr_first; + struct list_head tr_ail1_list; + struct list_head tr_ail2_list; +}; + +struct gfs2_journal_extent { + struct list_head list; + + unsigned int lblock; /* First logical block */ + u64 dblock; /* First disk block */ + u64 blocks; +}; + +struct gfs2_jdesc { + struct list_head jd_list; + struct list_head extent_list; + unsigned int nr_extents; + struct work_struct jd_work; + struct inode *jd_inode; + struct bio *jd_log_bio; + unsigned long jd_flags; +#define JDF_RECOVERY 1 + unsigned int jd_jid; + u32 jd_blocks; + int jd_recover_error; + /* Replay stuff */ + + unsigned int jd_found_blocks; + unsigned int jd_found_revokes; + unsigned int jd_replayed_blocks; + + struct list_head jd_revoke_list; + unsigned int jd_replay_tail; + + u64 jd_no_addr; +}; + +struct gfs2_statfs_change_host { + s64 sc_total; + s64 sc_free; + s64 sc_dinodes; +}; + +#define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF +#define GFS2_QUOTA_OFF 0 +#define GFS2_QUOTA_ACCOUNT 1 +#define GFS2_QUOTA_ON 2 +#define GFS2_QUOTA_QUIET 3 /* on but not complaining */ + +#define GFS2_DATA_DEFAULT GFS2_DATA_ORDERED +#define GFS2_DATA_WRITEBACK 1 +#define GFS2_DATA_ORDERED 2 + +#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW +#define GFS2_ERRORS_WITHDRAW 0 +#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */ +#define GFS2_ERRORS_RO 2 /* place holder for future feature */ +#define GFS2_ERRORS_PANIC 3 + +struct gfs2_args { + char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ + char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ + char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ + unsigned int ar_spectator:1; /* Don't get a journal */ + unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ + unsigned int ar_debug:1; /* Oops on errors */ + unsigned int ar_posix_acl:1; /* Enable posix acls */ + unsigned int ar_quota:2; /* off/account/on */ + unsigned int ar_suiddir:1; /* suiddir support */ + unsigned int 
ar_data:2; /* ordered/writeback */ + unsigned int ar_meta:1; /* mount metafs */ + unsigned int ar_discard:1; /* discard requests */ + unsigned int ar_errors:2; /* errors=withdraw | panic */ + unsigned int ar_nobarrier:1; /* do not send barriers */ + unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ + unsigned int ar_got_rgrplvb:1; /* Was the rgrplvb opt given? */ + unsigned int ar_loccookie:1; /* use location based readdir + cookies */ + s32 ar_commit; /* Commit interval */ + s32 ar_statfs_quantum; /* The fast statfs interval */ + s32 ar_quota_quantum; /* The quota interval */ + s32 ar_statfs_percent; /* The % change to force sync */ +}; + +struct gfs2_tune { + spinlock_t gt_spin; + + unsigned int gt_logd_secs; + + unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ + unsigned int gt_quota_scale_num; /* Numerator */ + unsigned int gt_quota_scale_den; /* Denominator */ + unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ + unsigned int gt_new_files_jdata; + unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ + unsigned int gt_complain_secs; + unsigned int gt_statfs_quantum; + unsigned int gt_statfs_slow; +}; + +enum { + SDF_JOURNAL_CHECKED = 0, + SDF_JOURNAL_LIVE = 1, + SDF_WITHDRAWN = 2, + SDF_NOBARRIERS = 3, + SDF_NORECOVERY = 4, + SDF_DEMOTE = 5, + SDF_NOJOURNALID = 6, + SDF_RORECOVERY = 7, /* read only recovery */ + SDF_SKIP_DLM_UNLOCK = 8, + SDF_FORCE_AIL_FLUSH = 9, + SDF_FREEZE_INITIATOR = 10, + SDF_WITHDRAWING = 11, /* Will withdraw eventually */ + SDF_WITHDRAW_IN_PROG = 12, /* Withdraw is in progress */ + SDF_REMOTE_WITHDRAW = 13, /* Performing remote recovery */ + SDF_WITHDRAW_RECOVERY = 14, /* Wait for journal recovery when we are + withdrawing */ + SDF_KILL = 15, + SDF_EVICTING = 16, + SDF_FROZEN = 17, +}; + +#define GFS2_FSNAME_LEN 256 + +struct gfs2_inum_host { + u64 no_formal_ino; + u64 no_addr; +}; + +struct gfs2_sb_host { + u32 sb_magic; + u32 sb_type; + + u32 sb_fs_format; + u32 sb_multihost_format; + u32 sb_bsize; + u32 sb_bsize_shift; + + struct gfs2_inum_host sb_master_dir; + struct gfs2_inum_host sb_root_dir; + + char sb_lockproto[GFS2_LOCKNAME_LEN]; + char sb_locktable[GFS2_LOCKNAME_LEN]; +}; + +/* + * lm_mount() return values + * + * ls_jid - the journal ID this node should use + * ls_first - this node is the first to mount the file system + * ls_lockspace - lock module's context for this file system + * ls_ops - lock module's functions + */ + +struct lm_lockstruct { + int ls_jid; + unsigned int ls_first; + const struct lm_lockops *ls_ops; + dlm_lockspace_t *ls_dlm; + + int ls_recover_jid_done; /* These two are deprecated, */ + int ls_recover_jid_status; /* used previously by gfs_controld */ + + struct dlm_lksb ls_mounted_lksb; /* mounted_lock */ + struct dlm_lksb ls_control_lksb; /* control_lock */ + char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */ + struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */ + char *ls_lvb_bits; + + spinlock_t ls_recover_spin; /* protects following fields */ + unsigned long ls_recover_flags; /* DFL_ */ + uint32_t ls_recover_mount; /* gen in first recover_done cb */ + uint32_t ls_recover_start; /* gen in last recover_done cb */ + uint32_t ls_recover_block; /* copy recover_start in last recover_prep */ + uint32_t ls_recover_size; /* size of recover_submit, recover_result */ + uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */ + uint32_t *ls_recover_result; /* result of last jid recovery */ +}; + +struct gfs2_pcpu_lkstats { + /* One 
struct for each glock type */ + struct gfs2_lkstats lkstats[10]; +}; + +/* List of local (per node) statfs inodes */ +struct local_statfs_inode { + struct list_head si_list; + struct inode *si_sc_inode; + unsigned int si_jid; /* journal id this statfs inode corresponds to */ +}; + +struct gfs2_sbd { + struct super_block *sd_vfs; + struct gfs2_pcpu_lkstats __percpu *sd_lkstats; + struct kobject sd_kobj; + struct completion sd_kobj_unregister; + unsigned long sd_flags; /* SDF_... */ + struct gfs2_sb_host sd_sb; + + /* Constants computed on mount */ + + u32 sd_fsb2bb; + u32 sd_fsb2bb_shift; + u32 sd_diptrs; /* Number of pointers in a dinode */ + u32 sd_inptrs; /* Number of pointers in a indirect block */ + u32 sd_ldptrs; /* Number of pointers in a log descriptor block */ + u32 sd_jbsize; /* Size of a journaled data block */ + u32 sd_hash_bsize; /* sizeof(exhash block) */ + u32 sd_hash_bsize_shift; + u32 sd_hash_ptrs; /* Number of pointers in a hash block */ + u32 sd_qc_per_block; + u32 sd_blocks_per_bitmap; + u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ + u32 sd_max_height; /* Max height of a file's metadata tree */ + u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; + u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */ + + struct gfs2_args sd_args; /* Mount arguments */ + struct gfs2_tune sd_tune; /* Filesystem tuning structure */ + + /* Lock Stuff */ + + struct lm_lockstruct sd_lockstruct; + struct gfs2_holder sd_live_gh; + struct gfs2_glock *sd_rename_gl; + struct gfs2_glock *sd_freeze_gl; + struct work_struct sd_freeze_work; + wait_queue_head_t sd_kill_wait; + wait_queue_head_t sd_async_glock_wait; + atomic_t sd_glock_disposal; + struct completion sd_locking_init; + struct completion sd_wdack; + struct delayed_work sd_control_work; + + /* Inode Stuff */ + + struct dentry *sd_master_dir; + struct dentry *sd_root_dir; + + struct inode *sd_jindex; + struct inode *sd_statfs_inode; + struct inode *sd_sc_inode; + struct list_head sd_sc_inodes_list; + struct inode *sd_qc_inode; + struct inode *sd_rindex; + struct inode *sd_quota_inode; + + /* StatFS stuff */ + + spinlock_t sd_statfs_spin; + struct gfs2_statfs_change_host sd_statfs_master; + struct gfs2_statfs_change_host sd_statfs_local; + int sd_statfs_force_sync; + + /* Resource group stuff */ + + int sd_rindex_uptodate; + spinlock_t sd_rindex_spin; + struct rb_root sd_rindex_tree; + unsigned int sd_rgrps; + unsigned int sd_max_rg_data; + + /* Journal index stuff */ + + struct list_head sd_jindex_list; + spinlock_t sd_jindex_spin; + struct mutex sd_jindex_mutex; + unsigned int sd_journals; + + struct gfs2_jdesc *sd_jdesc; + struct gfs2_holder sd_journal_gh; + struct gfs2_holder sd_jinode_gh; + struct gfs2_glock *sd_jinode_gl; + + struct gfs2_holder sd_sc_gh; + struct buffer_head *sd_sc_bh; + struct gfs2_holder sd_qc_gh; + + struct completion sd_journal_ready; + + /* Workqueue stuff */ + + struct workqueue_struct *sd_delete_wq; + + /* Daemon stuff */ + + struct task_struct *sd_logd_process; + struct task_struct *sd_quotad_process; + + /* Quota stuff */ + + struct list_head sd_quota_list; + atomic_t sd_quota_count; + struct mutex sd_quota_mutex; + struct mutex sd_quota_sync_mutex; + wait_queue_head_t sd_quota_wait; + + unsigned int sd_quota_slots; + unsigned long *sd_quota_bitmap; + spinlock_t sd_bitmap_lock; + + u64 sd_quota_sync_gen; + + /* Log stuff */ + + struct address_space sd_aspace; + + spinlock_t sd_log_lock; + + struct gfs2_trans *sd_log_tr; + unsigned int sd_log_blks_reserved; + + atomic_t 
sd_log_pinned; + unsigned int sd_log_num_revoke; + + struct list_head sd_log_revokes; + struct list_head sd_log_ordered; + spinlock_t sd_ordered_lock; + + atomic_t sd_log_thresh1; + atomic_t sd_log_thresh2; + atomic_t sd_log_blks_free; + atomic_t sd_log_blks_needed; + atomic_t sd_log_revokes_available; + wait_queue_head_t sd_log_waitq; + wait_queue_head_t sd_logd_waitq; + + u64 sd_log_sequence; + int sd_log_idle; + + struct rw_semaphore sd_log_flush_lock; + atomic_t sd_log_in_flight; + wait_queue_head_t sd_log_flush_wait; + int sd_log_error; /* First log error */ + wait_queue_head_t sd_withdraw_wait; + + unsigned int sd_log_tail; + unsigned int sd_log_flush_tail; + unsigned int sd_log_head; + unsigned int sd_log_flush_head; + + spinlock_t sd_ail_lock; + struct list_head sd_ail1_list; + struct list_head sd_ail2_list; + + /* For quiescing the filesystem */ + struct gfs2_holder sd_freeze_gh; + struct mutex sd_freeze_mutex; + + char sd_fsname[GFS2_FSNAME_LEN + 3 * sizeof(int) + 2]; + char sd_table_name[GFS2_FSNAME_LEN]; + char sd_proto_name[GFS2_FSNAME_LEN]; + + /* Debugging crud */ + + unsigned long sd_last_warning; + struct dentry *debugfs_dir; /* debugfs directory */ + unsigned long sd_glock_dqs_held; +}; + +static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) +{ + gl->gl_stats.stats[which]++; +} + +static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) +{ + const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + preempt_disable(); + this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++; + preempt_enable(); +} + +extern struct gfs2_rgrpd *gfs2_glock2rgrp(struct gfs2_glock *gl); + +static inline unsigned gfs2_max_stuffed_size(const struct gfs2_inode *ip) +{ + return GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); +} + +#endif /* __INCORE_DOT_H__ */ + diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c new file mode 100644 index 0000000000..4e63fbb631 --- /dev/null +++ b/fs/gfs2/inode.c @@ -0,0 +1,2219 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2011 Red Hat, Inc. All rights reserved. + */ + +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/namei.h> +#include <linux/mm.h> +#include <linux/cred.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/iomap.h> +#include <linux/security.h> +#include <linux/fiemap.h> +#include <linux/uaccess.h> + +#include "gfs2.h" +#include "incore.h" +#include "acl.h" +#include "bmap.h" +#include "dir.h" +#include "xattr.h" +#include "glock.h" +#include "inode.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "super.h" +#include "glops.h" + +static const struct inode_operations gfs2_file_iops; +static const struct inode_operations gfs2_dir_iops; +static const struct inode_operations gfs2_symlink_iops; + +/** + * gfs2_set_iop - Sets inode operations + * @inode: The inode with correct i_mode filled in + * + * GFS2 lookup code fills in vfs inode contents based on info obtained + * from directory entry inside gfs2_inode_lookup(). 
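+ * It must only be called once i_mode is valid, since the inode and
+ * file operation vectors are chosen purely from the file type bits.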
+ */ + +static void gfs2_set_iop(struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(inode); + umode_t mode = inode->i_mode; + + if (S_ISREG(mode)) { + inode->i_op = &gfs2_file_iops; + if (gfs2_localflocks(sdp)) + inode->i_fop = &gfs2_file_fops_nolock; + else + inode->i_fop = &gfs2_file_fops; + } else if (S_ISDIR(mode)) { + inode->i_op = &gfs2_dir_iops; + if (gfs2_localflocks(sdp)) + inode->i_fop = &gfs2_dir_fops_nolock; + else + inode->i_fop = &gfs2_dir_fops; + } else if (S_ISLNK(mode)) { + inode->i_op = &gfs2_symlink_iops; + } else { + inode->i_op = &gfs2_file_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + } +} + +static int iget_test(struct inode *inode, void *opaque) +{ + u64 no_addr = *(u64 *)opaque; + + return GFS2_I(inode)->i_no_addr == no_addr; +} + +static int iget_set(struct inode *inode, void *opaque) +{ + u64 no_addr = *(u64 *)opaque; + + GFS2_I(inode)->i_no_addr = no_addr; + inode->i_ino = no_addr; + return 0; +} + +/** + * gfs2_inode_lookup - Lookup an inode + * @sb: The super block + * @type: The type of the inode + * @no_addr: The inode number + * @no_formal_ino: The inode generation number + * @blktype: Requested block type (GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; + * GFS2_BLKST_FREE to indicate not to verify) + * + * If @type is DT_UNKNOWN, the inode type is fetched from disk. + * + * If @blktype is anything other than GFS2_BLKST_FREE (which is used as a + * placeholder because it doesn't otherwise make sense), the on-disk block type + * is verified to be @blktype. + * + * When @no_formal_ino is non-zero, this function will return ERR_PTR(-ESTALE) + * if it detects that @no_formal_ino doesn't match the actual inode generation + * number. However, it doesn't always know unless @type is DT_UNKNOWN. + * + * Returns: A VFS inode, or an error + */ + +struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, + u64 no_addr, u64 no_formal_ino, + unsigned int blktype) +{ + struct inode *inode; + struct gfs2_inode *ip; + struct gfs2_holder i_gh; + int error; + + gfs2_holder_mark_uninitialized(&i_gh); + inode = iget5_locked(sb, no_addr, iget_test, iget_set, &no_addr); + if (!inode) + return ERR_PTR(-ENOMEM); + + ip = GFS2_I(inode); + + if (inode->i_state & I_NEW) { + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_glock *io_gl; + int extra_flags = 0; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, + &ip->i_gl); + if (unlikely(error)) + goto fail; + + error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, + &io_gl); + if (unlikely(error)) + goto fail; + + /* + * The only caller that sets @blktype to GFS2_BLKST_UNLINKED is + * delete_work_func(). Make sure not to cancel the delete work + * from within itself here. + */ + if (blktype == GFS2_BLKST_UNLINKED) + extra_flags |= LM_FLAG_TRY; + else + gfs2_cancel_delete_work(io_gl); + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, + GL_EXACT | GL_NOPID | extra_flags, + &ip->i_iopen_gh); + gfs2_glock_put(io_gl); + if (unlikely(error)) + goto fail; + + if (type == DT_UNKNOWN || blktype != GFS2_BLKST_FREE) { + /* + * The GL_SKIP flag indicates to skip reading the inode + * block. We read the inode when instantiating it + * after possibly checking the block type. 
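+			 * (With GL_SKIP the instantiate step is
+			 * deferred; for DT_UNKNOWN it is triggered
+			 * explicitly via gfs2_instantiate() below.)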
+ */ + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, + GL_SKIP, &i_gh); + if (error) + goto fail; + + error = -ESTALE; + if (no_formal_ino && + gfs2_inode_already_deleted(ip->i_gl, no_formal_ino)) + goto fail; + + if (blktype != GFS2_BLKST_FREE) { + error = gfs2_check_blk_type(sdp, no_addr, + blktype); + if (error) + goto fail; + } + } + + set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags); + + /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ + inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); + inode->i_atime.tv_nsec = 0; + + glock_set_object(ip->i_gl, ip); + + if (type == DT_UNKNOWN) { + /* Inode glock must be locked already */ + error = gfs2_instantiate(&i_gh); + if (error) { + glock_clear_object(ip->i_gl, ip); + goto fail; + } + } else { + ip->i_no_formal_ino = no_formal_ino; + inode->i_mode = DT2IF(type); + } + + if (gfs2_holder_initialized(&i_gh)) + gfs2_glock_dq_uninit(&i_gh); + glock_set_object(ip->i_iopen_gh.gh_gl, ip); + + gfs2_set_iop(inode); + unlock_new_inode(inode); + } + + if (no_formal_ino && ip->i_no_formal_ino && + no_formal_ino != ip->i_no_formal_ino) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return inode; + +fail: + if (error == GLR_TRYFAILED) + error = -EAGAIN; + if (gfs2_holder_initialized(&ip->i_iopen_gh)) + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + if (gfs2_holder_initialized(&i_gh)) + gfs2_glock_dq_uninit(&i_gh); + if (ip->i_gl) { + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + } + iget_failed(inode); + return ERR_PTR(error); +} + +/** + * gfs2_lookup_by_inum - look up an inode by inode number + * @sdp: The super block + * @no_addr: The inode number + * @no_formal_ino: The inode generation number (0 for any) + * @blktype: Requested block type (see gfs2_inode_lookup) + */ +struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 no_formal_ino, unsigned int blktype) +{ + struct super_block *sb = sdp->sd_vfs; + struct inode *inode; + int error; + + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, no_addr, no_formal_ino, + blktype); + if (IS_ERR(inode)) + return inode; + + if (no_formal_ino) { + error = -EIO; + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) + goto fail_iput; + } + return inode; + +fail_iput: + iput(inode); + return ERR_PTR(error); +} + + +struct inode *gfs2_lookup_simple(struct inode *dip, const char *name) +{ + struct qstr qstr; + struct inode *inode; + gfs2_str2qstr(&qstr, name); + inode = gfs2_lookupi(dip, &qstr, 1); + /* gfs2_lookupi has inconsistent callers: vfs + * related routines expect NULL for no entry found, + * gfs2_lookup_simple callers expect ENOENT + * and do not check for NULL. + */ + if (IS_ERR_OR_NULL(inode)) + return inode ? inode : ERR_PTR(-ENOENT); + + /* + * Must not call back into the filesystem when allocating + * pages in the metadata inode's address space. + */ + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + return inode; +} + + +/** + * gfs2_lookupi - Look up a filename in a directory and return its inode + * @dir: The inode of the directory containing the inode to look-up + * @name: The name of the inode to look for + * @is_root: If 1, ignore the caller's permissions + * + * This can be called via the VFS filldir function when NFS is doing + * a readdirplus and the inode which its intending to stat isn't + * already in cache. In this case we must not take the directory glock + * again, since the readdir call will have already taken that lock. 
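+ * That is why the directory glock is only acquired below when
+ * gfs2_glock_is_locked_by_me() reports that we do not already hold it.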
+ * + * Returns: errno + */ + +struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root) +{ + struct super_block *sb = dir->i_sb; + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_holder d_gh; + int error = 0; + struct inode *inode = NULL; + + gfs2_holder_mark_uninitialized(&d_gh); + if (!name->len || name->len > GFS2_FNAMESIZE) + return ERR_PTR(-ENAMETOOLONG); + + if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) || + (name->len == 2 && memcmp(name->name, "..", 2) == 0 && + dir == d_inode(sb->s_root))) { + igrab(dir); + return dir; + } + + if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + if (error) + return ERR_PTR(error); + } + + if (!is_root) { + error = gfs2_permission(&nop_mnt_idmap, dir, MAY_EXEC); + if (error) + goto out; + } + + inode = gfs2_dir_search(dir, name, false); + if (IS_ERR(inode)) + error = PTR_ERR(inode); +out: + if (gfs2_holder_initialized(&d_gh)) + gfs2_glock_dq_uninit(&d_gh); + if (error == -ENOENT) + return NULL; + return inode ? inode : ERR_PTR(error); +} + +/** + * create_ok - OK to create a new on-disk inode here? + * @dip: Directory in which dinode is to be created + * @name: Name of new dinode + * @mode: + * + * Returns: errno + */ + +static int create_ok(struct gfs2_inode *dip, const struct qstr *name, + umode_t mode) +{ + int error; + + error = gfs2_permission(&nop_mnt_idmap, &dip->i_inode, + MAY_WRITE | MAY_EXEC); + if (error) + return error; + + /* Don't create entries in an unlinked directory */ + if (!dip->i_inode.i_nlink) + return -ENOENT; + + if (dip->i_entries == (u32)-1) + return -EFBIG; + if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) + return -EMLINK; + + return 0; +} + +static void munge_mode_uid_gid(const struct gfs2_inode *dip, + struct inode *inode) +{ + if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && + (dip->i_inode.i_mode & S_ISUID) && + !uid_eq(dip->i_inode.i_uid, GLOBAL_ROOT_UID)) { + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISUID; + else if (!uid_eq(dip->i_inode.i_uid, current_fsuid())) + inode->i_mode &= ~07111; + inode->i_uid = dip->i_inode.i_uid; + } else + inode->i_uid = current_fsuid(); + + if (dip->i_inode.i_mode & S_ISGID) { + if (S_ISDIR(inode->i_mode)) + inode->i_mode |= S_ISGID; + inode->i_gid = dip->i_inode.i_gid; + } else + inode->i_gid = current_fsgid(); +} + +static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; + int error; + + error = gfs2_quota_lock_check(ip, &ap); + if (error) + goto out; + + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_quota; + + error = gfs2_trans_begin(sdp, (*dblocks * RES_RG_BIT) + RES_STATFS + RES_QUOTA, 0); + if (error) + goto out_ipreserv; + + error = gfs2_alloc_blocks(ip, &ip->i_no_addr, dblocks, 1, &ip->i_generation); + if (error) + goto out_trans_end; + + ip->i_no_formal_ino = ip->i_generation; + ip->i_inode.i_ino = ip->i_no_addr; + ip->i_goal = ip->i_no_addr; + if (*dblocks > 1) + ip->i_eattr = ip->i_no_addr + 1; + +out_trans_end: + gfs2_trans_end(sdp); +out_ipreserv: + gfs2_inplace_release(ip); +out_quota: + gfs2_quota_unlock(ip); +out: + return error; +} + +static void gfs2_init_dir(struct buffer_head *dibh, + const struct gfs2_inode *parent) +{ + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); + + gfs2_qstr2dirent(&gfs2_qdot, 
GFS2_DIRENT_SIZE(gfs2_qdot.len), dent); + dent->de_inum = di->di_num; /* already GFS2 endian */ + dent->de_type = cpu_to_be16(DT_DIR); + + dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); + gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); + gfs2_inum_out(parent, dent); + dent->de_type = cpu_to_be16(DT_DIR); + +} + +/** + * gfs2_init_xattr - Initialise an xattr block for a new inode + * @ip: The inode in question + * + * This sets up an empty xattr block for a new inode, ready to + * take any ACLs, LSM xattrs, etc. + */ + +static void gfs2_init_xattr(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh; + struct gfs2_ea_header *ea; + + bh = gfs2_meta_new(ip->i_gl, ip->i_eattr); + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_metatype_set(bh, GFS2_METATYPE_EA, GFS2_FORMAT_EA); + gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); + + ea = GFS2_EA_BH2FIRST(bh); + ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize); + ea->ea_type = GFS2_EATYPE_UNUSED; + ea->ea_flags = GFS2_EAFLAG_LAST; + + brelse(bh); +} + +/** + * init_dinode - Fill in a new dinode structure + * @dip: The directory this inode is being created in + * @ip: The inode + * @symname: The symlink destination (if a symlink) + * + */ + +static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, + const char *symname) +{ + struct gfs2_dinode *di; + struct buffer_head *dibh; + + dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); + gfs2_trans_add_meta(ip->i_gl, dibh); + di = (struct gfs2_dinode *)dibh->b_data; + gfs2_dinode_out(ip, di); + + di->di_major = cpu_to_be32(imajor(&ip->i_inode)); + di->di_minor = cpu_to_be32(iminor(&ip->i_inode)); + di->__pad1 = 0; + di->__pad2 = 0; + di->__pad3 = 0; + memset(&di->__pad4, 0, sizeof(di->__pad4)); + memset(&di->di_reserved, 0, sizeof(di->di_reserved)); + gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); + + switch(ip->i_inode.i_mode & S_IFMT) { + case S_IFDIR: + gfs2_init_dir(dibh, dip); + break; + case S_IFLNK: + memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, ip->i_inode.i_size); + break; + } + + set_buffer_uptodate(dibh); + brelse(dibh); +} + +/** + * gfs2_trans_da_blks - Calculate number of blocks to link inode + * @dip: The directory we are linking into + * @da: The dir add information + * @nr_inodes: The number of inodes involved + * + * This calculate the number of blocks we need to reserve in a + * transaction to link @nr_inodes into a directory. In most cases + * @nr_inodes will be 2 (the directory plus the inode being linked in) + * but in case of rename, 4 may be required. 
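+ * The estimate adds up the directory blocks being created, the rgrp
+ * bitmap blocks that cover them, one dinode block per inode involved,
+ * and the quota and statfs change blocks.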
+ * + * Returns: Number of blocks + */ + +static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip, + const struct gfs2_diradd *da, + unsigned nr_inodes) +{ + return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) + + (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS; +} + +static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip, struct gfs2_diradd *da) +{ + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_alloc_parms ap = { .target = da->nr_blocks, }; + int error; + + if (da->nr_blocks) { + error = gfs2_quota_lock_check(dip, &ap); + if (error) + goto fail_quota_locks; + + error = gfs2_inplace_reserve(dip, &ap); + if (error) + goto fail_quota_locks; + + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0); + if (error) + goto fail_ipreserv; + } else { + error = gfs2_trans_begin(sdp, RES_LEAF + 2 * RES_DINODE, 0); + if (error) + goto fail_quota_locks; + } + + error = gfs2_dir_add(&dip->i_inode, name, ip, da); + + gfs2_trans_end(sdp); +fail_ipreserv: + gfs2_inplace_release(dip); +fail_quota_locks: + gfs2_quota_unlock(dip); + return error; +} + +static int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, + void *fs_info) +{ + const struct xattr *xattr; + int err = 0; + + for (xattr = xattr_array; xattr->name != NULL; xattr++) { + err = __gfs2_xattr_set(inode, xattr->name, xattr->value, + xattr->value_len, 0, + GFS2_EATYPE_SECURITY); + if (err < 0) + break; + } + return err; +} + +/** + * gfs2_create_inode - Create a new inode + * @dir: The parent directory + * @dentry: The new dentry + * @file: If non-NULL, the file which is being opened + * @mode: The permissions on the new inode + * @dev: For device nodes, this is the device number + * @symname: For symlinks, this is the link destination + * @size: The initial size of the inode (ignored for directories) + * @excl: Force fail if inode exists + * + * FIXME: Change to allocate the disk blocks and write them out in the same + * transaction. That way, we can no longer end up in a situation in which an + * inode is allocated, the node crashes, and the block looks like a valid + * inode. (With atomic creates in place, we will also no longer need to zero + * the link count and dirty the inode here on failure.) 
+ * + * Returns: 0 on success, or error code + */ + +static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, + struct file *file, + umode_t mode, dev_t dev, const char *symname, + unsigned int size, int excl) +{ + const struct qstr *name = &dentry->d_name; + struct posix_acl *default_acl, *acl; + struct gfs2_holder d_gh, gh; + struct inode *inode = NULL; + struct gfs2_inode *dip = GFS2_I(dir), *ip; + struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); + struct gfs2_glock *io_gl; + int error; + u32 aflags = 0; + unsigned blocks = 1; + struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; + + if (!name->len || name->len > GFS2_FNAMESIZE) + return -ENAMETOOLONG; + + error = gfs2_qa_get(dip); + if (error) + return error; + + error = gfs2_rindex_update(sdp); + if (error) + goto fail; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh); + if (error) + goto fail; + gfs2_holder_mark_uninitialized(&gh); + + error = create_ok(dip, name, mode); + if (error) + goto fail_gunlock; + + inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl); + error = PTR_ERR(inode); + if (!IS_ERR(inode)) { + if (S_ISDIR(inode->i_mode)) { + iput(inode); + inode = ERR_PTR(-EISDIR); + goto fail_gunlock; + } + d_instantiate(dentry, inode); + error = 0; + if (file) { + if (S_ISREG(inode->i_mode)) + error = finish_open(file, dentry, gfs2_open_common); + else + error = finish_no_open(file, NULL); + } + gfs2_glock_dq_uninit(&d_gh); + goto fail; + } else if (error != -ENOENT) { + goto fail_gunlock; + } + + error = gfs2_diradd_alloc_required(dir, name, &da); + if (error < 0) + goto fail_gunlock; + + inode = new_inode(sdp->sd_vfs); + error = -ENOMEM; + if (!inode) + goto fail_gunlock; + ip = GFS2_I(inode); + + error = posix_acl_create(dir, &mode, &default_acl, &acl); + if (error) + goto fail_gunlock; + + error = gfs2_qa_get(ip); + if (error) + goto fail_free_acls; + + inode->i_mode = mode; + set_nlink(inode, S_ISDIR(mode) ? 
2 : 1); + inode->i_rdev = dev; + inode->i_size = size; + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + munge_mode_uid_gid(dip, inode); + check_and_update_goal(dip); + ip->i_goal = dip->i_goal; + ip->i_diskflags = 0; + ip->i_eattr = 0; + ip->i_height = 0; + ip->i_depth = 0; + ip->i_entries = 0; + ip->i_no_addr = 0; /* Temporarily zero until real addr is assigned */ + + switch(mode & S_IFMT) { + case S_IFREG: + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || + gfs2_tune_get(sdp, gt_new_files_jdata)) + ip->i_diskflags |= GFS2_DIF_JDATA; + gfs2_set_aops(inode); + break; + case S_IFDIR: + ip->i_diskflags |= (dip->i_diskflags & GFS2_DIF_INHERIT_JDATA); + ip->i_diskflags |= GFS2_DIF_JDATA; + ip->i_entries = 2; + break; + } + + /* Force SYSTEM flag on all files and subdirs of a SYSTEM directory */ + if (dip->i_diskflags & GFS2_DIF_SYSTEM) + ip->i_diskflags |= GFS2_DIF_SYSTEM; + + gfs2_set_inode_flags(inode); + + if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) || + (dip->i_diskflags & GFS2_DIF_TOPDIR)) + aflags |= GFS2_AF_ORLOV; + + if (default_acl || acl) + blocks++; + + error = alloc_dinode(ip, aflags, &blocks); + if (error) + goto fail_free_inode; + + gfs2_set_inode_blocks(inode, blocks); + + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); + if (error) + goto fail_free_inode; + + error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); + if (error) + goto fail_free_inode; + gfs2_cancel_delete_work(io_gl); + +retry: + error = insert_inode_locked4(inode, ip->i_no_addr, iget_test, &ip->i_no_addr); + if (error == -EBUSY) + goto retry; + if (error) + goto fail_gunlock2; + + error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT | GL_NOPID, + &ip->i_iopen_gh); + if (error) + goto fail_gunlock2; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); + if (error) + goto fail_gunlock3; + + error = gfs2_trans_begin(sdp, blocks, 0); + if (error) + goto fail_gunlock3; + + if (blocks > 1) + gfs2_init_xattr(ip); + init_dinode(dip, ip, symname); + gfs2_trans_end(sdp); + + glock_set_object(ip->i_gl, ip); + glock_set_object(io_gl, ip); + gfs2_set_iop(inode); + + if (default_acl) { + error = __gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + if (error) + goto fail_gunlock4; + posix_acl_release(default_acl); + default_acl = NULL; + } + if (acl) { + error = __gfs2_set_acl(inode, acl, ACL_TYPE_ACCESS); + if (error) + goto fail_gunlock4; + posix_acl_release(acl); + acl = NULL; + } + + error = security_inode_init_security(&ip->i_inode, &dip->i_inode, name, + &gfs2_initxattrs, NULL); + if (error) + goto fail_gunlock4; + + error = link_dinode(dip, name, ip, &da); + if (error) + goto fail_gunlock4; + + mark_inode_dirty(inode); + d_instantiate(dentry, inode); + /* After instantiate, errors should result in evict which will destroy + * both inode and iopen glocks properly. 
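(Past this
+ * point there are no goto fail_* paths: the unlock/put sequence below
+ * runs on success and failure alike.)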
*/ + if (file) { + file->f_mode |= FMODE_CREATED; + error = finish_open(file, dentry, gfs2_open_common); + } + gfs2_glock_dq_uninit(&d_gh); + gfs2_qa_put(ip); + gfs2_glock_dq_uninit(&gh); + gfs2_glock_put(io_gl); + gfs2_qa_put(dip); + unlock_new_inode(inode); + return error; + +fail_gunlock4: + glock_clear_object(ip->i_gl, ip); + glock_clear_object(io_gl, ip); +fail_gunlock3: + gfs2_glock_dq_uninit(&ip->i_iopen_gh); +fail_gunlock2: + gfs2_glock_put(io_gl); +fail_free_inode: + if (ip->i_gl) { + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + } + gfs2_rs_deltree(&ip->i_res); + gfs2_qa_put(ip); +fail_free_acls: + posix_acl_release(default_acl); + posix_acl_release(acl); +fail_gunlock: + gfs2_dir_no_add(&da); + gfs2_glock_dq_uninit(&d_gh); + if (!IS_ERR_OR_NULL(inode)) { + set_bit(GIF_ALLOC_FAILED, &ip->i_flags); + clear_nlink(inode); + if (ip->i_no_addr) + mark_inode_dirty(inode); + if (inode->i_state & I_NEW) + iget_failed(inode); + else + iput(inode); + } + if (gfs2_holder_initialized(&gh)) + gfs2_glock_dq_uninit(&gh); +fail: + gfs2_qa_put(dip); + return error; +} + +/** + * gfs2_create - Create a file + * @idmap: idmap of the mount the inode was found from + * @dir: The directory in which to create the file + * @dentry: The dentry of the new file + * @mode: The mode of the new file + * @excl: Force fail if inode exists + * + * Returns: errno + */ + +static int gfs2_create(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl); +} + +/** + * __gfs2_lookup - Look up a filename in a directory and return its inode + * @dir: The directory inode + * @dentry: The dentry of the new inode + * @file: File to be opened + * + * + * Returns: errno + */ + +static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, + struct file *file) +{ + struct inode *inode; + struct dentry *d; + struct gfs2_holder gh; + struct gfs2_glock *gl; + int error; + + inode = gfs2_lookupi(dir, &dentry->d_name, 0); + if (inode == NULL) { + d_add(dentry, NULL); + return NULL; + } + if (IS_ERR(inode)) + return ERR_CAST(inode); + + gl = GFS2_I(inode)->i_gl; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) { + iput(inode); + return ERR_PTR(error); + } + + d = d_splice_alias(inode, dentry); + if (IS_ERR(d)) { + gfs2_glock_dq_uninit(&gh); + return d; + } + if (file && S_ISREG(inode->i_mode)) + error = finish_open(file, dentry, gfs2_open_common); + + gfs2_glock_dq_uninit(&gh); + if (error) { + dput(d); + return ERR_PTR(error); + } + return d; +} + +static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry, + unsigned flags) +{ + return __gfs2_lookup(dir, dentry, NULL); +} + +/** + * gfs2_link - Link to a file + * @old_dentry: The inode to link + * @dir: Add link to this directory + * @dentry: The name of the link + * + * Link the inode in "old_dentry" into the directory "dir" with the + * name in "dentry". 
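+ *
+ * Both glocks are taken in LM_ST_EXCLUSIVE (directory first), and the
+ * usual -EEXIST, -EFBIG and -EMLINK limits are checked before the
+ * entry is added and the link count bumped in a single transaction.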
+ * + * Returns: errno + */ + +static int gfs2_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = d_inode(old_dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder d_gh, gh; + struct buffer_head *dibh; + struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; + int error; + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + error = gfs2_qa_get(dip); + if (error) + return error; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + + error = gfs2_glock_nq(&d_gh); + if (error) + goto out_parent; + + error = gfs2_glock_nq(&gh); + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_gunlock; + + error = gfs2_permission(&nop_mnt_idmap, dir, MAY_WRITE | MAY_EXEC); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(dir, &dentry->d_name, NULL); + switch (error) { + case -ENOENT: + break; + case 0: + error = -EEXIST; + goto out_gunlock; + default: + goto out_gunlock; + } + + error = -EINVAL; + if (!dip->i_inode.i_nlink) + goto out_gunlock; + error = -EFBIG; + if (dip->i_entries == (u32)-1) + goto out_gunlock; + error = -EPERM; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + goto out_gunlock; + error = -EMLINK; + if (ip->i_inode.i_nlink == (u32)-1) + goto out_gunlock; + + error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da); + if (error < 0) + goto out_gunlock; + + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; + error = gfs2_quota_lock_check(dip, &ap); + if (error) + goto out_gunlock; + + error = gfs2_inplace_reserve(dip, &ap); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0); + if (error) + goto out_ipres; + } else { + error = gfs2_trans_begin(sdp, 2 * RES_DINODE + RES_LEAF, 0); + if (error) + goto out_ipres; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(dir, &dentry->d_name, ip, &da); + if (error) + goto out_brelse; + + gfs2_trans_add_meta(ip->i_gl, dibh); + inc_nlink(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); + ihold(inode); + d_instantiate(dentry, inode); + mark_inode_dirty(inode); + +out_brelse: + brelse(dibh); +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + if (da.nr_blocks) + gfs2_inplace_release(dip); +out_gunlock_q: + if (da.nr_blocks) + gfs2_quota_unlock(dip); +out_gunlock: + gfs2_dir_no_add(&da); + gfs2_glock_dq(&gh); +out_child: + gfs2_glock_dq(&d_gh); +out_parent: + gfs2_qa_put(dip); + gfs2_holder_uninit(&d_gh); + gfs2_holder_uninit(&gh); + return error; +} + +/* + * gfs2_unlink_ok - check to see that a inode is still in a directory + * @dip: the directory + * @name: the name of the file + * @ip: the inode + * + * Assumes that the lock on (at least) @dip is held. 
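+ *
+ * The checks, in order: @ip must not be immutable or append-only; the
+ * sticky bit on @dip is honoured (directory owner, file owner, or
+ * CAP_FOWNER); @dip itself must not be append-only; the caller needs
+ * MAY_WRITE | MAY_EXEC on @dip; and finally gfs2_dir_check() confirms
+ * that @name still refers to @ip.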
+ * + * Returns: 0 if the parent/child relationship is correct, errno if it isn't + */ + +static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip) +{ + int error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + return -EPERM; + + if ((dip->i_inode.i_mode & S_ISVTX) && + !uid_eq(dip->i_inode.i_uid, current_fsuid()) && + !uid_eq(ip->i_inode.i_uid, current_fsuid()) && !capable(CAP_FOWNER)) + return -EPERM; + + if (IS_APPEND(&dip->i_inode)) + return -EPERM; + + error = gfs2_permission(&nop_mnt_idmap, &dip->i_inode, + MAY_WRITE | MAY_EXEC); + if (error) + return error; + + return gfs2_dir_check(&dip->i_inode, name, ip); +} + +/** + * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it + * @dip: The parent directory + * @dentry: The dentry to unlink + * + * Called with all the locks and in a transaction. This will only be + * called for a directory after it has been checked to ensure it is empty. + * + * Returns: 0 on success, or an error + */ + +static int gfs2_unlink_inode(struct gfs2_inode *dip, + const struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + int error; + + error = gfs2_dir_del(dip, dentry); + if (error) + return error; + + ip->i_entries = 0; + inode_set_ctime_current(inode); + if (S_ISDIR(inode->i_mode)) + clear_nlink(inode); + else + drop_nlink(inode); + mark_inode_dirty(inode); + if (inode->i_nlink == 0) + gfs2_unlink_di(inode); + return 0; +} + + +/** + * gfs2_unlink - Unlink an inode (this does rmdir as well) + * @dir: The inode of the directory containing the inode to unlink + * @dentry: The file itself + * + * This routine uses the type of the inode as a flag to figure out + * whether this is an unlink or an rmdir. 
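+ *
+ * For a directory, the only differences are the emptiness check in
+ * gfs2_unlink() (a directory holding just "." and ".." has
+ * i_entries == 2) and that gfs2_unlink_inode() uses clear_nlink()
+ * rather than drop_nlink().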
+ * + * Returns: errno + */ + +static int gfs2_unlink(struct inode *dir, struct dentry *dentry) +{ + struct gfs2_inode *dip = GFS2_I(dir); + struct gfs2_sbd *sdp = GFS2_SB(dir); + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder d_gh, r_gh, gh; + struct gfs2_rgrpd *rgd; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + error = -EROFS; + + gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, &d_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); + if (!rgd) + goto out_inodes; + + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE, &r_gh); + + + error = gfs2_glock_nq(&d_gh); + if (error) + goto out_parent; + + error = gfs2_glock_nq(&gh); + if (error) + goto out_child; + + error = -ENOENT; + if (inode->i_nlink == 0) + goto out_rgrp; + + if (S_ISDIR(inode->i_mode)) { + error = -ENOTEMPTY; + if (ip->i_entries > 2 || inode->i_nlink > 2) + goto out_rgrp; + } + + error = gfs2_glock_nq(&r_gh); /* rgrp */ + if (error) + goto out_rgrp; + + error = gfs2_unlink_ok(dip, &dentry->d_name, ip); + if (error) + goto out_gunlock; + + error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); + if (error) + goto out_gunlock; + + error = gfs2_unlink_inode(dip, dentry); + gfs2_trans_end(sdp); + +out_gunlock: + gfs2_glock_dq(&r_gh); +out_rgrp: + gfs2_glock_dq(&gh); +out_child: + gfs2_glock_dq(&d_gh); +out_parent: + gfs2_holder_uninit(&r_gh); +out_inodes: + gfs2_holder_uninit(&gh); + gfs2_holder_uninit(&d_gh); + return error; +} + +/** + * gfs2_symlink - Create a symlink + * @idmap: idmap of the mount the inode was found from + * @dir: The directory to create the symlink in + * @dentry: The dentry to put the symlink in + * @symname: The thing which the link points to + * + * Returns: errno + */ + +static int gfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, const char *symname) +{ + unsigned int size; + + size = strlen(symname); + if (size >= gfs2_max_stuffed_size(GFS2_I(dir))) + return -ENAMETOOLONG; + + return gfs2_create_inode(dir, dentry, NULL, S_IFLNK | S_IRWXUGO, 0, symname, size, 0); +} + +/** + * gfs2_mkdir - Make a directory + * @idmap: idmap of the mount the inode was found from + * @dir: The parent directory of the new one + * @dentry: The dentry of the new directory + * @mode: The mode of the new directory + * + * Returns: errno + */ + +static int gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); + return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0); +} + +/** + * gfs2_mknod - Make a special file + * @idmap: idmap of the mount the inode was found from + * @dir: The directory in which the special file will reside + * @dentry: The dentry of the special file + * @mode: The mode of the special file + * @dev: The device specification of the special file + * + */ + +static int gfs2_mknod(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) +{ + return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0); +} + +/** + * gfs2_atomic_open - Atomically open a file + * @dir: The directory + * @dentry: The proposed new entry + * @file: The proposed new struct file + * @flags: open flags + * @mode: File mode + * + * Returns: error code or 0 for success + */ + +static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, + struct file 
*file, unsigned flags, + umode_t mode) +{ + struct dentry *d; + bool excl = !!(flags & O_EXCL); + + if (!d_in_lookup(dentry)) + goto skip_lookup; + + d = __gfs2_lookup(dir, dentry, file); + if (IS_ERR(d)) + return PTR_ERR(d); + if (d != NULL) + dentry = d; + if (d_really_is_positive(dentry)) { + if (!(file->f_mode & FMODE_OPENED)) + return finish_no_open(file, d); + dput(d); + return excl && (flags & O_CREAT) ? -EEXIST : 0; + } + + BUG_ON(d != NULL); + +skip_lookup: + if (!(flags & O_CREAT)) + return -ENOENT; + + return gfs2_create_inode(dir, dentry, file, S_IFREG | mode, 0, NULL, 0, excl); +} + +/* + * gfs2_ok_to_move - check if it's ok to move a directory to another directory + * @this: move this + * @to: to here + * + * Follow @to back to the root and make sure we don't encounter @this + * Assumes we already hold the rename lock. + * + * Returns: errno + */ + +static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) +{ + struct inode *dir = &to->i_inode; + struct super_block *sb = dir->i_sb; + struct inode *tmp; + int error = 0; + + igrab(dir); + + for (;;) { + if (dir == &this->i_inode) { + error = -EINVAL; + break; + } + if (dir == d_inode(sb->s_root)) { + error = 0; + break; + } + + tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1); + if (!tmp) { + error = -ENOENT; + break; + } + if (IS_ERR(tmp)) { + error = PTR_ERR(tmp); + break; + } + + iput(dir); + dir = tmp; + } + + iput(dir); + + return error; +} + +/** + * update_moved_ino - Update an inode that's being moved + * @ip: The inode being moved + * @ndip: The parent directory of the new filename + * @dir_rename: True of ip is a directory + * + * Returns: errno + */ + +static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip, + int dir_rename) +{ + if (dir_rename) + return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); + + inode_set_ctime_current(&ip->i_inode); + mark_inode_dirty_sync(&ip->i_inode); + return 0; +} + + +/** + * gfs2_rename - Rename a file + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file + * + * Returns: errno + */ + +static int gfs2_rename(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry) +{ + struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *ip = GFS2_I(d_inode(odentry)); + struct gfs2_inode *nip = NULL; + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[4], r_gh, rd_gh; + struct gfs2_rgrpd *nrgd; + unsigned int num_gh; + int dir_rename = 0; + struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, }; + unsigned int x; + int error; + + gfs2_holder_mark_uninitialized(&r_gh); + gfs2_holder_mark_uninitialized(&rd_gh); + if (d_really_is_positive(ndentry)) { + nip = GFS2_I(d_inode(ndentry)); + if (ip == nip) + return 0; + } + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + error = gfs2_qa_get(ndip); + if (error) + return error; + + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); + if (error) + goto out; + + if (S_ISDIR(ip->i_inode.i_mode)) { + dir_rename = 1; + /* don't move a directory into its subdir */ + error = gfs2_ok_to_move(ip, ndip); + if (error) + goto out_gunlock_r; + } + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE,GL_ASYNC, + ghs + num_gh); + num_gh++; + } + 
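/* The inode being renamed is locked after its parent directories; all
+ * holders use GL_ASYNC, so the requests are issued in parallel and
+ * collected by gfs2_glock_async_wait() below. */
+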
gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh); + num_gh++; + + if (nip) { + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, + ghs + num_gh); + num_gh++; + } + + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } + error = gfs2_glock_async_wait(num_gh, ghs); + if (error) + goto out_gunlock; + + if (nip) { + /* Grab the resource group glock for unlink flag twiddling. + * This is the case where the target dinode already exists + * so we unlink before doing the rename. + */ + nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1); + if (!nrgd) { + error = -ENOENT; + goto out_gunlock; + } + error = gfs2_glock_nq_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &rd_gh); + if (error) + goto out_gunlock; + } + + error = -ENOENT; + if (ip->i_inode.i_nlink == 0) + goto out_gunlock; + + /* Check out the old directory */ + + error = gfs2_unlink_ok(odip, &odentry->d_name, ip); + if (error) + goto out_gunlock; + + /* Check out the new directory */ + + if (nip) { + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (nip->i_inode.i_nlink == 0) { + error = -EAGAIN; + goto out_gunlock; + } + + if (S_ISDIR(nip->i_inode.i_mode)) { + if (nip->i_entries < 2) { + gfs2_consist_inode(nip); + error = -EIO; + goto out_gunlock; + } + if (nip->i_entries > 2) { + error = -ENOTEMPTY; + goto out_gunlock; + } + } + } else { + error = gfs2_permission(&nop_mnt_idmap, ndir, + MAY_WRITE | MAY_EXEC); + if (error) + goto out_gunlock; + + error = gfs2_dir_check(ndir, &ndentry->d_name, NULL); + switch (error) { + case -ENOENT: + error = 0; + break; + case 0: + error = -EEXIST; + goto out_gunlock; + default: + goto out_gunlock; + } + + if (odip != ndip) { + if (!ndip->i_inode.i_nlink) { + error = -ENOENT; + goto out_gunlock; + } + if (ndip->i_entries == (u32)-1) { + error = -EFBIG; + goto out_gunlock; + } + if (S_ISDIR(ip->i_inode.i_mode) && + ndip->i_inode.i_nlink == (u32)-1) { + error = -EMLINK; + goto out_gunlock; + } + } + } + + /* Check out the dir to be renamed */ + + if (dir_rename) { + error = gfs2_permission(&nop_mnt_idmap, d_inode(odentry), + MAY_WRITE); + if (error) + goto out_gunlock; + } + + if (nip == NULL) { + error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da); + if (error) + goto out_gunlock; + } + + if (da.nr_blocks) { + struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; + error = gfs2_quota_lock_check(ndip, &ap); + if (error) + goto out_gunlock; + + error = gfs2_inplace_reserve(ndip, &ap); + if (error) + goto out_gunlock_q; + + error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) + + 4 * RES_LEAF + 4, 0); + if (error) + goto out_ipreserv; + } else { + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + + 5 * RES_LEAF + 4, 0); + if (error) + goto out_gunlock; + } + + /* Remove the target file, if it exists */ + + if (nip) + error = gfs2_unlink_inode(ndip, ndentry); + + error = update_moved_ino(ip, ndip, dir_rename); + if (error) + goto out_end_trans; + + error = gfs2_dir_del(odip, odentry); + if (error) + goto out_end_trans; + + error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da); + if (error) + goto out_end_trans; + +out_end_trans: + gfs2_trans_end(sdp); +out_ipreserv: + if (da.nr_blocks) + gfs2_inplace_release(ndip); +out_gunlock_q: + if (da.nr_blocks) + gfs2_quota_unlock(ndip); +out_gunlock: + gfs2_dir_no_add(&da); + if (gfs2_holder_initialized(&rd_gh)) + gfs2_glock_dq_uninit(&rd_gh); + + while (x--) { + if (gfs2_holder_queued(ghs + x)) + gfs2_glock_dq(ghs + x); + 
gfs2_holder_uninit(ghs + x); + } +out_gunlock_r: + if (gfs2_holder_initialized(&r_gh)) + gfs2_glock_dq_uninit(&r_gh); +out: + gfs2_qa_put(ndip); + return error; +} + +/** + * gfs2_exchange - exchange two files + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file + * @flags: The rename flags + * + * Returns: errno + */ + +static int gfs2_exchange(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry, + unsigned int flags) +{ + struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *oip = GFS2_I(odentry->d_inode); + struct gfs2_inode *nip = GFS2_I(ndentry->d_inode); + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[4], r_gh; + unsigned int num_gh; + unsigned int x; + umode_t old_mode = oip->i_inode.i_mode; + umode_t new_mode = nip->i_inode.i_mode; + int error; + + gfs2_holder_mark_uninitialized(&r_gh); + error = gfs2_rindex_update(sdp); + if (error) + return error; + + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); + if (error) + goto out; + + if (S_ISDIR(old_mode)) { + /* don't move a directory into its subdir */ + error = gfs2_ok_to_move(oip, ndip); + if (error) + goto out_gunlock_r; + } + + if (S_ISDIR(new_mode)) { + /* don't move a directory into its subdir */ + error = gfs2_ok_to_move(nip, odip); + if (error) + goto out_gunlock_r; + } + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, + ghs + num_gh); + num_gh++; + } + gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh); + num_gh++; + + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh); + num_gh++; + + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } + + error = gfs2_glock_async_wait(num_gh, ghs); + if (error) + goto out_gunlock; + + error = -ENOENT; + if (oip->i_inode.i_nlink == 0 || nip->i_inode.i_nlink == 0) + goto out_gunlock; + + error = gfs2_unlink_ok(odip, &odentry->d_name, oip); + if (error) + goto out_gunlock; + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (S_ISDIR(old_mode)) { + error = gfs2_permission(&nop_mnt_idmap, odentry->d_inode, + MAY_WRITE); + if (error) + goto out_gunlock; + } + if (S_ISDIR(new_mode)) { + error = gfs2_permission(&nop_mnt_idmap, ndentry->d_inode, + MAY_WRITE); + if (error) + goto out_gunlock; + } + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + 4 * RES_LEAF, 0); + if (error) + goto out_gunlock; + + error = update_moved_ino(oip, ndip, S_ISDIR(old_mode)); + if (error) + goto out_end_trans; + + error = update_moved_ino(nip, odip, S_ISDIR(new_mode)); + if (error) + goto out_end_trans; + + error = gfs2_dir_mvino(ndip, &ndentry->d_name, oip, + IF2DT(old_mode)); + if (error) + goto out_end_trans; + + error = gfs2_dir_mvino(odip, &odentry->d_name, nip, + IF2DT(new_mode)); + if (error) + goto out_end_trans; + + if (odip != ndip) { + if (S_ISDIR(new_mode) && !S_ISDIR(old_mode)) { + inc_nlink(&odip->i_inode); + drop_nlink(&ndip->i_inode); + } else if (S_ISDIR(old_mode) && !S_ISDIR(new_mode)) { + inc_nlink(&ndip->i_inode); + drop_nlink(&odip->i_inode); + } + } + mark_inode_dirty(&ndip->i_inode); + if (odip != ndip) + mark_inode_dirty(&odip->i_inode); + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock: + 
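/* Unwind the glock holders in reverse order; gfs2_holder_queued()
+ * tells us which requests were actually granted and still need a
+ * dq before the holder is released. */
+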
while (x--) { + if (gfs2_holder_queued(ghs + x)) + gfs2_glock_dq(ghs + x); + gfs2_holder_uninit(ghs + x); + } +out_gunlock_r: + if (gfs2_holder_initialized(&r_gh)) + gfs2_glock_dq_uninit(&r_gh); +out: + return error; +} + +static int gfs2_rename2(struct mnt_idmap *idmap, struct inode *odir, + struct dentry *odentry, struct inode *ndir, + struct dentry *ndentry, unsigned int flags) +{ + flags &= ~RENAME_NOREPLACE; + + if (flags & ~RENAME_EXCHANGE) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) + return gfs2_exchange(odir, odentry, ndir, ndentry, flags); + + return gfs2_rename(odir, odentry, ndir, ndentry); +} + +/** + * gfs2_get_link - Follow a symbolic link + * @dentry: The dentry of the link + * @inode: The inode of the link + * @done: destructor for return value + * + * This can handle symlinks of any size. + * + * Returns: 0 on success or error code + */ + +static const char *gfs2_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + struct buffer_head *dibh; + unsigned int size; + char *buf; + int error; + + if (!dentry) + return ERR_PTR(-ECHILD); + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + return ERR_PTR(error); + } + + size = (unsigned int)i_size_read(&ip->i_inode); + if (size == 0) { + gfs2_consist_inode(ip); + buf = ERR_PTR(-EIO); + goto out; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) { + buf = ERR_PTR(error); + goto out; + } + + buf = kzalloc(size + 1, GFP_NOFS); + if (!buf) + buf = ERR_PTR(-ENOMEM); + else + memcpy(buf, dibh->b_data + sizeof(struct gfs2_dinode), size); + brelse(dibh); +out: + gfs2_glock_dq_uninit(&i_gh); + if (!IS_ERR(buf)) + set_delayed_call(done, kfree_link, buf); + return buf; +} + +/** + * gfs2_permission + * @idmap: idmap of the mount the inode was found from + * @inode: The inode + * @mask: The mask to be tested + * + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. 
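+ *
+ * Under RCU walk (MAY_NOT_BLOCK set in @mask) no new glock may be
+ * acquired, so -ECHILD is returned and the VFS retries the lookup in
+ * ref-walk mode.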
+ * + * Returns: errno + */ + +int gfs2_permission(struct mnt_idmap *idmap, struct inode *inode, + int mask) +{ + int may_not_block = mask & MAY_NOT_BLOCK; + struct gfs2_inode *ip; + struct gfs2_holder i_gh; + struct gfs2_glock *gl; + int error; + + gfs2_holder_mark_uninitialized(&i_gh); + ip = GFS2_I(inode); + gl = rcu_dereference_check(ip->i_gl, !may_not_block); + if (unlikely(!gl)) { + /* inode is getting torn down, must be RCU mode */ + WARN_ON_ONCE(!may_not_block); + return -ECHILD; + } + if (gfs2_glock_is_locked_by_me(gl) == NULL) { + if (may_not_block) + return -ECHILD; + error = gfs2_glock_nq_init(gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); + if (error) + return error; + } + + if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) + error = -EPERM; + else + error = generic_permission(&nop_mnt_idmap, inode, mask); + if (gfs2_holder_initialized(&i_gh)) + gfs2_glock_dq_uninit(&i_gh); + + return error; +} + +static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) +{ + setattr_copy(&nop_mnt_idmap, inode, attr); + mark_inode_dirty(inode); + return 0; +} + +static int gfs2_setattr_simple(struct inode *inode, struct iattr *attr) +{ + int error; + + if (current->journal_info) + return __gfs2_setattr_simple(inode, attr); + + error = gfs2_trans_begin(GFS2_SB(inode), RES_DINODE, 0); + if (error) + return error; + + error = __gfs2_setattr_simple(inode, attr); + gfs2_trans_end(GFS2_SB(inode)); + return error; +} + +static int setattr_chown(struct inode *inode, struct iattr *attr) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + kuid_t ouid, nuid; + kgid_t ogid, ngid; + int error; + struct gfs2_alloc_parms ap; + + ouid = inode->i_uid; + ogid = inode->i_gid; + nuid = attr->ia_uid; + ngid = attr->ia_gid; + + if (!(attr->ia_valid & ATTR_UID) || uid_eq(ouid, nuid)) + ouid = nuid = NO_UID_QUOTA_CHANGE; + if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) + ogid = ngid = NO_GID_QUOTA_CHANGE; + error = gfs2_qa_get(ip); + if (error) + return error; + + error = gfs2_rindex_update(sdp); + if (error) + goto out; + + error = gfs2_quota_lock(ip, nuid, ngid); + if (error) + goto out; + + ap.target = gfs2_get_inode_blocks(&ip->i_inode); + + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || + !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { + error = gfs2_quota_check(ip, nuid, ngid, &ap); + if (error) + goto out_gunlock_q; + } + + error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_QUOTA, 0); + if (error) + goto out_gunlock_q; + + error = gfs2_setattr_simple(inode, attr); + if (error) + goto out_end_trans; + + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || + !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { + gfs2_quota_change(ip, -(s64)ap.target, ouid, ogid); + gfs2_quota_change(ip, ap.target, nuid, ngid); + } + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock_q: + gfs2_quota_unlock(ip); +out: + gfs2_qa_put(ip); + return error; +} + +/** + * gfs2_setattr - Change attributes on an inode + * @idmap: idmap of the mount the inode was found from + * @dentry: The dentry which is changing + * @attr: The structure describing the change + * + * The VFS layer wants to change one or more of an inodes attributes. Write + * that change out to disk. 
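+ *
+ * ATTR_SIZE changes go through gfs2_setattr_size(), ATTR_UID/ATTR_GID
+ * changes go through setattr_chown() (which also transfers the quota
+ * charge), and everything else is a simple in-place update followed
+ * by posix_acl_chmod() when the mode changed.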
+ * + * Returns: errno + */ + +static int gfs2_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder i_gh; + int error; + + error = gfs2_qa_get(ip); + if (error) + return error; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out; + + error = may_setattr(&nop_mnt_idmap, inode, attr->ia_valid); + if (error) + goto error; + + error = setattr_prepare(&nop_mnt_idmap, dentry, attr); + if (error) + goto error; + + if (attr->ia_valid & ATTR_SIZE) + error = gfs2_setattr_size(inode, attr->ia_size); + else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) + error = setattr_chown(inode, attr); + else { + error = gfs2_setattr_simple(inode, attr); + if (!error && attr->ia_valid & ATTR_MODE) + error = posix_acl_chmod(&nop_mnt_idmap, dentry, + inode->i_mode); + } + +error: + if (!error) + mark_inode_dirty(inode); + gfs2_glock_dq_uninit(&i_gh); +out: + gfs2_qa_put(ip); + return error; +} + +/** + * gfs2_getattr - Read out an inode's attributes + * @idmap: idmap of the mount the inode was found from + * @path: Object to query + * @stat: The inode's stats + * @request_mask: Mask of STATX_xxx flags indicating the caller's interests + * @flags: AT_STATX_xxx setting + * + * This may be called from the VFS directly, or from within GFS2 with the + * inode locked, so we look to see if the glock is already locked and only + * lock the glock if its not already been done. Note that its the NFS + * readdirplus operation which causes this to be called (from filldir) + * with the glock already held. + * + * Returns: errno + */ + +static int gfs2_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) +{ + struct inode *inode = d_inode(path->dentry); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + u32 gfsflags; + int error; + + gfs2_holder_mark_uninitialized(&gh); + if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) { + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + if (error) + return error; + } + + gfsflags = ip->i_diskflags; + if (gfsflags & GFS2_DIF_APPENDONLY) + stat->attributes |= STATX_ATTR_APPEND; + if (gfsflags & GFS2_DIF_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_COMPRESSED | + STATX_ATTR_ENCRYPTED | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + + if (gfs2_holder_initialized(&gh)) + gfs2_glock_dq_uninit(&gh); + + return 0; +} + +static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + inode_lock_shared(inode); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + goto out; + + ret = iomap_fiemap(inode, fieinfo, start, len, &gfs2_iomap_ops); + + gfs2_glock_dq_uninit(&gh); + +out: + inode_unlock_shared(inode); + return ret; +} + +loff_t gfs2_seek_data(struct file *file, loff_t offset) +{ + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + loff_t ret; + + inode_lock_shared(inode); + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (!ret) + ret = iomap_seek_data(inode, offset, &gfs2_iomap_ops); + gfs2_glock_dq_uninit(&gh); + inode_unlock_shared(inode); + + if (ret < 0) + return ret; 
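+ /* iomap found the next data offset; vfs_setpos() validates it
+  * against s_maxbytes and commits it as the new file position. */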
+ return vfs_setpos(file, ret, inode->i_sb->s_maxbytes); +} + +loff_t gfs2_seek_hole(struct file *file, loff_t offset) +{ + struct inode *inode = file->f_mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + loff_t ret; + + inode_lock_shared(inode); + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (!ret) + ret = iomap_seek_hole(inode, offset, &gfs2_iomap_ops); + gfs2_glock_dq_uninit(&gh); + inode_unlock_shared(inode); + + if (ret < 0) + return ret; + return vfs_setpos(file, ret, inode->i_sb->s_maxbytes); +} + +static int gfs2_update_time(struct inode *inode, int flags) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *gl = ip->i_gl; + struct gfs2_holder *gh; + int error; + + gh = gfs2_glock_is_locked_by_me(gl); + if (gh && !gfs2_glock_is_held_excl(gl)) { + gfs2_glock_dq(gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, 0, gh); + error = gfs2_glock_nq(gh); + if (error) + return error; + } + generic_update_time(inode, flags); + return 0; +} + +static const struct inode_operations gfs2_file_iops = { + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .listxattr = gfs2_listxattr, + .fiemap = gfs2_fiemap, + .get_inode_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, + .update_time = gfs2_update_time, + .fileattr_get = gfs2_fileattr_get, + .fileattr_set = gfs2_fileattr_set, +}; + +static const struct inode_operations gfs2_dir_iops = { + .create = gfs2_create, + .lookup = gfs2_lookup, + .link = gfs2_link, + .unlink = gfs2_unlink, + .symlink = gfs2_symlink, + .mkdir = gfs2_mkdir, + .rmdir = gfs2_unlink, + .mknod = gfs2_mknod, + .rename = gfs2_rename2, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .listxattr = gfs2_listxattr, + .fiemap = gfs2_fiemap, + .get_inode_acl = gfs2_get_acl, + .set_acl = gfs2_set_acl, + .update_time = gfs2_update_time, + .atomic_open = gfs2_atomic_open, + .fileattr_get = gfs2_fileattr_get, + .fileattr_set = gfs2_fileattr_set, +}; + +static const struct inode_operations gfs2_symlink_iops = { + .get_link = gfs2_get_link, + .permission = gfs2_permission, + .setattr = gfs2_setattr, + .getattr = gfs2_getattr, + .listxattr = gfs2_listxattr, + .fiemap = gfs2_fiemap, +}; + diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h new file mode 100644 index 0000000000..c8c5814e72 --- /dev/null +++ b/fs/gfs2/inode.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#ifndef __INODE_DOT_H__ +#define __INODE_DOT_H__ + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/mm.h> +#include "util.h" + +bool gfs2_release_folio(struct folio *folio, gfp_t gfp_mask); +extern int gfs2_internal_read(struct gfs2_inode *ip, + char *buf, loff_t *pos, unsigned size); +extern void gfs2_set_aops(struct inode *inode); + +static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) +{ + return !ip->i_height; +} + +static inline int gfs2_is_jdata(const struct gfs2_inode *ip) +{ + return ip->i_diskflags & GFS2_DIF_JDATA; +} + +static inline bool gfs2_is_ordered(const struct gfs2_sbd *sdp) +{ + return sdp->sd_args.ar_data == GFS2_DATA_ORDERED; +} + +static inline bool gfs2_is_writeback(const struct gfs2_sbd *sdp) +{ + return sdp->sd_args.ar_data == GFS2_DATA_WRITEBACK; +} + +static inline int gfs2_is_dir(const struct gfs2_inode *ip) +{ + return S_ISDIR(ip->i_inode.i_mode); +} + +static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks) +{ + inode->i_blocks = blocks << + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); +} + +static inline u64 gfs2_get_inode_blocks(const struct inode *inode) +{ + return inode->i_blocks >> + (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); +} + +static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change) +{ + change <<= inode->i_blkbits - GFS2_BASIC_BLOCK_SHIFT; + gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks >= -change)); + inode->i_blocks += change; +} + +static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, + u64 no_formal_ino) +{ + return ip->i_no_addr == no_addr && ip->i_no_formal_ino == no_formal_ino; +} + +static inline void gfs2_inum_out(const struct gfs2_inode *ip, + struct gfs2_dirent *dent) +{ + dent->de_inum.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); +} + +static inline int gfs2_check_internal_file_size(struct inode *inode, + u64 minsize, u64 maxsize) +{ + u64 size = i_size_read(inode); + if (size < minsize || size > maxsize) + goto err; + if (size & (BIT(inode->i_blkbits) - 1)) + goto err; + return 0; +err: + gfs2_consist_inode(GFS2_I(inode)); + return -EIO; +} + +extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, + u64 no_addr, u64 no_formal_ino, + unsigned int blktype); +extern struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, + u64 no_formal_ino, + unsigned int blktype); + +extern int gfs2_inode_refresh(struct gfs2_inode *ip); + +extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root); +extern int gfs2_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask); +extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); +extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +extern int gfs2_open_common(struct inode *inode, struct file *file); +extern loff_t gfs2_seek_data(struct file *file, loff_t offset); +extern loff_t gfs2_seek_hole(struct file *file, loff_t offset); + +extern const struct file_operations gfs2_file_fops_nolock; +extern const struct file_operations gfs2_dir_fops_nolock; + +extern int gfs2_fileattr_get(struct dentry *dentry, struct fileattr *fa); +extern int gfs2_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, struct fileattr *fa); +extern void gfs2_set_inode_flags(struct inode *inode); + +#ifdef CONFIG_GFS2_FS_LOCKING_DLM +extern const struct file_operations gfs2_file_fops; +extern 
const struct file_operations gfs2_dir_fops; + +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return sdp->sd_args.ar_localflocks; +} +#else /* Single node only */ +#define gfs2_file_fops gfs2_file_fops_nolock +#define gfs2_dir_fops gfs2_dir_fops_nolock + +static inline int gfs2_localflocks(const struct gfs2_sbd *sdp) +{ + return 1; +} +#endif /* CONFIG_GFS2_FS_LOCKING_DLM */ + +#endif /* __INODE_DOT_H__ */ + diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c new file mode 100644 index 0000000000..59ab18c798 --- /dev/null +++ b/fs/gfs2/lock_dlm.c @@ -0,0 +1,1409 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright 2004-2011 Red Hat, Inc. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/fs.h> +#include <linux/dlm.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/delay.h> +#include <linux/gfs2_ondisk.h> +#include <linux/sched/signal.h> + +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "recovery.h" +#include "util.h" +#include "sys.h" +#include "trace_gfs2.h" + +/** + * gfs2_update_stats - Update time based stats + * @s: The stats to update (local or global) + * @index: The index inside @s + * @sample: New data to include + */ +static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, + s64 sample) +{ + /* + * @delta is the difference between the current rtt sample and the + * running average srtt. We add 1/8 of that to the srtt in order to + * update the current srtt estimate. The variance estimate is a bit + * more complicated. We subtract the current variance estimate from + * the abs value of the @delta and add 1/4 of that to the running + * total. That's equivalent to 3/4 of the current variance + * estimate plus 1/4 of the abs of @delta. + * + * Note that the index points at the array entry containing the + * smoothed mean value, and the variance is always in the following + * entry + * + * Reference: TCP/IP Illustrated, vol 2, p. 831,832 + * All times are in units of integer nanoseconds. Unlike the TCP/IP + * case, they are not scaled fixed point. + */ + + s64 delta = sample - s->stats[index]; + s->stats[index] += (delta >> 3); + index++; + s->stats[index] += (s64)(abs(delta) - s->stats[index]) >> 2; +} + +/** + * gfs2_update_reply_times - Update locking statistics + * @gl: The glock to update + * + * This assumes that gl->gl_dstamp has been set earlier. + * + * The rtt (lock round trip time) is an estimate of the time + * taken to perform a dlm lock request. We update it on each + * reply from the dlm. + * + * The blocking flag is set on the glock for all dlm requests + * which may potentially block due to lock requests from other nodes. + * DLM requests where the current lock state is exclusive, the + * requested state is null (or unlocked) or where the TRY or + * TRY_1CB flags are set are classified as non-blocking. All + * other DLM requests are counted as (potentially) blocking. + */ +static inline void gfs2_update_reply_times(struct gfs2_glock *gl) +{ + struct gfs2_pcpu_lkstats *lks; + const unsigned gltype = gl->gl_name.ln_type; + unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ? 
+ GFS2_LKS_SRTTB : GFS2_LKS_SRTT; + s64 rtt; + + preempt_disable(); + rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp)); + lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats); + gfs2_update_stats(&gl->gl_stats, index, rtt); /* Local */ + gfs2_update_stats(&lks->lkstats[gltype], index, rtt); /* Global */ + preempt_enable(); + + trace_gfs2_glock_lock_time(gl, rtt); +} + +/** + * gfs2_update_request_times - Update locking statistics + * @gl: The glock to update + * + * The irt (lock inter-request times) measures the average time + * between requests to the dlm. It is updated immediately before + * each dlm call. + */ + +static inline void gfs2_update_request_times(struct gfs2_glock *gl) +{ + struct gfs2_pcpu_lkstats *lks; + const unsigned gltype = gl->gl_name.ln_type; + ktime_t dstamp; + s64 irt; + + preempt_disable(); + dstamp = gl->gl_dstamp; + gl->gl_dstamp = ktime_get_real(); + irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp)); + lks = this_cpu_ptr(gl->gl_name.ln_sbd->sd_lkstats); + gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt); /* Local */ + gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt); /* Global */ + preempt_enable(); +} + +static void gdlm_ast(void *arg) +{ + struct gfs2_glock *gl = arg; + unsigned ret = gl->gl_state; + + gfs2_update_reply_times(gl); + BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); + + if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr) + memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); + + switch (gl->gl_lksb.sb_status) { + case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ + if (gl->gl_ops->go_free) + gl->gl_ops->go_free(gl); + gfs2_glock_free(gl); + return; + case -DLM_ECANCEL: /* Cancel while getting lock */ + ret |= LM_OUT_CANCELED; + goto out; + case -EAGAIN: /* Try lock fails */ + case -EDEADLK: /* Deadlock detected */ + goto out; + case -ETIMEDOUT: /* Canceled due to timeout */ + ret |= LM_OUT_ERROR; + goto out; + case 0: /* Success */ + break; + default: /* Something unexpected */ + BUG(); + } + + ret = gl->gl_req; + if (gl->gl_lksb.sb_flags & DLM_SBF_ALTMODE) { + if (gl->gl_req == LM_ST_SHARED) + ret = LM_ST_DEFERRED; + else if (gl->gl_req == LM_ST_DEFERRED) + ret = LM_ST_SHARED; + else + BUG(); + } + + set_bit(GLF_INITIAL, &gl->gl_flags); + gfs2_glock_complete(gl, ret); + return; +out: + if (!test_bit(GLF_INITIAL, &gl->gl_flags)) + gl->gl_lksb.sb_lkid = 0; + gfs2_glock_complete(gl, ret); +} + +static void gdlm_bast(void *arg, int mode) +{ + struct gfs2_glock *gl = arg; + + switch (mode) { + case DLM_LOCK_EX: + gfs2_glock_cb(gl, LM_ST_UNLOCKED); + break; + case DLM_LOCK_CW: + gfs2_glock_cb(gl, LM_ST_DEFERRED); + break; + case DLM_LOCK_PR: + gfs2_glock_cb(gl, LM_ST_SHARED); + break; + default: + fs_err(gl->gl_name.ln_sbd, "unknown bast mode %d\n", mode); + BUG(); + } +} + +/* convert gfs lock-state to dlm lock-mode */ + +static int make_mode(struct gfs2_sbd *sdp, const unsigned int lmstate) +{ + switch (lmstate) { + case LM_ST_UNLOCKED: + return DLM_LOCK_NL; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_SHARED: + return DLM_LOCK_PR; + } + fs_err(sdp, "unknown LM state %d\n", lmstate); + BUG(); + return -1; +} + +static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, + const int req) +{ + u32 lkf = 0; + + if (gl->gl_lksb.sb_lvbptr) + lkf |= DLM_LKF_VALBLK; + + if (gfs_flags & LM_FLAG_TRY) + lkf |= DLM_LKF_NOQUEUE; + + if (gfs_flags & LM_FLAG_TRY_1CB) { + lkf |= DLM_LKF_NOQUEUE; + lkf |= DLM_LKF_NOQUEUEBAST; + } + + if 
(gfs_flags & LM_FLAG_ANY) { + if (req == DLM_LOCK_PR) + lkf |= DLM_LKF_ALTCW; + else if (req == DLM_LOCK_CW) + lkf |= DLM_LKF_ALTPR; + else + BUG(); + } + + if (gl->gl_lksb.sb_lkid != 0) { + lkf |= DLM_LKF_CONVERT; + if (test_bit(GLF_BLOCKING, &gl->gl_flags)) + lkf |= DLM_LKF_QUECVT; + } + + return lkf; +} + +static void gfs2_reverse_hex(char *c, u64 value) +{ + *c = '0'; + while (value) { + *c-- = hex_asc[value & 0x0f]; + value >>= 4; + } +} + +static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, + unsigned int flags) +{ + struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; + int req; + u32 lkf; + char strname[GDLM_STRNAME_BYTES] = ""; + int error; + + req = make_mode(gl->gl_name.ln_sbd, req_state); + lkf = make_flags(gl, flags, req); + gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); + if (gl->gl_lksb.sb_lkid) { + gfs2_update_request_times(gl); + } else { + memset(strname, ' ', GDLM_STRNAME_BYTES - 1); + strname[GDLM_STRNAME_BYTES - 1] = '\0'; + gfs2_reverse_hex(strname + 7, gl->gl_name.ln_type); + gfs2_reverse_hex(strname + 23, gl->gl_name.ln_number); + gl->gl_dstamp = ktime_get_real(); + } + /* + * Submit the actual lock request. + */ + +again: + error = dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, + GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); + if (error == -EBUSY) { + msleep(20); + goto again; + } + return error; +} + +static void gdlm_put_lock(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int error; + + if (gl->gl_lksb.sb_lkid == 0) + goto out_free; + + clear_bit(GLF_BLOCKING, &gl->gl_flags); + gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_update_request_times(gl); + + /* don't want to call dlm if we've unmounted the lock protocol */ + if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + goto out_free; + /* don't want to skip dlm_unlock writing the lvb when lock has one */ + + if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && + !gl->gl_lksb.sb_lvbptr) + goto out_free; + +again: + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, + NULL, gl); + if (error == -EBUSY) { + msleep(20); + goto again; + } + + if (error) { + fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, error); + } + return; + +out_free: + gfs2_glock_free(gl); +} + +static void gdlm_cancel(struct gfs2_glock *gl) +{ + struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; + dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_CANCEL, NULL, gl); +} + +/* + * dlm/gfs2 recovery coordination using dlm_recover callbacks + * + * 0. gfs2 checks for another cluster node withdraw, needing journal replay + * 1. dlm_controld sees lockspace members change + * 2. dlm_controld blocks dlm-kernel locking activity + * 3. dlm_controld within dlm-kernel notifies gfs2 (recover_prep) + * 4. dlm_controld starts and finishes its own user level recovery + * 5. dlm_controld starts dlm-kernel dlm_recoverd to do kernel recovery + * 6. dlm_recoverd notifies gfs2 of failed nodes (recover_slot) + * 7. dlm_recoverd does its own lock recovery + * 8. dlm_recoverd unblocks dlm-kernel locking activity + * 9. dlm_recoverd notifies gfs2 when done (recover_done with new generation) + * 10. gfs2_control updates control_lock lvb with new generation and jid bits + * 11. gfs2_control enqueues journals for gfs2_recover to recover (maybe none) + * 12. 
gfs2_recover dequeues and recovers journals of failed nodes + * 13. gfs2_recover provides recovery results to gfs2_control (recovery_result) + * 14. gfs2_control updates control_lock lvb jid bits for recovered journals + * 15. gfs2_control unblocks normal locking when all journals are recovered + * + * - failures during recovery + * + * recover_prep() may set BLOCK_LOCKS (step 3) again before gfs2_control + * clears BLOCK_LOCKS (step 15), e.g. another node fails while still + * recovering for a prior failure. gfs2_control needs a way to detect + * this so it can leave BLOCK_LOCKS set in step 15. This is managed using + * the recover_block and recover_start values. + * + * recover_done() provides a new lockspace generation number each time it + * is called (step 9). This generation number is saved as recover_start. + * When recover_prep() is called, it sets BLOCK_LOCKS and sets + * recover_block = recover_start. So, while recover_block is equal to + * recover_start, BLOCK_LOCKS should remain set. (recover_spin must + * be held around the BLOCK_LOCKS/recover_block/recover_start logic.) + * + * - more specific gfs2 steps in sequence above + * + * 3. recover_prep sets BLOCK_LOCKS and sets recover_block = recover_start + * 6. recover_slot records any failed jids (maybe none) + * 9. recover_done sets recover_start = new generation number + * 10. gfs2_control sets control_lock lvb = new gen + bits for failed jids + * 12. gfs2_recover does journal recoveries for failed jids identified above + * 14. gfs2_control clears control_lock lvb bits for recovered jids + * 15. gfs2_control checks if recover_block == recover_start (step 3 occured + * again) then do nothing, otherwise if recover_start > recover_block + * then clear BLOCK_LOCKS. + * + * - parallel recovery steps across all nodes + * + * All nodes attempt to update the control_lock lvb with the new generation + * number and jid bits, but only the first to get the control_lock EX will + * do so; others will see that it's already done (lvb already contains new + * generation number.) + * + * . All nodes get the same recover_prep/recover_slot/recover_done callbacks + * . All nodes attempt to set control_lock lvb gen + bits for the new gen + * . One node gets control_lock first and writes the lvb, others see it's done + * . All nodes attempt to recover jids for which they see control_lock bits set + * . One node succeeds for a jid, and that one clears the jid bit in the lvb + * . All nodes will eventually see all lvb bits clear and unblock locks + * + * - is there a problem with clearing an lvb bit that should be set + * and missing a journal recovery? + * + * 1. jid fails + * 2. lvb bit set for step 1 + * 3. jid recovered for step 1 + * 4. jid taken again (new mount) + * 5. jid fails (for step 4) + * 6. lvb bit set for step 5 (will already be set) + * 7. lvb bit cleared for step 3 + * + * This is not a problem because the failure in step 5 does not + * require recovery, because the mount in step 4 could not have + * progressed far enough to unblock locks and access the fs. The + * control_mount() function waits for all recoveries to be complete + * for the latest lockspace generation before ever unblocking locks + * and returning. The mount in step 4 waits until the recovery in + * step 1 is done. + * + * - special case of first mounter: first node to mount the fs + * + * The first node to mount a gfs2 fs needs to check all the journals + * and recover any that need recovery before other nodes are allowed + * to mount the fs. 
(Others may begin mounting, but they must wait + * for the first mounter to be done before taking locks on the fs + * or accessing the fs.) This has two parts: + * + * 1. The mounted_lock tells a node it's the first to mount the fs. + * Each node holds the mounted_lock in PR while it's mounted. + * Each node tries to acquire the mounted_lock in EX when it mounts. + * If a node is granted the mounted_lock EX it means there are no + * other mounted nodes (no PR locks exist), and it is the first mounter. + * The mounted_lock is demoted to PR when first recovery is done, so + * others will fail to get an EX lock, but will get a PR lock. + * + * 2. The control_lock blocks others in control_mount() while the first + * mounter is doing first mount recovery of all journals. + * A mounting node needs to acquire control_lock in EX mode before + * it can proceed. The first mounter holds control_lock in EX while doing + * the first mount recovery, blocking mounts from other nodes, then demotes + * control_lock to NL when it's done (others_may_mount/first_done), + * allowing other nodes to continue mounting. + * + * first mounter: + * control_lock EX/NOQUEUE success + * mounted_lock EX/NOQUEUE success (no other PR, so no other mounters) + * set first=1 + * do first mounter recovery + * mounted_lock EX->PR + * control_lock EX->NL, write lvb generation + * + * other mounter: + * control_lock EX/NOQUEUE success (if fail -EAGAIN, retry) + * mounted_lock EX/NOQUEUE fail -EAGAIN (expected due to other mounters PR) + * mounted_lock PR/NOQUEUE success + * read lvb generation + * control_lock EX->NL + * set first=0 + * + * - mount during recovery + * + * If a node mounts while others are doing recovery (not first mounter), + * the mounting node will get its initial recover_done() callback without + * having seen any previous failures/callbacks. + * + * It must wait for all recoveries preceding its mount to be finished + * before it unblocks locks. It does this by repeating the "other mounter" + * steps above until the lvb generation number is >= its mount generation + * number (from initial recover_done) and all lvb bits are clear. + * + * - control_lock lvb format + * + * 4 bytes generation number: the latest dlm lockspace generation number + * from recover_done callback. Indicates the jid bitmap has been updated + * to reflect all slot failures through that generation. + * 4 bytes unused. + * GDLM_LVB_SIZE-8 bytes of jid bit map. If bit N is set, it indicates + * that jid N needs recovery. 
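+ *
+ * As a purely illustrative example: generation 42 with jids 0 and 3
+ * needing recovery would be encoded as
+ *
+ *   bytes 0-3: 2a 00 00 00  (le32 generation number)
+ *   bytes 4-7: 00 00 00 00  (unused)
+ *   byte 8:    09           (bits 0 and 3 of the little-endian bitmap)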
+ */
+
+#define JID_BITMAP_OFFSET 8 /* 4 byte generation number + 4 byte unused */
+
+static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
+ char *lvb_bits)
+{
+ __le32 gen;
+ memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
+ memcpy(&gen, lvb_bits, sizeof(__le32));
+ *lvb_gen = le32_to_cpu(gen);
+}
+
+static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
+ char *lvb_bits)
+{
+ __le32 gen;
+ memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
+ gen = cpu_to_le32(lvb_gen);
+ memcpy(ls->ls_control_lvb, &gen, sizeof(__le32));
+}
+
+static int all_jid_bits_clear(char *lvb)
+{
+ return !memchr_inv(lvb + JID_BITMAP_OFFSET, 0,
+ GDLM_LVB_SIZE - JID_BITMAP_OFFSET);
+}
+
+static void sync_wait_cb(void *arg)
+{
+ struct lm_lockstruct *ls = arg;
+ complete(&ls->ls_sync_wait);
+}
+
+static int sync_unlock(struct gfs2_sbd *sdp, struct dlm_lksb *lksb, char *name)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int error;
+
+ error = dlm_unlock(ls->ls_dlm, lksb->sb_lkid, 0, lksb, ls);
+ if (error) {
+ fs_err(sdp, "%s lkid %x error %d\n",
+ name, lksb->sb_lkid, error);
+ return error;
+ }
+
+ wait_for_completion(&ls->ls_sync_wait);
+
+ if (lksb->sb_status != -DLM_EUNLOCK) {
+ fs_err(sdp, "%s lkid %x status %d\n",
+ name, lksb->sb_lkid, lksb->sb_status);
+ return -1;
+ }
+ return 0;
+}
+
+static int sync_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags,
+ unsigned int num, struct dlm_lksb *lksb, char *name)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ char strname[GDLM_STRNAME_BYTES];
+ int error, status;
+
+ memset(strname, 0, GDLM_STRNAME_BYTES);
+ snprintf(strname, GDLM_STRNAME_BYTES, "%8x%16x", LM_TYPE_NONDISK, num);
+
+ error = dlm_lock(ls->ls_dlm, mode, lksb, flags,
+ strname, GDLM_STRNAME_BYTES - 1,
+ 0, sync_wait_cb, ls, NULL);
+ if (error) {
+ fs_err(sdp, "%s lkid %x flags %x mode %d error %d\n",
+ name, lksb->sb_lkid, flags, mode, error);
+ return error;
+ }
+
+ wait_for_completion(&ls->ls_sync_wait);
+
+ status = lksb->sb_status;
+
+ if (status && status != -EAGAIN) {
+ fs_err(sdp, "%s lkid %x flags %x mode %d status %d\n",
+ name, lksb->sb_lkid, flags, mode, status);
+ }
+
+ return status;
+}
+
+static int mounted_unlock(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_unlock(sdp, &ls->ls_mounted_lksb, "mounted_lock");
+}
+
+static int mounted_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_lock(sdp, mode, flags, GFS2_MOUNTED_LOCK,
+ &ls->ls_mounted_lksb, "mounted_lock");
+}
+
+static int control_unlock(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_unlock(sdp, &ls->ls_control_lksb, "control_lock");
+}
+
+static int control_lock(struct gfs2_sbd *sdp, int mode, uint32_t flags)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ return sync_lock(sdp, mode, flags, GFS2_CONTROL_LOCK,
+ &ls->ls_control_lksb, "control_lock");
+}
+
+/**
+ * remote_withdraw - react to a node withdrawing from the file system
+ * @sdp: The superblock
+ */
+static void remote_withdraw(struct gfs2_sbd *sdp)
+{
+ struct gfs2_jdesc *jd;
+ int ret = 0, count = 0;
+
+ list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+ if (jd->jd_jid == sdp->sd_lockstruct.ls_jid)
+ continue;
+ ret = gfs2_recover_journal(jd, true);
+ if (ret)
+ break;
+ count++;
+ }
+
+ /* Report how many journals were checked and the final status */
+ fs_err(sdp, "Journals checked: %d, ret = %d.\n", count, ret);
+}
+
+static void
gfs2_control_func(struct work_struct *work) +{ + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_control_work.work); + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t block_gen, start_gen, lvb_gen, flags; + int recover_set = 0; + int write_lvb = 0; + int recover_size; + int i, error; + + /* First check for other nodes that may have done a withdraw. */ + if (test_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags)) { + remote_withdraw(sdp); + clear_bit(SDF_REMOTE_WITHDRAW, &sdp->sd_flags); + return; + } + + spin_lock(&ls->ls_recover_spin); + /* + * No MOUNT_DONE means we're still mounting; control_mount() + * will set this flag, after which this thread will take over + * all further clearing of BLOCK_LOCKS. + * + * FIRST_MOUNT means this node is doing first mounter recovery, + * for which recovery control is handled by + * control_mount()/control_first_done(), not this thread. + */ + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + spin_unlock(&ls->ls_recover_spin); + + /* + * Equal block_gen and start_gen implies we are between + * recover_prep and recover_done callbacks, which means + * dlm recovery is in progress and dlm locking is blocked. + * There's no point trying to do any work until recover_done. + */ + + if (block_gen == start_gen) + return; + + /* + * Propagate recover_submit[] and recover_result[] to lvb: + * dlm_recoverd adds to recover_submit[] jids needing recovery + * gfs2_recover adds to recover_result[] journal recovery results + * + * set lvb bit for jids in recover_submit[] if the lvb has not + * yet been updated for the generation of the failure + * + * clear lvb bit for jids in recover_result[] if the result of + * the journal recovery is SUCCESS + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + if (error) { + fs_err(sdp, "control lock EX error %d\n", error); + return; + } + + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); + + spin_lock(&ls->ls_recover_spin); + if (block_gen != ls->ls_recover_block || + start_gen != ls->ls_recover_start) { + fs_info(sdp, "recover generation %u block1 %u %u\n", + start_gen, block_gen, ls->ls_recover_block); + spin_unlock(&ls->ls_recover_spin); + control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + return; + } + + recover_size = ls->ls_recover_size; + + if (lvb_gen <= start_gen) { + /* + * Clear lvb bits for jids we've successfully recovered. + * Because all nodes attempt to recover failed journals, + * a journal can be recovered multiple times successfully + * in succession. Only the first will really do recovery, + * the others find it clean, but still report a successful + * recovery. So, another node may have already recovered + * the jid and cleared the lvb bit for it. + */ + for (i = 0; i < recover_size; i++) { + if (ls->ls_recover_result[i] != LM_RD_SUCCESS) + continue; + + ls->ls_recover_result[i] = 0; + + if (!test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) + continue; + + __clear_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET); + write_lvb = 1; + } + } + + if (lvb_gen == start_gen) { + /* + * Failed slots before start_gen are already set in lvb. 
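+ *
+ * As a concrete (illustrative) example: with lvb_gen == start_gen == 7,
+ * a slot failure recorded as ls_recover_submit[jid] == 5 was already
+ * reflected in the lvb when generation 7 was written, so the loop
+ * below only needs to drop the stale recover_submit entry.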
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (!ls->ls_recover_submit[i])
+ continue;
+ if (ls->ls_recover_submit[i] < lvb_gen)
+ ls->ls_recover_submit[i] = 0;
+ }
+ } else if (lvb_gen < start_gen) {
+ /*
+ * Failed slots before start_gen are not yet set in lvb.
+ */
+ for (i = 0; i < recover_size; i++) {
+ if (!ls->ls_recover_submit[i])
+ continue;
+ if (ls->ls_recover_submit[i] < start_gen) {
+ ls->ls_recover_submit[i] = 0;
+ __set_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET);
+ }
+ }
+ /* even if there are no bits to set, we need to write the
+ latest generation to the lvb */
+ write_lvb = 1;
+ } else {
+ /*
+ * we should be getting a recover_done() for lvb_gen soon
+ */
+ }
+ spin_unlock(&ls->ls_recover_spin);
+
+ if (write_lvb) {
+ control_lvb_write(ls, start_gen, ls->ls_lvb_bits);
+ flags = DLM_LKF_CONVERT | DLM_LKF_VALBLK;
+ } else {
+ flags = DLM_LKF_CONVERT;
+ }
+
+ error = control_lock(sdp, DLM_LOCK_NL, flags);
+ if (error) {
+ fs_err(sdp, "control lock NL error %d\n", error);
+ return;
+ }
+
+ /*
+ * Everyone will see jid bits set in the lvb, run gfs2_recover_set(),
+ * and clear a jid bit in the lvb if the recovery is a success.
+ * Eventually all journals will be recovered, all jid bits will
+ * be cleared in the lvb, and everyone will clear BLOCK_LOCKS.
+ */
+
+ for (i = 0; i < recover_size; i++) {
+ if (test_bit_le(i, ls->ls_lvb_bits + JID_BITMAP_OFFSET)) {
+ fs_info(sdp, "recover generation %u jid %d\n",
+ start_gen, i);
+ gfs2_recover_set(sdp, i);
+ recover_set++;
+ }
+ }
+ if (recover_set)
+ return;
+
+ /*
+ * No more jid bits set in lvb, all recovery is done, unblock locks
+ * (unless a new recover_prep callback has occurred blocking locks
+ * again while working above)
+ */
+
+ spin_lock(&ls->ls_recover_spin);
+ if (ls->ls_recover_block == block_gen &&
+ ls->ls_recover_start == start_gen) {
+ clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ fs_info(sdp, "recover generation %u done\n", start_gen);
+ gfs2_glock_thaw(sdp);
+ } else {
+ fs_info(sdp, "recover generation %u block2 %u %u\n",
+ start_gen, block_gen, ls->ls_recover_block);
+ spin_unlock(&ls->ls_recover_spin);
+ }
+}
+
+static int control_mount(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ uint32_t start_gen, block_gen, mount_gen, lvb_gen;
+ int mounted_mode;
+ int retries = 0;
+ int error;
+
+ memset(&ls->ls_mounted_lksb, 0, sizeof(struct dlm_lksb));
+ memset(&ls->ls_control_lksb, 0, sizeof(struct dlm_lksb));
+ memset(&ls->ls_control_lvb, 0, GDLM_LVB_SIZE);
+ ls->ls_control_lksb.sb_lvbptr = ls->ls_control_lvb;
+ init_completion(&ls->ls_sync_wait);
+
+ set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
+
+ error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_VALBLK);
+ if (error) {
+ fs_err(sdp, "control_mount control_lock NL error %d\n", error);
+ return error;
+ }
+
+ error = mounted_lock(sdp, DLM_LOCK_NL, 0);
+ if (error) {
+ fs_err(sdp, "control_mount mounted_lock NL error %d\n", error);
+ control_unlock(sdp);
+ return error;
+ }
+ mounted_mode = DLM_LOCK_NL;
+
+restart:
+ if (retries++ && signal_pending(current)) {
+ error = -EINTR;
+ goto fail;
+ }
+
+ /*
+ * We always start with both locks in NL. control_lock is
+ * demoted to NL below so we don't need to do it here.
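+ *
+ * (A rough summary of the NOQUEUE outcomes below: control_lock EX
+ * failing with -EAGAIN means another mounter or recovery is active,
+ * so retry; mounted_lock EX granted means no other node holds it in
+ * PR and we are the first mounter; mounted_lock EX -EAGAIN followed
+ * by a successful PR request means other nodes are already mounted.)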
+ */ + + if (mounted_mode != DLM_LOCK_NL) { + error = mounted_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + mounted_mode = DLM_LOCK_NL; + } + + /* + * Other nodes need to do some work in dlm recovery and gfs2_control + * before the recover_done and control_lock will be ready for us below. + * A delay here is not required but often avoids having to retry. + */ + + msleep_interruptible(500); + + /* + * Acquire control_lock in EX and mounted_lock in either EX or PR. + * control_lock lvb keeps track of any pending journal recoveries. + * mounted_lock indicates if any other nodes have the fs mounted. + */ + + error = control_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE|DLM_LKF_VALBLK); + if (error == -EAGAIN) { + goto restart; + } else if (error) { + fs_err(sdp, "control_mount control_lock EX error %d\n", error); + goto fail; + } + + /** + * If we're a spectator, we don't want to take the lock in EX because + * we cannot do the first-mount responsibility it implies: recovery. + */ + if (sdp->sd_args.ar_spectator) + goto locks_done; + + error = mounted_lock(sdp, DLM_LOCK_EX, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_EX; + goto locks_done; + } else if (error != -EAGAIN) { + fs_err(sdp, "control_mount mounted_lock EX error %d\n", error); + goto fail; + } + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT|DLM_LKF_NOQUEUE); + if (!error) { + mounted_mode = DLM_LOCK_PR; + goto locks_done; + } else { + /* not even -EAGAIN should happen here */ + fs_err(sdp, "control_mount mounted_lock PR error %d\n", error); + goto fail; + } + +locks_done: + /* + * If we got both locks above in EX, then we're the first mounter. + * If not, then we need to wait for the control_lock lvb to be + * updated by other mounted nodes to reflect our mount generation. + * + * In simple first mounter cases, first mounter will see zero lvb_gen, + * but in cases where all existing nodes leave/fail before mounting + * nodes finish control_mount, then all nodes will be mounting and + * lvb_gen will be non-zero. + */ + + control_lvb_read(ls, &lvb_gen, ls->ls_lvb_bits); + + if (lvb_gen == 0xFFFFFFFF) { + /* special value to force mount attempts to fail */ + fs_err(sdp, "control_mount control_lock disabled\n"); + error = -EINVAL; + goto fail; + } + + if (mounted_mode == DLM_LOCK_EX) { + /* first mounter, keep both EX while doing first recovery */ + spin_lock(&ls->ls_recover_spin); + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "first mounter control generation %u\n", lvb_gen); + return 0; + } + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT); + if (error) + goto fail; + + /* + * We are not first mounter, now we need to wait for the control_lock + * lvb generation to be >= the generation from our first recover_done + * and all lvb bits to be clear (no pending journal recoveries.) 
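+ *
+ * Put differently, this is a sketch of the condition the retry loop
+ * below is waiting to satisfy:
+ *
+ *	all_jid_bits_clear(ls->ls_lvb_bits) &&
+ *	lvb_gen >= mount_gen &&		/* our mount has been seen */
+ *	lvb_gen == start_gen &&		/* lvb reflects latest generation */
+ *	block_gen != start_gen		/* no dlm recovery in progress */
+ *
+ * where mount_gen is ls->ls_recover_mount from our first recover_done().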
+ */ + + if (!all_jid_bits_clear(ls->ls_lvb_bits)) { + /* journals need recovery, wait until all are clear */ + fs_info(sdp, "control_mount wait for journal recovery\n"); + goto restart; + } + + spin_lock(&ls->ls_recover_spin); + block_gen = ls->ls_recover_block; + start_gen = ls->ls_recover_start; + mount_gen = ls->ls_recover_mount; + + if (lvb_gen < mount_gen) { + /* wait for mounted nodes to update control_lock lvb to our + generation, which might include new recovery bits set */ + if (sdp->sd_args.ar_spectator) { + fs_info(sdp, "Recovery is required. Waiting for a " + "non-spectator to mount.\n"); + msleep_interruptible(1000); + } else { + fs_info(sdp, "control_mount wait1 block %u start %u " + "mount %u lvb %u flags %lx\n", block_gen, + start_gen, mount_gen, lvb_gen, + ls->ls_recover_flags); + } + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (lvb_gen != start_gen) { + /* wait for mounted nodes to update control_lock lvb to the + latest recovery generation */ + fs_info(sdp, "control_mount wait2 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + if (block_gen == start_gen) { + /* dlm recovery in progress, wait for it to finish */ + fs_info(sdp, "control_mount wait3 block %u start %u mount %u " + "lvb %u flags %lx\n", block_gen, start_gen, mount_gen, + lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + goto restart; + } + + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + set_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + return 0; + +fail: + mounted_unlock(sdp); + control_unlock(sdp); + return error; +} + +static int control_first_done(struct gfs2_sbd *sdp) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t start_gen, block_gen; + int error; + +restart: + spin_lock(&ls->ls_recover_spin); + start_gen = ls->ls_recover_start; + block_gen = ls->ls_recover_block; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags) || + !test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + !test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + /* sanity check, should not happen */ + fs_err(sdp, "control_first_done start %u block %u flags %lx\n", + start_gen, block_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); + control_unlock(sdp); + return -1; + } + + if (start_gen == block_gen) { + /* + * Wait for the end of a dlm recovery cycle to switch from + * first mounter recovery. We can ignore any recover_slot + * callbacks between the recover_prep and next recover_done + * because we are still the first mounter and any failed nodes + * have not fully mounted, so they don't need recovery. 
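+ *
+ * (The wait below is ended by gdlm_recover_done(), which clears
+ * DFL_DLM_RECOVERY and calls wake_up_bit() on ls_recover_flags.)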
+ */ + spin_unlock(&ls->ls_recover_spin); + fs_info(sdp, "control_first_done wait gen %u\n", start_gen); + + wait_on_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY, + TASK_UNINTERRUPTIBLE); + goto restart; + } + + clear_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags); + set_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags); + memset(ls->ls_recover_submit, 0, ls->ls_recover_size*sizeof(uint32_t)); + memset(ls->ls_recover_result, 0, ls->ls_recover_size*sizeof(uint32_t)); + spin_unlock(&ls->ls_recover_spin); + + memset(ls->ls_lvb_bits, 0, GDLM_LVB_SIZE); + control_lvb_write(ls, start_gen, ls->ls_lvb_bits); + + error = mounted_lock(sdp, DLM_LOCK_PR, DLM_LKF_CONVERT); + if (error) + fs_err(sdp, "control_first_done mounted PR error %d\n", error); + + error = control_lock(sdp, DLM_LOCK_NL, DLM_LKF_CONVERT|DLM_LKF_VALBLK); + if (error) + fs_err(sdp, "control_first_done control NL error %d\n", error); + + return error; +} + +/* + * Expand static jid arrays if necessary (by increments of RECOVER_SIZE_INC) + * to accommodate the largest slot number. (NB dlm slot numbers start at 1, + * gfs2 jids start at 0, so jid = slot - 1) + */ + +#define RECOVER_SIZE_INC 16 + +static int set_recover_size(struct gfs2_sbd *sdp, struct dlm_slot *slots, + int num_slots) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t *submit = NULL; + uint32_t *result = NULL; + uint32_t old_size, new_size; + int i, max_jid; + + if (!ls->ls_lvb_bits) { + ls->ls_lvb_bits = kzalloc(GDLM_LVB_SIZE, GFP_NOFS); + if (!ls->ls_lvb_bits) + return -ENOMEM; + } + + max_jid = 0; + for (i = 0; i < num_slots; i++) { + if (max_jid < slots[i].slot - 1) + max_jid = slots[i].slot - 1; + } + + old_size = ls->ls_recover_size; + new_size = old_size; + while (new_size < max_jid + 1) + new_size += RECOVER_SIZE_INC; + if (new_size == old_size) + return 0; + + submit = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); + result = kcalloc(new_size, sizeof(uint32_t), GFP_NOFS); + if (!submit || !result) { + kfree(submit); + kfree(result); + return -ENOMEM; + } + + spin_lock(&ls->ls_recover_spin); + memcpy(submit, ls->ls_recover_submit, old_size * sizeof(uint32_t)); + memcpy(result, ls->ls_recover_result, old_size * sizeof(uint32_t)); + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = submit; + ls->ls_recover_result = result; + ls->ls_recover_size = new_size; + spin_unlock(&ls->ls_recover_spin); + return 0; +} + +static void free_recover_size(struct lm_lockstruct *ls) +{ + kfree(ls->ls_lvb_bits); + kfree(ls->ls_recover_submit); + kfree(ls->ls_recover_result); + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + ls->ls_recover_size = 0; + ls->ls_lvb_bits = NULL; +} + +/* dlm calls before it does lock recovery */ + +static void gdlm_recover_prep(void *arg) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (gfs2_withdrawn(sdp)) { + fs_err(sdp, "recover_prep ignored due to withdraw.\n"); + return; + } + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_block = ls->ls_recover_start; + set_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + + if (!test_bit(DFL_MOUNT_DONE, &ls->ls_recover_flags) || + test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_prep has been completed on all lockspace members; + identifies slot/jid of failed member */ + +static void gdlm_recover_slot(void *arg, struct dlm_slot *slot) +{ 
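+ /*
+ * dlm slot numbers are 1-based while gfs2 jids are 0-based, hence
+ * the slot - 1 translation. The failure is only recorded in
+ * ls_recover_submit[] here; the recovery itself is driven later
+ * from gfs2_control_func().
+ */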
+ struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int jid = slot->slot - 1; + + if (gfs2_withdrawn(sdp)) { + fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n", + jid); + return; + } + spin_lock(&ls->ls_recover_spin); + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recover_slot jid %d gen %u short size %d\n", + jid, ls->ls_recover_block, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + if (ls->ls_recover_submit[jid]) { + fs_info(sdp, "recover_slot jid %d gen %u prev %u\n", + jid, ls->ls_recover_block, ls->ls_recover_submit[jid]); + } + ls->ls_recover_submit[jid] = ls->ls_recover_block; + spin_unlock(&ls->ls_recover_spin); +} + +/* dlm calls after recover_slot and after it completes lock recovery */ + +static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots, + int our_slot, uint32_t generation) +{ + struct gfs2_sbd *sdp = arg; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (gfs2_withdrawn(sdp)) { + fs_err(sdp, "recover_done ignored due to withdraw.\n"); + return; + } + /* ensure the ls jid arrays are large enough */ + set_recover_size(sdp, slots, num_slots); + + spin_lock(&ls->ls_recover_spin); + ls->ls_recover_start = generation; + + if (!ls->ls_recover_mount) { + ls->ls_recover_mount = generation; + ls->ls_jid = our_slot - 1; + } + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0); + + clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags); + smp_mb__after_atomic(); + wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY); + spin_unlock(&ls->ls_recover_spin); +} + +/* gfs2_recover thread has a journal recovery result */ + +static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, + unsigned int result) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + + if (gfs2_withdrawn(sdp)) { + fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n", + jid); + return; + } + if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags)) + return; + + /* don't care about the recovery of own journal during mount */ + if (jid == ls->ls_jid) + return; + + spin_lock(&ls->ls_recover_spin); + if (test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags)) { + spin_unlock(&ls->ls_recover_spin); + return; + } + if (ls->ls_recover_size < jid + 1) { + fs_err(sdp, "recovery_result jid %d short size %d\n", + jid, ls->ls_recover_size); + spin_unlock(&ls->ls_recover_spin); + return; + } + + fs_info(sdp, "recover jid %d result %s\n", jid, + result == LM_RD_GAVEUP ? "busy" : "success"); + + ls->ls_recover_result[jid] = result; + + /* GAVEUP means another node is recovering the journal; delay our + next attempt to recover it, to give the other node a chance to + finish before trying again */ + + if (!test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) + queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, + result == LM_RD_GAVEUP ? 
HZ : 0); + spin_unlock(&ls->ls_recover_spin); +} + +static const struct dlm_lockspace_ops gdlm_lockspace_ops = { + .recover_prep = gdlm_recover_prep, + .recover_slot = gdlm_recover_slot, + .recover_done = gdlm_recover_done, +}; + +static int gdlm_mount(struct gfs2_sbd *sdp, const char *table) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + char cluster[GFS2_LOCKNAME_LEN]; + const char *fsname; + uint32_t flags; + int error, ops_result; + + /* + * initialize everything + */ + + INIT_DELAYED_WORK(&sdp->sd_control_work, gfs2_control_func); + spin_lock_init(&ls->ls_recover_spin); + ls->ls_recover_flags = 0; + ls->ls_recover_mount = 0; + ls->ls_recover_start = 0; + ls->ls_recover_block = 0; + ls->ls_recover_size = 0; + ls->ls_recover_submit = NULL; + ls->ls_recover_result = NULL; + ls->ls_lvb_bits = NULL; + + error = set_recover_size(sdp, NULL, 0); + if (error) + goto fail; + + /* + * prepare dlm_new_lockspace args + */ + + fsname = strchr(table, ':'); + if (!fsname) { + fs_info(sdp, "no fsname found\n"); + error = -EINVAL; + goto fail_free; + } + memset(cluster, 0, sizeof(cluster)); + memcpy(cluster, table, strlen(table) - strlen(fsname)); + fsname++; + + flags = DLM_LSFL_NEWEXCL; + + /* + * create/join lockspace + */ + + error = dlm_new_lockspace(fsname, cluster, flags, GDLM_LVB_SIZE, + &gdlm_lockspace_ops, sdp, &ops_result, + &ls->ls_dlm); + if (error) { + fs_err(sdp, "dlm_new_lockspace error %d\n", error); + goto fail_free; + } + + if (ops_result < 0) { + /* + * dlm does not support ops callbacks, + * old dlm_controld/gfs_controld are used, try without ops. + */ + fs_info(sdp, "dlm lockspace ops not used\n"); + free_recover_size(ls); + set_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags); + return 0; + } + + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) { + fs_err(sdp, "dlm lockspace ops disallow jid preset\n"); + error = -EINVAL; + goto fail_release; + } + + /* + * control_mount() uses control_lock to determine first mounter, + * and for later mounts, waits for any recoveries to be cleared. 
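+ *
+ * (As an example with made-up names: for a mount of table
+ * "alpha:gfs1", the parsing above yields cluster "alpha" and
+ * fsname "gfs1", and control_mount() then decides whether this
+ * node is the first mounter.)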
+ */
+
+ error = control_mount(sdp);
+ if (error) {
+ fs_err(sdp, "mount control error %d\n", error);
+ goto fail_release;
+ }
+
+ ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
+ clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
+ return 0;
+
+fail_release:
+ dlm_release_lockspace(ls->ls_dlm, 2);
+fail_free:
+ free_recover_size(ls);
+fail:
+ return error;
+}
+
+static void gdlm_first_done(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+ int error;
+
+ if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+ return;
+
+ error = control_first_done(sdp);
+ if (error)
+ fs_err(sdp, "mount first_done error %d\n", error);
+}
+
+static void gdlm_unmount(struct gfs2_sbd *sdp)
+{
+ struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+ if (test_bit(DFL_NO_DLM_OPS, &ls->ls_recover_flags))
+ goto release;
+
+ /* wait for gfs2_control_wq to be done with this mount */
+
+ spin_lock(&ls->ls_recover_spin);
+ set_bit(DFL_UNMOUNT, &ls->ls_recover_flags);
+ spin_unlock(&ls->ls_recover_spin);
+ flush_delayed_work(&sdp->sd_control_work);
+
+ /* mounted_lock and control_lock will be purged in dlm recovery */
+release:
+ if (ls->ls_dlm) {
+ dlm_release_lockspace(ls->ls_dlm, 2);
+ ls->ls_dlm = NULL;
+ }
+
+ free_recover_size(ls);
+}
+
+static const match_table_t dlm_tokens = {
+ { Opt_jid, "jid=%d"},
+ { Opt_id, "id=%d"},
+ { Opt_first, "first=%d"},
+ { Opt_nodir, "nodir=%d"},
+ { Opt_err, NULL },
+};
+
+const struct lm_lockops gfs2_dlm_ops = {
+ .lm_proto_name = "lock_dlm",
+ .lm_mount = gdlm_mount,
+ .lm_first_done = gdlm_first_done,
+ .lm_recovery_result = gdlm_recovery_result,
+ .lm_unmount = gdlm_unmount,
+ .lm_put_lock = gdlm_put_lock,
+ .lm_lock = gdlm_lock,
+ .lm_cancel = gdlm_cancel,
+ .lm_tokens = &dlm_tokens,
+};
+
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
new file mode 100644
index 0000000000..e5271ae87d
--- /dev/null
+++ b/fs/gfs2/log.c
@@ -0,0 +1,1347 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/crc32.h>
+#include <linux/crc32c.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/writeback.h>
+#include <linux/list_sort.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "log.h"
+#include "lops.h"
+#include "meta_io.h"
+#include "util.h"
+#include "dir.h"
+#include "trace_gfs2.h"
+#include "trans.h"
+
+static void gfs2_log_shutdown(struct gfs2_sbd *sdp);
+
+/**
+ * gfs2_struct2blk - compute the number of log descriptor blocks needed
+ * @sdp: the filesystem
+ * @nstruct: the number of structures
+ *
+ * Compute the number of log descriptor blocks needed to hold a certain number
+ * of structures of a certain size.
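+ *
+ * As a worked example with illustrative sizes: on a 4KiB-block
+ * filesystem, sd_ldptrs (entries in the initial descriptor block) is
+ * about 503 and sd_inptrs (entries per continuation block) is about
+ * 509, so nstruct = 1000 would need
+ * 1 + DIV_ROUND_UP(1000 - 503, 509) = 2 blocks.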
+ * + * Returns: the number of blocks needed (minimum is always 1) + */ + +unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) +{ + unsigned int blks; + unsigned int first, second; + + /* The initial struct gfs2_log_descriptor block */ + blks = 1; + first = sdp->sd_ldptrs; + + if (nstruct > first) { + /* Subsequent struct gfs2_meta_header blocks */ + second = sdp->sd_inptrs; + blks += DIV_ROUND_UP(nstruct - first, second); + } + + return blks; +} + +/** + * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters + * @bd: The gfs2_bufdata to remove + * + * The ail lock _must_ be held when calling this function + * + */ + +void gfs2_remove_from_ail(struct gfs2_bufdata *bd) +{ + bd->bd_tr = NULL; + list_del_init(&bd->bd_ail_st_list); + list_del_init(&bd->bd_ail_gl_list); + atomic_dec(&bd->bd_gl->gl_ail_count); + brelse(bd->bd_bh); +} + +static int __gfs2_writepage(struct folio *folio, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(&folio->page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + +/** + * gfs2_ail1_start_one - Start I/O on a transaction + * @sdp: The superblock + * @wbc: The writeback control structure + * @tr: The transaction to start I/O on + * @plug: The block plug currently active + */ + +static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, + struct writeback_control *wbc, + struct gfs2_trans *tr, struct blk_plug *plug) +__releases(&sdp->sd_ail_lock) +__acquires(&sdp->sd_ail_lock) +{ + struct gfs2_glock *gl = NULL; + struct address_space *mapping; + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + int ret = 0; + + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + + gfs2_assert(sdp, bd->bd_tr == tr); + + if (!buffer_busy(bh)) { + if (buffer_uptodate(bh)) { + list_move(&bd->bd_ail_st_list, + &tr->tr_ail2_list); + continue; + } + if (!cmpxchg(&sdp->sd_log_error, 0, -EIO)) { + gfs2_io_error_bh(sdp, bh); + gfs2_withdraw_delayed(sdp); + } + } + + if (gfs2_withdrawn(sdp)) { + gfs2_remove_from_ail(bd); + continue; + } + if (!buffer_dirty(bh)) + continue; + if (gl == bd->bd_gl) + continue; + gl = bd->bd_gl; + list_move(&bd->bd_ail_st_list, &tr->tr_ail1_list); + mapping = bh->b_folio->mapping; + if (!mapping) + continue; + spin_unlock(&sdp->sd_ail_lock); + ret = write_cache_pages(mapping, wbc, __gfs2_writepage, mapping); + if (need_resched()) { + blk_finish_plug(plug); + cond_resched(); + blk_start_plug(plug); + } + spin_lock(&sdp->sd_ail_lock); + if (ret == -ENODATA) /* if a jdata write into a new hole */ + ret = 0; /* ignore it */ + if (ret || wbc->nr_to_write <= 0) + break; + return -EBUSY; + } + + return ret; +} + +static void dump_ail_list(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_reverse(bd, &tr->tr_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + fs_err(sdp, "bd %p: blk:0x%llx bh=%p ", bd, + (unsigned long long)bd->bd_blkno, bh); + if (!bh) { + fs_err(sdp, "\n"); + continue; + } + fs_err(sdp, "0x%llx up2:%d dirt:%d lkd:%d req:%d " + "map:%d new:%d ar:%d aw:%d delay:%d " + "io err:%d unwritten:%d dfr:%d pin:%d esc:%d\n", + (unsigned long long)bh->b_blocknr, + buffer_uptodate(bh), buffer_dirty(bh), + buffer_locked(bh), buffer_req(bh), + buffer_mapped(bh), buffer_new(bh), + buffer_async_read(bh), buffer_async_write(bh), + buffer_delay(bh), 
buffer_write_io_error(bh), + buffer_unwritten(bh), + buffer_defer_completion(bh), + buffer_pinned(bh), buffer_escaped(bh)); + } + } +} + +/** + * gfs2_ail1_flush - start writeback of some ail1 entries + * @sdp: The super block + * @wbc: The writeback control structure + * + * Writes back some ail1 entries, according to the limits in the + * writeback control structure + */ + +void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc) +{ + struct list_head *head = &sdp->sd_ail1_list; + struct gfs2_trans *tr; + struct blk_plug plug; + int ret; + unsigned long flush_start = jiffies; + + trace_gfs2_ail_flush(sdp, wbc, 1); + blk_start_plug(&plug); + spin_lock(&sdp->sd_ail_lock); +restart: + ret = 0; + if (time_after(jiffies, flush_start + (HZ * 600))) { + fs_err(sdp, "Error: In %s for ten minutes! t=%d\n", + __func__, current->journal_info ? 1 : 0); + dump_ail_list(sdp); + goto out; + } + list_for_each_entry_reverse(tr, head, tr_list) { + if (wbc->nr_to_write <= 0) + break; + ret = gfs2_ail1_start_one(sdp, wbc, tr, &plug); + if (ret) { + if (ret == -EBUSY) + goto restart; + break; + } + } +out: + spin_unlock(&sdp->sd_ail_lock); + blk_finish_plug(&plug); + if (ret) { + gfs2_lm(sdp, "gfs2_ail1_start_one returned: %d\n", ret); + gfs2_withdraw(sdp); + } + trace_gfs2_ail_flush(sdp, wbc, 0); +} + +/** + * gfs2_ail1_start - start writeback of all ail1 entries + * @sdp: The superblock + */ + +static void gfs2_ail1_start(struct gfs2_sbd *sdp) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, + }; + + return gfs2_ail1_flush(sdp, &wbc); +} + +static void gfs2_log_update_flush_tail(struct gfs2_sbd *sdp) +{ + unsigned int new_flush_tail = sdp->sd_log_head; + struct gfs2_trans *tr; + + if (!list_empty(&sdp->sd_ail1_list)) { + tr = list_last_entry(&sdp->sd_ail1_list, + struct gfs2_trans, tr_list); + new_flush_tail = tr->tr_first; + } + sdp->sd_log_flush_tail = new_flush_tail; +} + +static void gfs2_log_update_head(struct gfs2_sbd *sdp) +{ + unsigned int new_head = sdp->sd_log_flush_head; + + if (sdp->sd_log_flush_tail == sdp->sd_log_head) + sdp->sd_log_flush_tail = new_head; + sdp->sd_log_head = new_head; +} + +/* + * gfs2_ail_empty_tr - empty one of the ail lists of a transaction + */ + +static void gfs2_ail_empty_tr(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + struct list_head *head) +{ + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, + bd_ail_st_list); + gfs2_assert(sdp, bd->bd_tr == tr); + gfs2_remove_from_ail(bd); + } +} + +/** + * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced + * @sdp: the filesystem + * @tr: the transaction + * @max_revokes: If nonzero, issue revokes for the bd items for written buffers + * + * returns: the transaction's count of remaining active items + */ + +static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + int *max_revokes) +{ + struct gfs2_bufdata *bd, *s; + struct buffer_head *bh; + int active_count = 0; + + list_for_each_entry_safe_reverse(bd, s, &tr->tr_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + gfs2_assert(sdp, bd->bd_tr == tr); + /* + * If another process flagged an io error, e.g. writing to the + * journal, error all other bhs and move them off the ail1 to + * prevent a tight loop when unmount tries to flush ail1, + * regardless of whether they're still busy. If no outside + * errors were found and the buffer is busy, move to the next. 
+ * If the ail buffer is not busy and caught an error, flag it + * for others. + */ + if (!sdp->sd_log_error && buffer_busy(bh)) { + active_count++; + continue; + } + if (!buffer_uptodate(bh) && + !cmpxchg(&sdp->sd_log_error, 0, -EIO)) { + gfs2_io_error_bh(sdp, bh); + gfs2_withdraw_delayed(sdp); + } + /* + * If we have space for revokes and the bd is no longer on any + * buf list, we can just add a revoke for it immediately and + * avoid having to put it on the ail2 list, where it would need + * to be revoked later. + */ + if (*max_revokes && list_empty(&bd->bd_list)) { + gfs2_add_revoke(sdp, bd); + (*max_revokes)--; + continue; + } + list_move(&bd->bd_ail_st_list, &tr->tr_ail2_list); + } + return active_count; +} + +/** + * gfs2_ail1_empty - Try to empty the ail1 lists + * @sdp: The superblock + * @max_revokes: If non-zero, add revokes where appropriate + * + * Tries to empty the ail1 lists, starting with the oldest first + */ + +static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) +{ + struct gfs2_trans *tr, *s; + int oldest_tr = 1; + int ret; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) { + if (!gfs2_ail1_empty_one(sdp, tr, &max_revokes) && oldest_tr) + list_move(&tr->tr_list, &sdp->sd_ail2_list); + else + oldest_tr = 0; + } + gfs2_log_update_flush_tail(sdp); + ret = list_empty(&sdp->sd_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + + if (test_bit(SDF_WITHDRAWING, &sdp->sd_flags)) { + gfs2_lm(sdp, "fatal: I/O error(s)\n"); + gfs2_withdraw(sdp); + } + + return ret; +} + +static void gfs2_ail1_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + struct gfs2_bufdata *bd; + struct buffer_head *bh; + + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry(bd, &tr->tr_ail1_list, bd_ail_st_list) { + bh = bd->bd_bh; + if (!buffer_locked(bh)) + continue; + get_bh(bh); + spin_unlock(&sdp->sd_ail_lock); + wait_on_buffer(bh); + brelse(bh); + return; + } + } + spin_unlock(&sdp->sd_ail_lock); +} + +static void __ail2_empty(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); + list_del(&tr->tr_list); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + gfs2_trans_free(sdp, tr); +} + +static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) +{ + struct list_head *ail2_list = &sdp->sd_ail2_list; + unsigned int old_tail = sdp->sd_log_tail; + struct gfs2_trans *tr, *safe; + + spin_lock(&sdp->sd_ail_lock); + if (old_tail <= new_tail) { + list_for_each_entry_safe(tr, safe, ail2_list, tr_list) { + if (old_tail <= tr->tr_first && tr->tr_first < new_tail) + __ail2_empty(sdp, tr); + } + } else { + list_for_each_entry_safe(tr, safe, ail2_list, tr_list) { + if (old_tail <= tr->tr_first || tr->tr_first < new_tail) + __ail2_empty(sdp, tr); + } + } + spin_unlock(&sdp->sd_ail_lock); +} + +/** + * gfs2_log_is_empty - Check if the log is empty + * @sdp: The GFS2 superblock + */ + +bool gfs2_log_is_empty(struct gfs2_sbd *sdp) { + return atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks; +} + +static bool __gfs2_log_try_reserve_revokes(struct gfs2_sbd *sdp, unsigned int revokes) +{ + unsigned int available; + + available = atomic_read(&sdp->sd_log_revokes_available); + while (available >= revokes) { + if (atomic_try_cmpxchg(&sdp->sd_log_revokes_available, + &available, available - revokes)) + return true; + } + return false; +} + +/** + * 
gfs2_log_release_revokes - Release a given number of revokes + * @sdp: The GFS2 superblock + * @revokes: The number of revokes to release + * + * sdp->sd_log_flush_lock must be held. + */ +void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes) +{ + if (revokes) + atomic_add(revokes, &sdp->sd_log_revokes_available); +} + +/** + * gfs2_log_release - Release a given number of log blocks + * @sdp: The GFS2 superblock + * @blks: The number of blocks + * + */ + +void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) +{ + atomic_add(blks, &sdp->sd_log_blks_free); + trace_gfs2_log_blocks(sdp, blks); + gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= + sdp->sd_jdesc->jd_blocks); + if (atomic_read(&sdp->sd_log_blks_needed)) + wake_up(&sdp->sd_log_waitq); +} + +/** + * __gfs2_log_try_reserve - Try to make a log reservation + * @sdp: The GFS2 superblock + * @blks: The number of blocks to reserve + * @taboo_blks: The number of blocks to leave free + * + * Try to do the same as __gfs2_log_reserve(), but fail if no more log + * space is immediately available. + */ +static bool __gfs2_log_try_reserve(struct gfs2_sbd *sdp, unsigned int blks, + unsigned int taboo_blks) +{ + unsigned wanted = blks + taboo_blks; + unsigned int free_blocks; + + free_blocks = atomic_read(&sdp->sd_log_blks_free); + while (free_blocks >= wanted) { + if (atomic_try_cmpxchg(&sdp->sd_log_blks_free, &free_blocks, + free_blocks - blks)) { + trace_gfs2_log_blocks(sdp, -blks); + return true; + } + } + return false; +} + +/** + * __gfs2_log_reserve - Make a log reservation + * @sdp: The GFS2 superblock + * @blks: The number of blocks to reserve + * @taboo_blks: The number of blocks to leave free + * + * @taboo_blks is set to 0 for logd, and to GFS2_LOG_FLUSH_MIN_BLOCKS + * for all other processes. This ensures that when the log is almost full, + * logd will still be able to call gfs2_log_flush one more time without + * blocking, which will advance the tail and make some more log space + * available. + * + * We no longer flush the log here, instead we wake up logd to do that + * for us. To avoid the thundering herd and to ensure that we deal fairly + * with queued waiters, we use an exclusive wait. This means that when we + * get woken with enough journal space to get our reservation, we need to + * wake the next waiter on the list. + */ + +static void __gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks, + unsigned int taboo_blks) +{ + unsigned wanted = blks + taboo_blks; + unsigned int free_blocks; + + atomic_add(blks, &sdp->sd_log_blks_needed); + for (;;) { + if (current != sdp->sd_logd_process) + wake_up(&sdp->sd_logd_waitq); + io_wait_event(sdp->sd_log_waitq, + (free_blocks = atomic_read(&sdp->sd_log_blks_free), + free_blocks >= wanted)); + do { + if (atomic_try_cmpxchg(&sdp->sd_log_blks_free, + &free_blocks, + free_blocks - blks)) + goto reserved; + } while (free_blocks >= wanted); + } + +reserved: + trace_gfs2_log_blocks(sdp, -blks); + if (atomic_sub_return(blks, &sdp->sd_log_blks_needed)) + wake_up(&sdp->sd_log_waitq); +} + +/** + * gfs2_log_try_reserve - Try to make a log reservation + * @sdp: The GFS2 superblock + * @tr: The transaction + * @extra_revokes: The number of additional revokes reserved (output) + * + * This is similar to gfs2_log_reserve, but sdp->sd_log_flush_lock must be + * held for correct revoke accounting. 
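+ *
+ * A worked example with illustrative numbers: with sd_inptrs = 509,
+ * a transaction with tr_revokes = 1000 that cannot be satisfied from
+ * sd_log_revokes_available needs revoke_blks = DIV_ROUND_UP(1000, 509)
+ * = 2 blocks, and *extra_revokes = 2 * 509 - 1000 = 18 spare revokes
+ * are returned to the caller.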
+ */ + +bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes) +{ + unsigned int blks = tr->tr_reserved; + unsigned int revokes = tr->tr_revokes; + unsigned int revoke_blks = 0; + + *extra_revokes = 0; + if (revokes && !__gfs2_log_try_reserve_revokes(sdp, revokes)) { + revoke_blks = DIV_ROUND_UP(revokes, sdp->sd_inptrs); + *extra_revokes = revoke_blks * sdp->sd_inptrs - revokes; + blks += revoke_blks; + } + if (!blks) + return true; + if (__gfs2_log_try_reserve(sdp, blks, GFS2_LOG_FLUSH_MIN_BLOCKS)) + return true; + if (!revoke_blks) + gfs2_log_release_revokes(sdp, revokes); + return false; +} + +/** + * gfs2_log_reserve - Make a log reservation + * @sdp: The GFS2 superblock + * @tr: The transaction + * @extra_revokes: The number of additional revokes reserved (output) + * + * sdp->sd_log_flush_lock must not be held. + */ + +void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes) +{ + unsigned int blks = tr->tr_reserved; + unsigned int revokes = tr->tr_revokes; + unsigned int revoke_blks; + + *extra_revokes = 0; + if (revokes) { + revoke_blks = DIV_ROUND_UP(revokes, sdp->sd_inptrs); + *extra_revokes = revoke_blks * sdp->sd_inptrs - revokes; + blks += revoke_blks; + } + __gfs2_log_reserve(sdp, blks, GFS2_LOG_FLUSH_MIN_BLOCKS); +} + +/** + * log_distance - Compute distance between two journal blocks + * @sdp: The GFS2 superblock + * @newer: The most recent journal block of the pair + * @older: The older journal block of the pair + * + * Compute the distance (in the journal direction) between two + * blocks in the journal + * + * Returns: the distance in blocks + */ + +static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer, + unsigned int older) +{ + int dist; + + dist = newer - older; + if (dist < 0) + dist += sdp->sd_jdesc->jd_blocks; + + return dist; +} + +/** + * calc_reserved - Calculate the number of blocks to keep reserved + * @sdp: The GFS2 superblock + * + * This is complex. We need to reserve room for all our currently used + * metadata blocks (e.g. normal file I/O rewriting file time stamps) and + * all our journaled data blocks for journaled files (e.g. files in the + * meta_fs like rindex, or files for which chattr +j was done.) + * If we don't reserve enough space, corruption will follow. + * + * We can have metadata blocks and jdata blocks in the same journal. Each + * type gets its own log descriptor, for which we need to reserve a block. + * In fact, each type has the potential for needing more than one log descriptor + * in cases where we have more blocks than will fit in a log descriptor. + * Metadata journal entries take up half the space of journaled buffer entries. + * + * Also, we need to reserve blocks for revoke journal entries and one for an + * overall header for the lot. 
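+ *
+ * A worked example with illustrative numbers: a cached transaction
+ * with 100 new metadata blocks and 100 new jdata blocks, where
+ * buf_limit(sdp) = 503 and databuf_limit(sdp) = 251, reserves
+ * GFS2_LOG_FLUSH_MIN_BLOCKS + (100 + 1) + (100 + 1) blocks, the two
+ * extra blocks being one log descriptor per block type.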
+ * + * Returns: the number of blocks reserved + */ +static unsigned int calc_reserved(struct gfs2_sbd *sdp) +{ + unsigned int reserved = GFS2_LOG_FLUSH_MIN_BLOCKS; + unsigned int blocks; + struct gfs2_trans *tr = sdp->sd_log_tr; + + if (tr) { + blocks = tr->tr_num_buf_new - tr->tr_num_buf_rm; + reserved += blocks + DIV_ROUND_UP(blocks, buf_limit(sdp)); + blocks = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + reserved += blocks + DIV_ROUND_UP(blocks, databuf_limit(sdp)); + } + return reserved; +} + +static void log_pull_tail(struct gfs2_sbd *sdp) +{ + unsigned int new_tail = sdp->sd_log_flush_tail; + unsigned int dist; + + if (new_tail == sdp->sd_log_tail) + return; + dist = log_distance(sdp, new_tail, sdp->sd_log_tail); + ail2_empty(sdp, new_tail); + gfs2_log_release(sdp, dist); + sdp->sd_log_tail = new_tail; +} + + +void log_flush_wait(struct gfs2_sbd *sdp) +{ + DEFINE_WAIT(wait); + + if (atomic_read(&sdp->sd_log_in_flight)) { + do { + prepare_to_wait(&sdp->sd_log_flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (atomic_read(&sdp->sd_log_in_flight)) + io_schedule(); + } while(atomic_read(&sdp->sd_log_in_flight)); + finish_wait(&sdp->sd_log_flush_wait, &wait); + } +} + +static int ip_cmp(void *priv, const struct list_head *a, const struct list_head *b) +{ + struct gfs2_inode *ipa, *ipb; + + ipa = list_entry(a, struct gfs2_inode, i_ordered); + ipb = list_entry(b, struct gfs2_inode, i_ordered); + + if (ipa->i_no_addr < ipb->i_no_addr) + return -1; + if (ipa->i_no_addr > ipb->i_no_addr) + return 1; + return 0; +} + +static void __ordered_del_inode(struct gfs2_inode *ip) +{ + if (!list_empty(&ip->i_ordered)) + list_del_init(&ip->i_ordered); +} + +static void gfs2_ordered_write(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + LIST_HEAD(written); + + spin_lock(&sdp->sd_ordered_lock); + list_sort(NULL, &sdp->sd_log_ordered, &ip_cmp); + while (!list_empty(&sdp->sd_log_ordered)) { + ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered); + if (ip->i_inode.i_mapping->nrpages == 0) { + __ordered_del_inode(ip); + continue; + } + list_move(&ip->i_ordered, &written); + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawrite(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); + } + list_splice(&written, &sdp->sd_log_ordered); + spin_unlock(&sdp->sd_ordered_lock); +} + +static void gfs2_ordered_wait(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + + spin_lock(&sdp->sd_ordered_lock); + while (!list_empty(&sdp->sd_log_ordered)) { + ip = list_first_entry(&sdp->sd_log_ordered, struct gfs2_inode, i_ordered); + __ordered_del_inode(ip); + if (ip->i_inode.i_mapping->nrpages == 0) + continue; + spin_unlock(&sdp->sd_ordered_lock); + filemap_fdatawait(ip->i_inode.i_mapping); + spin_lock(&sdp->sd_ordered_lock); + } + spin_unlock(&sdp->sd_ordered_lock); +} + +void gfs2_ordered_del_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + spin_lock(&sdp->sd_ordered_lock); + __ordered_del_inode(ip); + spin_unlock(&sdp->sd_ordered_lock); +} + +void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +{ + struct buffer_head *bh = bd->bd_bh; + struct gfs2_glock *gl = bd->bd_gl; + + sdp->sd_log_num_revoke++; + if (atomic_inc_return(&gl->gl_revokes) == 1) + gfs2_glock_hold(gl); + bh->b_private = NULL; + bd->bd_blkno = bh->b_blocknr; + gfs2_remove_from_ail(bd); /* drops ref on bh */ + bd->bd_bh = NULL; + set_bit(GLF_LFLUSH, &gl->gl_flags); + list_add(&bd->bd_list, &sdp->sd_log_revokes); +} + +void gfs2_glock_remove_revoke(struct gfs2_glock *gl) +{ + 
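+ /*
+ * This undoes gfs2_add_revoke(): dropping the last revoke for a
+ * glock clears GLF_LFLUSH and puts the glock reference taken when
+ * its first revoke was added.
+ */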
if (atomic_dec_return(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + gfs2_glock_queue_put(gl); + } +} + +/** + * gfs2_flush_revokes - Add as many revokes to the system transaction as we can + * @sdp: The GFS2 superblock + * + * Our usual strategy is to defer writing revokes as much as we can in the hope + * that we'll eventually overwrite the journal, which will make those revokes + * go away. This changes when we flush the log: at that point, there will + * likely be some left-over space in the last revoke block of that transaction. + * We can fill that space with additional revokes for blocks that have already + * been written back. This will basically come at no cost now, and will save + * us from having to keep track of those blocks on the AIL2 list later. + */ +void gfs2_flush_revokes(struct gfs2_sbd *sdp) +{ + /* number of revokes we still have room for */ + unsigned int max_revokes = atomic_read(&sdp->sd_log_revokes_available); + + gfs2_log_lock(sdp); + gfs2_ail1_empty(sdp, max_revokes); + gfs2_log_unlock(sdp); +} + +/** + * gfs2_write_log_header - Write a journal log header buffer at lblock + * @sdp: The GFS2 superblock + * @jd: journal descriptor of the journal to which we are writing + * @seq: sequence number + * @tail: tail of the log + * @lblock: value for lh_blkno (block number relative to start of journal) + * @flags: log header flags GFS2_LOG_HEAD_* + * @op_flags: flags to pass to the bio + * + * Returns: the initialized log buffer descriptor + */ + +void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + u64 seq, u32 tail, u32 lblock, u32 flags, + blk_opf_t op_flags) +{ + struct gfs2_log_header *lh; + u32 hash, crc; + struct page *page; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct timespec64 tv; + struct super_block *sb = sdp->sd_vfs; + u64 dblock; + + if (gfs2_withdrawn(sdp)) + return; + + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + lh = page_address(page); + clear_page(lh); + + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + lh->lh_sequence = cpu_to_be64(seq); + lh->lh_flags = cpu_to_be32(flags); + lh->lh_tail = cpu_to_be32(tail); + lh->lh_blkno = cpu_to_be32(lblock); + hash = ~crc32(~0, lh, LH_V1_SIZE); + lh->lh_hash = cpu_to_be32(hash); + + ktime_get_coarse_real_ts64(&tv); + lh->lh_nsec = cpu_to_be32(tv.tv_nsec); + lh->lh_sec = cpu_to_be64(tv.tv_sec); + if (!list_empty(&jd->extent_list)) + dblock = gfs2_log_bmap(jd, lblock); + else { + unsigned int extlen; + int ret; + + extlen = 1; + ret = gfs2_get_extent(jd->jd_inode, lblock, &dblock, &extlen); + if (gfs2_assert_withdraw(sdp, ret == 0)) + return; + } + lh->lh_addr = cpu_to_be64(dblock); + lh->lh_jinode = cpu_to_be64(GFS2_I(jd->jd_inode)->i_no_addr); + + /* We may only write local statfs, quota, etc., when writing to our + own journal. The values are left 0 when recovering a journal + different from our own. 
*/ + if (!(flags & GFS2_LOG_HEAD_RECOVERY)) { + lh->lh_statfs_addr = + cpu_to_be64(GFS2_I(sdp->sd_sc_inode)->i_no_addr); + lh->lh_quota_addr = + cpu_to_be64(GFS2_I(sdp->sd_qc_inode)->i_no_addr); + + spin_lock(&sdp->sd_statfs_spin); + lh->lh_local_total = cpu_to_be64(l_sc->sc_total); + lh->lh_local_free = cpu_to_be64(l_sc->sc_free); + lh->lh_local_dinodes = cpu_to_be64(l_sc->sc_dinodes); + spin_unlock(&sdp->sd_statfs_spin); + } + + BUILD_BUG_ON(offsetof(struct gfs2_log_header, lh_crc) != LH_V1_SIZE); + + crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4, + sb->s_blocksize - LH_V1_SIZE - 4); + lh->lh_crc = cpu_to_be32(crc); + + gfs2_log_write(sdp, jd, page, sb->s_blocksize, 0, dblock); + gfs2_log_submit_bio(&jd->jd_log_bio, REQ_OP_WRITE | op_flags); +} + +/** + * log_write_header - Get and initialize a journal header buffer + * @sdp: The GFS2 superblock + * @flags: The log header flags, including log header origin + * + * Returns: the initialized log buffer descriptor + */ + +static void log_write_header(struct gfs2_sbd *sdp, u32 flags) +{ + blk_opf_t op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; + + gfs2_assert_withdraw(sdp, !test_bit(SDF_FROZEN, &sdp->sd_flags)); + + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { + gfs2_ordered_wait(sdp); + log_flush_wait(sdp); + op_flags = REQ_SYNC | REQ_META | REQ_PRIO; + } + sdp->sd_log_idle = (sdp->sd_log_flush_tail == sdp->sd_log_flush_head); + gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, + sdp->sd_log_flush_tail, sdp->sd_log_flush_head, + flags, op_flags); + gfs2_log_incr_head(sdp); + log_flush_wait(sdp); + log_pull_tail(sdp); + gfs2_log_update_head(sdp); +} + +/** + * gfs2_ail_drain - drain the ail lists after a withdraw + * @sdp: Pointer to GFS2 superblock + */ +void gfs2_ail_drain(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr; + + spin_lock(&sdp->sd_ail_lock); + /* + * For transactions on the sd_ail1_list we need to drain both the + * ail1 and ail2 lists. That's because function gfs2_ail1_start_one + * (temporarily) moves items from its tr_ail1 list to tr_ail2 list + * before revokes are sent for that block. Items on the sd_ail2_list + * should have already gotten beyond that point, so no need. + */ + while (!list_empty(&sdp->sd_ail1_list)) { + tr = list_first_entry(&sdp->sd_ail1_list, struct gfs2_trans, + tr_list); + gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail1_list); + gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); + list_del(&tr->tr_list); + gfs2_trans_free(sdp, tr); + } + while (!list_empty(&sdp->sd_ail2_list)) { + tr = list_first_entry(&sdp->sd_ail2_list, struct gfs2_trans, + tr_list); + gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); + list_del(&tr->tr_list); + gfs2_trans_free(sdp, tr); + } + gfs2_drain_revokes(sdp); + spin_unlock(&sdp->sd_ail_lock); +} + +/** + * empty_ail1_list - try to start IO and empty the ail1 list + * @sdp: Pointer to GFS2 superblock + */ +static void empty_ail1_list(struct gfs2_sbd *sdp) +{ + unsigned long start = jiffies; + + for (;;) { + if (time_after(jiffies, start + (HZ * 600))) { + fs_err(sdp, "Error: In %s for 10 minutes! t=%d\n", + __func__, current->journal_info ? 
1 : 0); + dump_ail_list(sdp); + return; + } + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + if (gfs2_ail1_empty(sdp, 0)) + return; + } +} + +/** + * trans_drain - drain the buf and databuf queue for a failed transaction + * @tr: the transaction to drain + * + * When this is called, we're taking an error exit for a log write that failed + * but since we bypassed the after_commit functions, we need to remove the + * items from the buf and databuf queue. + */ +static void trans_drain(struct gfs2_trans *tr) +{ + struct gfs2_bufdata *bd; + struct list_head *head; + + if (!tr) + return; + + head = &tr->tr_buf; + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + if (!list_empty(&bd->bd_ail_st_list)) + gfs2_remove_from_ail(bd); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } + head = &tr->tr_databuf; + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + if (!list_empty(&bd->bd_ail_st_list)) + gfs2_remove_from_ail(bd); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } +} + +/** + * gfs2_log_flush - flush incore transaction(s) + * @sdp: The filesystem + * @gl: The glock structure to flush. If NULL, flush the whole incore log + * @flags: The log header flags: GFS2_LOG_HEAD_FLUSH_* and debug flags + * + */ + +void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) +{ + struct gfs2_trans *tr = NULL; + unsigned int reserved_blocks = 0, used_blocks = 0; + bool frozen = test_bit(SDF_FROZEN, &sdp->sd_flags); + unsigned int first_log_head; + unsigned int reserved_revokes = 0; + + down_write(&sdp->sd_log_flush_lock); + trace_gfs2_log_flush(sdp, 1, flags); + +repeat: + /* + * Do this check while holding the log_flush_lock to prevent new + * buffers from being added to the ail via gfs2_pin() + */ + if (gfs2_withdrawn(sdp) || !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + goto out; + + /* Log might have been flushed while we waited for the flush lock */ + if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) + goto out; + + first_log_head = sdp->sd_log_head; + sdp->sd_log_flush_head = first_log_head; + + tr = sdp->sd_log_tr; + if (tr || sdp->sd_log_num_revoke) { + if (reserved_blocks) + gfs2_log_release(sdp, reserved_blocks); + reserved_blocks = sdp->sd_log_blks_reserved; + reserved_revokes = sdp->sd_log_num_revoke; + if (tr) { + sdp->sd_log_tr = NULL; + tr->tr_first = first_log_head; + if (unlikely(frozen)) { + if (gfs2_assert_withdraw_delayed(sdp, + !tr->tr_num_buf_new && !tr->tr_num_databuf_new)) + goto out_withdraw; + } + } + } else if (!reserved_blocks) { + unsigned int taboo_blocks = GFS2_LOG_FLUSH_MIN_BLOCKS; + + reserved_blocks = GFS2_LOG_FLUSH_MIN_BLOCKS; + if (current == sdp->sd_logd_process) + taboo_blocks = 0; + + if (!__gfs2_log_try_reserve(sdp, reserved_blocks, taboo_blocks)) { + up_write(&sdp->sd_log_flush_lock); + __gfs2_log_reserve(sdp, reserved_blocks, taboo_blocks); + down_write(&sdp->sd_log_flush_lock); + goto repeat; + } + BUG_ON(sdp->sd_log_num_revoke); + } + + if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN) + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (unlikely(frozen)) + if (gfs2_assert_withdraw_delayed(sdp, !reserved_revokes)) + goto out_withdraw; + + gfs2_ordered_write(sdp); + if (gfs2_withdrawn(sdp)) + goto out_withdraw; + lops_before_commit(sdp, tr); + if (gfs2_withdrawn(sdp)) + goto out_withdraw; + gfs2_log_submit_bio(&sdp->sd_jdesc->jd_log_bio, REQ_OP_WRITE); + if (gfs2_withdrawn(sdp)) + goto out_withdraw; + + if 
(sdp->sd_log_head != sdp->sd_log_flush_head) { + log_write_header(sdp, flags); + } else if (sdp->sd_log_tail != sdp->sd_log_flush_tail && !sdp->sd_log_idle) { + log_write_header(sdp, flags); + } + if (gfs2_withdrawn(sdp)) + goto out_withdraw; + lops_after_commit(sdp, tr); + + gfs2_log_lock(sdp); + sdp->sd_log_blks_reserved = 0; + + spin_lock(&sdp->sd_ail_lock); + if (tr && !list_empty(&tr->tr_ail1_list)) { + list_add(&tr->tr_list, &sdp->sd_ail1_list); + tr = NULL; + } + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + + if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) { + if (!sdp->sd_log_idle) { + empty_ail1_list(sdp); + if (gfs2_withdrawn(sdp)) + goto out_withdraw; + log_write_header(sdp, flags); + } + if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN | + GFS2_LOG_HEAD_FLUSH_FREEZE)) + gfs2_log_shutdown(sdp); + } + +out_end: + used_blocks = log_distance(sdp, sdp->sd_log_flush_head, first_log_head); + reserved_revokes += atomic_read(&sdp->sd_log_revokes_available); + atomic_set(&sdp->sd_log_revokes_available, sdp->sd_ldptrs); + gfs2_assert_withdraw(sdp, reserved_revokes % sdp->sd_inptrs == sdp->sd_ldptrs); + if (reserved_revokes > sdp->sd_ldptrs) + reserved_blocks += (reserved_revokes - sdp->sd_ldptrs) / sdp->sd_inptrs; +out: + if (used_blocks != reserved_blocks) { + gfs2_assert_withdraw_delayed(sdp, used_blocks < reserved_blocks); + gfs2_log_release(sdp, reserved_blocks - used_blocks); + } + up_write(&sdp->sd_log_flush_lock); + gfs2_trans_free(sdp, tr); + if (gfs2_withdrawing(sdp)) + gfs2_withdraw(sdp); + trace_gfs2_log_flush(sdp, 0, flags); + return; + +out_withdraw: + trans_drain(tr); + /** + * If the tr_list is empty, we're withdrawing during a log + * flush that targets a transaction, but the transaction was + * never queued onto any of the ail lists. Here we add it to + * ail1 just so that ail_drain() will find and free it. 
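+ * + * (Editor's cross-reference, not original patch text: gfs2_ail_drain(), + * defined earlier in this file, walks sd_ail1_list, empties each + * transaction's ail1 and ail2 lists, and frees the transaction with + * gfs2_trans_free(); that is why an otherwise unqueued transaction must be + * parked on ail1 here.)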
+ */ + spin_lock(&sdp->sd_ail_lock); + if (tr && list_empty(&tr->tr_list)) + list_add(&tr->tr_list, &sdp->sd_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + tr = NULL; + goto out_end; +} + +/** + * gfs2_merge_trans - Merge a new transaction into a cached transaction + * @sdp: the filesystem + * @new: New transaction to be merged + */ + +static void gfs2_merge_trans(struct gfs2_sbd *sdp, struct gfs2_trans *new) +{ + struct gfs2_trans *old = sdp->sd_log_tr; + + WARN_ON_ONCE(!test_bit(TR_ATTACHED, &old->tr_flags)); + + old->tr_num_buf_new += new->tr_num_buf_new; + old->tr_num_databuf_new += new->tr_num_databuf_new; + old->tr_num_buf_rm += new->tr_num_buf_rm; + old->tr_num_databuf_rm += new->tr_num_databuf_rm; + old->tr_revokes += new->tr_revokes; + old->tr_num_revoke += new->tr_num_revoke; + + list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); + list_splice_tail_init(&new->tr_buf, &old->tr_buf); + + spin_lock(&sdp->sd_ail_lock); + list_splice_tail_init(&new->tr_ail1_list, &old->tr_ail1_list); + list_splice_tail_init(&new->tr_ail2_list, &old->tr_ail2_list); + spin_unlock(&sdp->sd_ail_lock); +} + +static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + unsigned int reserved; + unsigned int unused; + unsigned int maxres; + + gfs2_log_lock(sdp); + + if (sdp->sd_log_tr) { + gfs2_merge_trans(sdp, tr); + } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { + gfs2_assert_withdraw(sdp, !test_bit(TR_ONSTACK, &tr->tr_flags)); + sdp->sd_log_tr = tr; + set_bit(TR_ATTACHED, &tr->tr_flags); + } + + reserved = calc_reserved(sdp); + maxres = sdp->sd_log_blks_reserved + tr->tr_reserved; + gfs2_assert_withdraw(sdp, maxres >= reserved); + unused = maxres - reserved; + if (unused) + gfs2_log_release(sdp, unused); + sdp->sd_log_blks_reserved = reserved; + + gfs2_log_unlock(sdp); +} + +static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) +{ + return atomic_read(&sdp->sd_log_pinned) + + atomic_read(&sdp->sd_log_blks_needed) >= + atomic_read(&sdp->sd_log_thresh1); +} + +static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) +{ + return sdp->sd_jdesc->jd_blocks - + atomic_read(&sdp->sd_log_blks_free) + + atomic_read(&sdp->sd_log_blks_needed) >= + atomic_read(&sdp->sd_log_thresh2); +} + +/** + * gfs2_log_commit - Commit a transaction to the log + * @sdp: the filesystem + * @tr: the transaction + * + * We wake up gfs2_logd if the number of pinned blocks exceed thresh1 + * or the total number of used blocks (pinned blocks plus AIL blocks) + * is greater than thresh2. + * + * At mount time thresh1 is 2/5ths of journal size, thresh2 is 4/5ths of + * journal size. 
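+ * + * Illustrative arithmetic (journal size assumed for the example, not taken + * from this patch): with a 32768-block journal, thresh1 = 13107 and + * thresh2 = 26214, so logd is woken once more than 13107 blocks are pinned, + * or once fewer than 32768 - 26214 = 6554 free log blocks remain.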
+ */ + +void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + log_refund(sdp, tr); + + if (gfs2_ail_flush_reqd(sdp) || gfs2_jrnl_flush_reqd(sdp)) + wake_up(&sdp->sd_logd_waitq); +} + +/** + * gfs2_log_shutdown - write a shutdown header into a journal + * @sdp: the filesystem + * + */ + +static void gfs2_log_shutdown(struct gfs2_sbd *sdp) +{ + gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved); + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); + + log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT | GFS2_LFC_SHUTDOWN); + log_pull_tail(sdp); + + gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); + gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); +} + +/** + * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks + * @data: Pointer to GFS2 superblock + * + * Flushes the log when the flush thresholds are crossed, and otherwise at + * most every gt_logd_secs seconds; withdraws the file system if an error + * has been detected while writing to the journal. + */ + +int gfs2_logd(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t = 1; + + while (!kthread_should_stop()) { + if (gfs2_withdrawn(sdp)) + break; + + /* Check for errors writing to the journal */ + if (sdp->sd_log_error) { + gfs2_lm(sdp, + "GFS2: fsid=%s: error %d: " + "withdrawing the file system to " + "prevent further damage.\n", + sdp->sd_fsname, sdp->sd_log_error); + gfs2_withdraw(sdp); + break; + } + + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { + gfs2_ail1_empty(sdp, 0); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_LOGD_JFLUSH_REQD); + } + + if (test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) || + gfs2_ail_flush_reqd(sdp)) { + clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); + gfs2_ail1_start(sdp); + gfs2_ail1_wait(sdp); + gfs2_ail1_empty(sdp, 0); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_LOGD_AIL_FLUSH_REQD); + } + + t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; + + try_to_freeze(); + + t = wait_event_interruptible_timeout(sdp->sd_logd_waitq, + test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) || + gfs2_ail_flush_reqd(sdp) || + gfs2_jrnl_flush_reqd(sdp) || + sdp->sd_log_error || + gfs2_withdrawn(sdp) || + kthread_should_stop(), + t); + } + + return 0; +} + diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h new file mode 100644 index 0000000000..653cffcbf8 --- /dev/null +++ b/fs/gfs2/log.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +#ifndef __LOG_DOT_H__ +#define __LOG_DOT_H__ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/writeback.h> +#include "incore.h" +#include "inode.h" + +/* + * The minimum amount of log space required for a log flush is one block for + * revokes and one block for the log header. Log flushes other than + * GFS2_LOG_HEAD_FLUSH_NORMAL may write one or two more log headers.
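+ * That accounts for the value of 4 below: one block for revokes, one for the + * log header, plus up to two additional headers (editor's reading of the + * comment above, not text from the original patch).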
+ */ +#define GFS2_LOG_FLUSH_MIN_BLOCKS 4 + +/** + * gfs2_log_lock - acquire the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_lock(struct gfs2_sbd *sdp) +__acquires(&sdp->sd_log_lock) +{ + spin_lock(&sdp->sd_log_lock); +} + +/** + * gfs2_log_unlock - release the right to mess with the log manager + * @sdp: the filesystem + * + */ + +static inline void gfs2_log_unlock(struct gfs2_sbd *sdp) +__releases(&sdp->sd_log_lock) +{ + spin_unlock(&sdp->sd_log_lock); +} + +static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, + unsigned int value) +{ + if (++value == sdp->sd_jdesc->jd_blocks) { + value = 0; + } + sdp->sd_log_tail = value; + sdp->sd_log_flush_tail = value; + sdp->sd_log_head = value; +} + +static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + if (gfs2_is_jdata(ip) || !gfs2_is_ordered(sdp)) + return; + + if (list_empty(&ip->i_ordered)) { + spin_lock(&sdp->sd_ordered_lock); + if (list_empty(&ip->i_ordered)) + list_add(&ip->i_ordered, &sdp->sd_log_ordered); + spin_unlock(&sdp->sd_ordered_lock); + } +} + +extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); +extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); +extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +extern bool gfs2_log_is_empty(struct gfs2_sbd *sdp); +extern void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes); +extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); +extern bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes); +extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes); +extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + u64 seq, u32 tail, u32 lblock, u32 flags, + blk_opf_t op_flags); +extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, + u32 type); +extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); +extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc); +extern void log_flush_wait(struct gfs2_sbd *sdp); + +extern int gfs2_logd(void *data); +extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); +extern void gfs2_flush_revokes(struct gfs2_sbd *sdp); +extern void gfs2_ail_drain(struct gfs2_sbd *sdp); + +#endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c new file mode 100644 index 0000000000..483f698070 --- /dev/null +++ b/fs/gfs2/lops.c @@ -0,0 +1,1128 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/mempool.h> +#include <linux/gfs2_ondisk.h> +#include <linux/bio.h> +#include <linux/fs.h> +#include <linux/list_sort.h> +#include <linux/blkdev.h> + +#include "bmap.h" +#include "dir.h" +#include "gfs2.h" +#include "incore.h" +#include "inode.h" +#include "glock.h" +#include "glops.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +/** + * gfs2_pin - Pin a buffer in memory + * @sdp: The superblock + * @bh: The buffer to be pinned + * + * The log lock must be held when calling this function + */ +void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + BUG_ON(!current->journal_info); + + clear_buffer_dirty(bh); + if (test_set_buffer_pinned(bh)) + gfs2_assert_withdraw(sdp, 0); + if (!buffer_uptodate(bh)) + gfs2_io_error_bh_wd(sdp, bh); + bd = bh->b_private; + /* If this buffer is in the AIL and it has already been written + * to in-place disk block, remove it from the AIL. + */ + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_tr) + list_move(&bd->bd_ail_st_list, &bd->bd_tr->tr_ail2_list); + spin_unlock(&sdp->sd_ail_lock); + get_bh(bh); + atomic_inc(&sdp->sd_log_pinned); + trace_gfs2_pin(bd, 1); +} + +static bool buffer_is_rgrp(const struct gfs2_bufdata *bd) +{ + return bd->bd_gl->gl_name.ln_type == LM_TYPE_RGRP; +} + +static void maybe_release_space(struct gfs2_bufdata *bd) +{ + struct gfs2_glock *gl = bd->bd_gl; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; + struct gfs2_bitmap *bi = rgd->rd_bits + index; + + rgrp_lock_local(rgd); + if (bi->bi_clone == NULL) + goto out; + if (sdp->sd_args.ar_discard) + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); + memcpy(bi->bi_clone + bi->bi_offset, + bd->bd_bh->b_data + bi->bi_offset, bi->bi_bytes); + clear_bit(GBF_FULL, &bi->bi_flags); + rgd->rd_free_clone = rgd->rd_free; + BUG_ON(rgd->rd_free_clone < rgd->rd_reserved); + rgd->rd_extfail_pt = rgd->rd_free; + +out: + rgrp_unlock_local(rgd); +} + +/** + * gfs2_unpin - Unpin a buffer + * @sdp: the filesystem the buffer belongs to + * @bh: The buffer to unpin + * @tr: The system transaction being flushed + */ + +static void gfs2_unpin(struct gfs2_sbd *sdp, struct buffer_head *bh, + struct gfs2_trans *tr) +{ + struct gfs2_bufdata *bd = bh->b_private; + + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!buffer_pinned(bh)); + + lock_buffer(bh); + mark_buffer_dirty(bh); + clear_buffer_pinned(bh); + + if (buffer_is_rgrp(bd)) + maybe_release_space(bd); + + spin_lock(&sdp->sd_ail_lock); + if (bd->bd_tr) { + list_del(&bd->bd_ail_st_list); + brelse(bh); + } else { + struct gfs2_glock *gl = bd->bd_gl; + list_add(&bd->bd_ail_gl_list, &gl->gl_ail_list); + atomic_inc(&gl->gl_ail_count); + } + bd->bd_tr = tr; + list_add(&bd->bd_ail_st_list, &tr->tr_ail1_list); + spin_unlock(&sdp->sd_ail_lock); + + clear_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + trace_gfs2_pin(bd, 0); + unlock_buffer(bh); + atomic_dec(&sdp->sd_log_pinned); +} + +void gfs2_log_incr_head(struct gfs2_sbd *sdp) +{ + BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && + (sdp->sd_log_flush_head != sdp->sd_log_head)); + + if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) + sdp->sd_log_flush_head = 0; +} + +u64 
gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock) +{ + struct gfs2_journal_extent *je; + + list_for_each_entry(je, &jd->extent_list, list) { + if (lblock >= je->lblock && lblock < je->lblock + je->blocks) + return je->dblock + lblock - je->lblock; + } + + return -1; +} + +/** + * gfs2_end_log_write_bh - end log write of pagecache data with buffers + * @sdp: The superblock + * @bvec: The bio_vec + * @error: The i/o status + * + * This finds the relevant buffers and unlocks them and sets the + * error flag according to the status of the i/o request. This is + * used when the log is writing data which has an in-place version + * that is pinned in the pagecache. + */ + +static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, + struct bio_vec *bvec, + blk_status_t error) +{ + struct buffer_head *bh, *next; + struct page *page = bvec->bv_page; + unsigned size; + + bh = page_buffers(page); + size = bvec->bv_len; + while (bh_offset(bh) < bvec->bv_offset) + bh = bh->b_this_page; + do { + if (error) + mark_buffer_write_io_error(bh); + unlock_buffer(bh); + next = bh->b_this_page; + size -= bh->b_size; + brelse(bh); + bh = next; + } while(bh && size); +} + +/** + * gfs2_end_log_write - end of i/o to the log + * @bio: The bio + * + * Each bio_vec contains either data from the pagecache or data + * relating to the log itself. Here we iterate over the bio_vec + * array, processing both kinds of data. + * + */ + +static void gfs2_end_log_write(struct bio *bio) +{ + struct gfs2_sbd *sdp = bio->bi_private; + struct bio_vec *bvec; + struct page *page; + struct bvec_iter_all iter_all; + + if (bio->bi_status) { + if (!cmpxchg(&sdp->sd_log_error, 0, (int)bio->bi_status)) + fs_err(sdp, "Error %d writing to journal, jid=%u\n", + bio->bi_status, sdp->sd_jdesc->jd_jid); + gfs2_withdraw_delayed(sdp); + /* prevent more writes to the journal */ + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + wake_up(&sdp->sd_logd_waitq); + } + + bio_for_each_segment_all(bvec, bio, iter_all) { + page = bvec->bv_page; + if (page_has_buffers(page)) + gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); + else + mempool_free(page, gfs2_page_pool); + } + + bio_put(bio); + if (atomic_dec_and_test(&sdp->sd_log_in_flight)) + wake_up(&sdp->sd_log_flush_wait); +} + +/** + * gfs2_log_submit_bio - Submit any pending log bio + * @biop: Address of the bio pointer + * @opf: REQ_OP | op_flags + * + * Submit any pending part-built or full bio to the block device. If + * there is no pending bio, then this is a no-op. + */ + +void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf) +{ + struct bio *bio = *biop; + if (bio) { + struct gfs2_sbd *sdp = bio->bi_private; + atomic_inc(&sdp->sd_log_in_flight); + bio->bi_opf = opf; + submit_bio(bio); + *biop = NULL; + } +} + +/** + * gfs2_log_alloc_bio - Allocate a bio + * @sdp: The super block + * @blkno: The device block number we want to write to + * @end_io: The bi_end_io callback + * + * Allocate a new bio, initialize it with the given parameters and return it. 
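+ * + * (Illustrative numbers, assumed for the example: with 4 KiB filesystem + * blocks on a 512-byte-sector device, sd_fsb2bb_shift is 3, so blkno 100 + * maps to bi_sector 100 << 3 == 800.)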
+ * + * Returns: The newly allocated bio + */ + +static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno, + bio_end_io_t *end_io) +{ + struct super_block *sb = sdp->sd_vfs; + struct bio *bio = bio_alloc(sb->s_bdev, BIO_MAX_VECS, 0, GFP_NOIO); + + bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift; + bio->bi_end_io = end_io; + bio->bi_private = sdp; + + return bio; +} + +/** + * gfs2_log_get_bio - Get cached log bio, or allocate a new one + * @sdp: The super block + * @blkno: The device block number we want to write to + * @biop: The bio to get or allocate + * @op: REQ_OP + * @end_io: The bi_end_io callback + * @flush: Always flush the current bio and allocate a new one? + * + * If there is a cached bio, then if the next block number is sequential + * with the previous one, return it, otherwise flush the bio to the + * device. If there is no cached bio, or we just flushed it, then + * allocate a new one. + * + * Returns: The bio to use for log writes + */ + +static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, + struct bio **biop, enum req_op op, + bio_end_io_t *end_io, bool flush) +{ + struct bio *bio = *biop; + + if (bio) { + u64 nblk; + + nblk = bio_end_sector(bio); + nblk >>= sdp->sd_fsb2bb_shift; + if (blkno == nblk && !flush) + return bio; + gfs2_log_submit_bio(biop, op); + } + + *biop = gfs2_log_alloc_bio(sdp, blkno, end_io); + return *biop; +} + +/** + * gfs2_log_write - write to log + * @sdp: the filesystem + * @jd: The journal descriptor + * @page: the page to write + * @size: the size of the data to write + * @offset: the offset within the page + * @blkno: block number of the log entry + * + * Try and add the page segment to the current bio. If that fails, + * submit the current bio to the device and create a new one, and + * then add the page segment to that. + */ + +void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + struct page *page, unsigned size, unsigned offset, + u64 blkno) +{ + struct bio *bio; + int ret; + + bio = gfs2_log_get_bio(sdp, blkno, &jd->jd_log_bio, REQ_OP_WRITE, + gfs2_end_log_write, false); + ret = bio_add_page(bio, page, size, offset); + if (ret == 0) { + bio = gfs2_log_get_bio(sdp, blkno, &jd->jd_log_bio, + REQ_OP_WRITE, gfs2_end_log_write, true); + ret = bio_add_page(bio, page, size, offset); + WARN_ON(ret == 0); + } +} + +/** + * gfs2_log_write_bh - write a buffer's content to the log + * @sdp: The super block + * @bh: The buffer pointing to the in-place location + * + * This writes the content of the buffer to the next available location + * in the log. The buffer will be unlocked once the i/o to the log has + * completed. + */ + +static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + u64 dblock; + + dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); + gfs2_log_write(sdp, sdp->sd_jdesc, bh->b_page, bh->b_size, + bh_offset(bh), dblock); +} + +/** + * gfs2_log_write_page - write one block stored in a page, into the log + * @sdp: The superblock + * @page: The struct page + * + * This writes the first block-sized part of the page into the log. Note + * that the page must have been allocated from the gfs2_page_pool mempool + * and that after this has been called, ownership has been transferred and + * the page may be freed at any time. 
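+ * + * (Editor's cross-reference: the matching release is the mempool_free() call + * in gfs2_end_log_write() earlier in this file, which frees any bio page + * that has no buffer heads attached.)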
+ */ + +static void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) +{ + struct super_block *sb = sdp->sd_vfs; + u64 dblock; + + dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); + gfs2_log_write(sdp, sdp->sd_jdesc, page, sb->s_blocksize, 0, dblock); +} + +/** + * gfs2_end_log_read - end I/O callback for reads from the log + * @bio: The bio + * + * Simply unlock the pages in the bio. The main thread will wait on them and + * process them in order as necessary. + */ + +static void gfs2_end_log_read(struct bio *bio) +{ + struct page *page; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + page = bvec->bv_page; + if (bio->bi_status) { + int err = blk_status_to_errno(bio->bi_status); + + SetPageError(page); + mapping_set_error(page->mapping, err); + } + unlock_page(page); + } + + bio_put(bio); +} + +/** + * gfs2_jhead_pg_srch - Look for the journal head in a given page. + * @jd: The journal descriptor + * @head: The journal head to start from + * @page: The page to look in + * + * Returns: true if found, false otherwise. + */ + +static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct page *page) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header_host lh; + void *kaddr; + unsigned int offset; + bool ret = false; + + kaddr = kmap_local_page(page); + for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { + if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { + if (lh.lh_sequence >= head->lh_sequence) + *head = lh; + else { + ret = true; + break; + } + } + } + kunmap_local(kaddr); + return ret; +} + +/** + * gfs2_jhead_process_page - Search/cleanup a page + * @jd: The journal descriptor + * @index: Index of the page to look into + * @head: The journal head to start from + * @done: If set, perform only cleanup, else search and set if found. + * + * Find the folio with 'index' in the journal's mapping. Search the folio for + * the journal head if requested (*done == false). Release refs on the + * folio so the page cache can reclaim it. We grabbed a + * reference on this folio twice, first when we did a grab_cache_page() + * to obtain the folio to add it to the bio and second when we do a + * filemap_get_folio() here to get the folio to wait on while I/O on it is being + * completed. + * This function is also used to free up a folio we might've grabbed but not + * used. Maybe we added it to a bio, but not submitted it for I/O. Or we + * submitted the I/O, but we already found the jhead so we only need to drop + * our references to the folio.
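+ * + * (Worked illustration with assumed sequence numbers: scanning headers with + * sequences 7, 8, 9, 3 in journal order, gfs2_jhead_pg_srch() keeps updating + * *head while the sequence rises and stops at the drop from 9 to 3, leaving + * the header with sequence 9 as the journal head.)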
+ */ + +static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, + struct gfs2_log_header_host *head, + bool *done) +{ + struct folio *folio; + + folio = filemap_get_folio(jd->jd_inode->i_mapping, index); + + folio_wait_locked(folio); + if (folio_test_error(folio)) + *done = true; + + if (!*done) + *done = gfs2_jhead_pg_srch(jd, head, &folio->page); + + /* filemap_get_folio() and the earlier grab_cache_page() */ + folio_put_refs(folio, 2); +} + +static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) +{ + struct bio *new; + + new = bio_alloc(prev->bi_bdev, nr_iovecs, prev->bi_opf, GFP_NOIO); + bio_clone_blkg_association(new, prev); + new->bi_iter.bi_sector = bio_end_sector(prev); + bio_chain(new, prev); + submit_bio(prev); + return new; +} + +/** + * gfs2_find_jhead - find the head of a log + * @jd: The journal descriptor + * @head: The log descriptor for the head of the log is returned here + * @keep_cache: If set inode pages will not be truncated + * + * Do a search of a journal by reading it in large chunks using bios and find + * the valid log entry with the highest sequence number. (i.e. the log head) + * + * Returns: 0 on success, errno otherwise + */ +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, + bool keep_cache) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct address_space *mapping = jd->jd_inode->i_mapping; + unsigned int block = 0, blocks_submitted = 0, blocks_read = 0; + unsigned int bsize = sdp->sd_sb.sb_bsize, off; + unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift; + unsigned int shift = PAGE_SHIFT - bsize_shift; + unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift; + struct gfs2_journal_extent *je; + int sz, ret = 0; + struct bio *bio = NULL; + struct page *page = NULL; + bool done = false; + errseq_t since; + + memset(head, 0, sizeof(*head)); + if (list_empty(&jd->extent_list)) + gfs2_map_journal_extents(sdp, jd); + + since = filemap_sample_wb_err(mapping); + list_for_each_entry(je, &jd->extent_list, list) { + u64 dblock = je->dblock; + + for (; block < je->lblock + je->blocks; block++, dblock++) { + if (!page) { + page = grab_cache_page(mapping, block >> shift); + if (!page) { + ret = -ENOMEM; + done = true; + goto out; + } + off = 0; + } + + if (bio && (off || block < blocks_submitted + max_blocks)) { + sector_t sector = dblock << sdp->sd_fsb2bb_shift; + + if (bio_end_sector(bio) == sector) { + sz = bio_add_page(bio, page, bsize, off); + if (sz == bsize) + goto block_added; + } + if (off) { + unsigned int blocks = + (PAGE_SIZE - off) >> bsize_shift; + + bio = gfs2_chain_bio(bio, blocks); + goto add_block_to_new_bio; + } + } + + if (bio) { + blocks_submitted = block; + submit_bio(bio); + } + + bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); + bio->bi_opf = REQ_OP_READ; +add_block_to_new_bio: + sz = bio_add_page(bio, page, bsize, off); + BUG_ON(sz != bsize); +block_added: + off += bsize; + if (off == PAGE_SIZE) + page = NULL; + if (blocks_submitted <= blocks_read + max_blocks) { + /* Keep at least one bio in flight */ + continue; + } + + gfs2_jhead_process_page(jd, blocks_read >> shift, head, &done); + blocks_read += PAGE_SIZE >> bsize_shift; + if (done) + goto out; /* found */ + } + } + +out: + if (bio) + submit_bio(bio); + while (blocks_read < block) { + gfs2_jhead_process_page(jd, blocks_read >> shift, head, &done); + blocks_read += PAGE_SIZE >> bsize_shift; + } + + if (!ret) + ret = filemap_check_wb_err(mapping, since); + + if (!keep_cache) + 
truncate_inode_pages(mapping, 0); + + return ret; +} + +static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, + u32 ld_length, u32 ld_data1) +{ + struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + struct gfs2_log_descriptor *ld = page_address(page); + clear_page(ld); + ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + ld->ld_header.mh_type = cpu_to_be32(GFS2_METATYPE_LD); + ld->ld_header.mh_format = cpu_to_be32(GFS2_FORMAT_LD); + ld->ld_type = cpu_to_be32(ld_type); + ld->ld_length = cpu_to_be32(ld_length); + ld->ld_data1 = cpu_to_be32(ld_data1); + ld->ld_data2 = 0; + return page; +} + +static void gfs2_check_magic(struct buffer_head *bh) +{ + void *kaddr; + __be32 *ptr; + + clear_buffer_escaped(bh); + kaddr = kmap_local_page(bh->b_page); + ptr = kaddr + bh_offset(bh); + if (*ptr == cpu_to_be32(GFS2_MAGIC)) + set_buffer_escaped(bh); + kunmap_local(kaddr); +} + +static int blocknr_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct gfs2_bufdata *bda, *bdb; + + bda = list_entry(a, struct gfs2_bufdata, bd_list); + bdb = list_entry(b, struct gfs2_bufdata, bd_list); + + if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) + return -1; + if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) + return 1; + return 0; +} + +static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, + unsigned int total, struct list_head *blist, + bool is_databuf) +{ + struct gfs2_log_descriptor *ld; + struct gfs2_bufdata *bd1 = NULL, *bd2; + struct page *page; + unsigned int num; + unsigned n; + __be64 *ptr; + + gfs2_log_lock(sdp); + list_sort(NULL, blist, blocknr_cmp); + bd1 = bd2 = list_prepare_entry(bd1, blist, bd_list); + while(total) { + num = total; + if (total > limit) + num = limit; + gfs2_log_unlock(sdp); + page = gfs2_get_log_desc(sdp, + is_databuf ? GFS2_LOG_DESC_JDATA : + GFS2_LOG_DESC_METADATA, num + 1, num); + ld = page_address(page); + gfs2_log_lock(sdp); + ptr = (__be64 *)(ld + 1); + + n = 0; + list_for_each_entry_continue(bd1, blist, bd_list) { + *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr); + if (is_databuf) { + gfs2_check_magic(bd1->bd_bh); + *ptr++ = cpu_to_be64(buffer_escaped(bd1->bd_bh) ? 
1 : 0); + } + if (++n >= num) + break; + } + + gfs2_log_unlock(sdp); + gfs2_log_write_page(sdp, page); + gfs2_log_lock(sdp); + + n = 0; + list_for_each_entry_continue(bd2, blist, bd_list) { + get_bh(bd2->bd_bh); + gfs2_log_unlock(sdp); + lock_buffer(bd2->bd_bh); + + if (buffer_escaped(bd2->bd_bh)) { + void *p; + + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + p = page_address(page); + memcpy_from_page(p, bd2->bd_bh->b_page, bh_offset(bd2->bd_bh), bd2->bd_bh->b_size); + *(__be32 *)p = 0; + clear_buffer_escaped(bd2->bd_bh); + unlock_buffer(bd2->bd_bh); + brelse(bd2->bd_bh); + gfs2_log_write_page(sdp, page); + } else { + gfs2_log_write_bh(sdp, bd2->bd_bh); + } + gfs2_log_lock(sdp); + if (++n >= num) + break; + } + + BUG_ON(total < num); + total -= num; + } + gfs2_log_unlock(sdp); +} + +static void buf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + unsigned int limit = buf_limit(sdp); /* 503 for 4k blocks */ + unsigned int nbuf; + if (tr == NULL) + return; + nbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; + gfs2_before_commit(sdp, limit, nbuf, &tr->tr_buf, 0); +} + +static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head; + struct gfs2_bufdata *bd; + + if (tr == NULL) + return; + + head = &tr->tr_buf; + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + gfs2_unpin(sdp, bd->bd_bh, tr); + } +} + +static void buf_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, int pass) +{ + if (pass != 0) + return; + + jd->jd_found_blocks = 0; + jd->jd_replayed_blocks = 0; +} + +#define obsolete_rgrp_replay \ +"Replaying 0x%llx from jid=%d/0x%llx but we already have a bh!\n" +#define obsolete_rgrp_replay2 \ +"busy:%d, pinned:%d rg_gen:0x%llx, j_gen:0x%llx\n" + +static void obsolete_rgrp(struct gfs2_jdesc *jd, struct buffer_head *bh_log, + u64 blkno) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_rgrp *jrgd = (struct gfs2_rgrp *)bh_log->b_data; + + rgd = gfs2_blk2rgrpd(sdp, blkno, false); + if (rgd && rgd->rd_addr == blkno && + rgd->rd_bits && rgd->rd_bits->bi_bh) { + fs_info(sdp, obsolete_rgrp_replay, (unsigned long long)blkno, + jd->jd_jid, bh_log->b_blocknr); + fs_info(sdp, obsolete_rgrp_replay2, + buffer_busy(rgd->rd_bits->bi_bh) ?
1 : 0, + buffer_pinned(rgd->rd_bits->bi_bh), + rgd->rd_igeneration, + be64_to_cpu(jrgd->rg_igeneration)); + gfs2_dump_glock(NULL, rgd->rd_gl, true); + } +} + +static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + u64 blkno; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_METADATA) + return 0; + + gfs2_replay_incr_blk(jd, &start); + + for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + + jd->jd_found_blocks++; + + if (gfs2_revoke_check(jd, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + if (gfs2_meta_check(sdp, bh_ip)) + error = -EIO; + else { + struct gfs2_meta_header *mh = + (struct gfs2_meta_header *)bh_ip->b_data; + + if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG)) + obsolete_rgrp(jd, bh_log, blkno); + + mark_buffer_dirty(bh_ip); + } + brelse(bh_log); + brelse(bh_ip); + + if (error) + break; + + jd->jd_replayed_blocks++; + } + + return error; +} + +static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_inode_metasync(ip->i_gl); + return; + } + if (pass != 1) + return; + + gfs2_inode_metasync(ip->i_gl); + + fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", + jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); +} + +static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct gfs2_meta_header *mh; + unsigned int offset; + struct list_head *head = &sdp->sd_log_revokes; + struct gfs2_bufdata *bd; + struct page *page; + unsigned int length; + + gfs2_flush_revokes(sdp); + if (!sdp->sd_log_num_revoke) + return; + + length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke); + page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke); + offset = sizeof(struct gfs2_log_descriptor); + + list_for_each_entry(bd, head, bd_list) { + sdp->sd_log_num_revoke--; + + if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { + gfs2_log_write_page(sdp, page); + page = mempool_alloc(gfs2_page_pool, GFP_NOIO); + mh = page_address(page); + clear_page(mh); + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); + mh->mh_type = cpu_to_be32(GFS2_METATYPE_LB); + mh->mh_format = cpu_to_be32(GFS2_FORMAT_LB); + offset = sizeof(struct gfs2_meta_header); + } + + *(__be64 *)(page_address(page) + offset) = cpu_to_be64(bd->bd_blkno); + offset += sizeof(u64); + } + gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); + + gfs2_log_write_page(sdp, page); +} + +void gfs2_drain_revokes(struct gfs2_sbd *sdp) +{ + struct list_head *head = &sdp->sd_log_revokes; + struct gfs2_bufdata *bd; + struct gfs2_glock *gl; + + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + gl = bd->bd_gl; + gfs2_glock_remove_revoke(gl); + kmem_cache_free(gfs2_bufdata_cachep, bd); + } +} + +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + gfs2_drain_revokes(sdp); +} + +static void revoke_lo_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host 
*head, int pass) +{ + if (pass != 0) + return; + + jd->jd_found_revokes = 0; + jd->jd_replay_tail = head->lh_tail; +} + +static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, + struct gfs2_log_descriptor *ld, __be64 *ptr, + int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + unsigned int blks = be32_to_cpu(ld->ld_length); + unsigned int revokes = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh; + unsigned int offset; + u64 blkno; + int first = 1; + int error; + + if (pass != 0 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_REVOKE) + return 0; + + offset = sizeof(struct gfs2_log_descriptor); + + for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) { + error = gfs2_replay_read_block(jd, start, &bh); + if (error) + return error; + + if (!first) + gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LB); + + while (offset + sizeof(u64) <= sdp->sd_sb.sb_bsize) { + blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); + + error = gfs2_revoke_add(jd, blkno, start); + if (error < 0) { + brelse(bh); + return error; + } + else if (error) + jd->jd_found_revokes++; + + if (!--revokes) + break; + offset += sizeof(u64); + } + + brelse(bh); + offset = sizeof(struct gfs2_meta_header); + first = 0; + } + + return 0; +} + +static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_revoke_clean(jd); + return; + } + if (pass != 1) + return; + + fs_info(sdp, "jid=%u: Found %u revoke tags\n", + jd->jd_jid, jd->jd_found_revokes); + + gfs2_revoke_clean(jd); +} + +/** + * databuf_lo_before_commit - Scan the data buffers, writing as we go + * @sdp: The filesystem + * @tr: The system transaction being flushed + */ + +static void databuf_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + unsigned int limit = databuf_limit(sdp); + unsigned int nbuf; + if (tr == NULL) + return; + nbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + gfs2_before_commit(sdp, limit, nbuf, &tr->tr_databuf, 1); +} + +static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + unsigned int blks = be32_to_cpu(ld->ld_data1); + struct buffer_head *bh_log, *bh_ip; + u64 blkno; + u64 esc; + int error = 0; + + if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA) + return 0; + + gfs2_replay_incr_blk(jd, &start); + for (; blks; gfs2_replay_incr_blk(jd, &start), blks--) { + blkno = be64_to_cpu(*ptr++); + esc = be64_to_cpu(*ptr++); + + jd->jd_found_blocks++; + + if (gfs2_revoke_check(jd, blkno, start)) + continue; + + error = gfs2_replay_read_block(jd, start, &bh_log); + if (error) + return error; + + bh_ip = gfs2_meta_new(gl, blkno); + memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size); + + /* Unescape */ + if (esc) { + __be32 *eptr = (__be32 *)bh_ip->b_data; + *eptr = cpu_to_be32(GFS2_MAGIC); + } + mark_buffer_dirty(bh_ip); + + brelse(bh_log); + brelse(bh_ip); + + jd->jd_replayed_blocks++; + } + + return error; +} + +/* FIXME: sort out accounting for log blocks etc. */ + +static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (error) { + gfs2_inode_metasync(ip->i_gl); + return; + } + if (pass != 1) + return; + + /* data sync? 
*/ + gfs2_inode_metasync(ip->i_gl); + + fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", + jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); +} + +static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + struct list_head *head; + struct gfs2_bufdata *bd; + + if (tr == NULL) + return; + + head = &tr->tr_databuf; + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, bd_list); + list_del_init(&bd->bd_list); + gfs2_unpin(sdp, bd->bd_bh, tr); + } +} + + +static const struct gfs2_log_operations gfs2_buf_lops = { + .lo_before_commit = buf_lo_before_commit, + .lo_after_commit = buf_lo_after_commit, + .lo_before_scan = buf_lo_before_scan, + .lo_scan_elements = buf_lo_scan_elements, + .lo_after_scan = buf_lo_after_scan, + .lo_name = "buf", +}; + +static const struct gfs2_log_operations gfs2_revoke_lops = { + .lo_before_commit = revoke_lo_before_commit, + .lo_after_commit = revoke_lo_after_commit, + .lo_before_scan = revoke_lo_before_scan, + .lo_scan_elements = revoke_lo_scan_elements, + .lo_after_scan = revoke_lo_after_scan, + .lo_name = "revoke", +}; + +static const struct gfs2_log_operations gfs2_databuf_lops = { + .lo_before_commit = databuf_lo_before_commit, + .lo_after_commit = databuf_lo_after_commit, + .lo_scan_elements = databuf_lo_scan_elements, + .lo_after_scan = databuf_lo_after_scan, + .lo_name = "databuf", +}; + +const struct gfs2_log_operations *gfs2_log_ops[] = { + &gfs2_databuf_lops, + &gfs2_buf_lops, + &gfs2_revoke_lops, + NULL, +}; + diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h new file mode 100644 index 0000000000..1412ffba1d --- /dev/null +++ b/fs/gfs2/lops.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 
+ */ + +#ifndef __LOPS_DOT_H__ +#define __LOPS_DOT_H__ + +#include <linux/list.h> +#include "incore.h" + +extern const struct gfs2_log_operations *gfs2_log_ops[]; +extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); +extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); +extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + struct page *page, unsigned size, unsigned offset, + u64 blkno); +extern void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf); +extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, bool keep_cache); +extern void gfs2_drain_revokes(struct gfs2_sbd *sdp); +static inline unsigned int buf_limit(struct gfs2_sbd *sdp) +{ + return sdp->sd_ldptrs; +} + +static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) +{ + return sdp->sd_ldptrs / 2; +} + +static inline void lops_before_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_commit) + gfs2_log_ops[x]->lo_before_commit(sdp, tr); +} + +static inline void lops_after_commit(struct gfs2_sbd *sdp, + struct gfs2_trans *tr) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_after_commit) + gfs2_log_ops[x]->lo_after_commit(sdp, tr); +} + +static inline void lops_before_scan(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_before_scan) + gfs2_log_ops[x]->lo_before_scan(jd, head, pass); +} + +static inline int lops_scan_elements(struct gfs2_jdesc *jd, u32 start, + struct gfs2_log_descriptor *ld, + __be64 *ptr, + unsigned int pass) +{ + int x, error; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_scan_elements) { + error = gfs2_log_ops[x]->lo_scan_elements(jd, start, + ld, ptr, pass); + if (error) + return error; + } + + return 0; +} + +static inline void lops_after_scan(struct gfs2_jdesc *jd, int error, + unsigned int pass) +{ + int x; + for (x = 0; gfs2_log_ops[x]; x++) + if (gfs2_log_ops[x]->lo_after_scan) + gfs2_log_ops[x]->lo_after_scan(jd, error, pass); +} + +#endif /* __LOPS_DOT_H__ */ + diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c new file mode 100644 index 0000000000..66eb98b690 --- /dev/null +++ b/fs/gfs2/main.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/gfs2_ondisk.h> +#include <linux/rcupdate.h> +#include <linux/rculist_bl.h> +#include <linux/atomic.h> +#include <linux/mempool.h> + +#include "gfs2.h" +#include "incore.h" +#include "super.h" +#include "sys.h" +#include "util.h" +#include "glock.h" +#include "quota.h" +#include "recovery.h" +#include "dir.h" +#include "glops.h" + +struct workqueue_struct *gfs2_control_wq; + +static void gfs2_init_inode_once(void *foo) +{ + struct gfs2_inode *ip = foo; + + inode_init_once(&ip->i_inode); + atomic_set(&ip->i_sizehint, 0); + init_rwsem(&ip->i_rw_mutex); + INIT_LIST_HEAD(&ip->i_ordered); + ip->i_qadata = NULL; + gfs2_holder_mark_uninitialized(&ip->i_rgd_gh); + memset(&ip->i_res, 0, sizeof(ip->i_res)); + RB_CLEAR_NODE(&ip->i_res.rs_node); + ip->i_hash_cache = NULL; + gfs2_holder_mark_uninitialized(&ip->i_iopen_gh); +} + +static void gfs2_init_glock_once(void *foo) +{ + struct gfs2_glock *gl = foo; + + spin_lock_init(&gl->gl_lockref.lock); + INIT_LIST_HEAD(&gl->gl_holders); + INIT_LIST_HEAD(&gl->gl_lru); + INIT_LIST_HEAD(&gl->gl_ail_list); + atomic_set(&gl->gl_ail_count, 0); + atomic_set(&gl->gl_revokes, 0); +} + +static void gfs2_init_gl_aspace_once(void *foo) +{ + struct gfs2_glock_aspace *gla = foo; + + gfs2_init_glock_once(&gla->glock); + address_space_init_once(&gla->mapping); +} + +/** + * init_gfs2_fs - Register GFS2 as a filesystem + * + * Returns: 0 on success, error code on failure + */ + +static int __init init_gfs2_fs(void) +{ + int error; + + gfs2_str2qstr(&gfs2_qdot, "."); + gfs2_str2qstr(&gfs2_qdotdot, ".."); + gfs2_quota_hash_init(); + + error = gfs2_sys_init(); + if (error) + return error; + + error = list_lru_init(&gfs2_qd_lru); + if (error) + goto fail_lru; + + error = gfs2_glock_init(); + if (error) + goto fail_glock; + + error = -ENOMEM; + gfs2_glock_cachep = kmem_cache_create("gfs2_glock", + sizeof(struct gfs2_glock), + 0, SLAB_RECLAIM_ACCOUNT, + gfs2_init_glock_once); + if (!gfs2_glock_cachep) + goto fail_cachep1; + + gfs2_glock_aspace_cachep = kmem_cache_create("gfs2_glock(aspace)", + sizeof(struct gfs2_glock_aspace), + 0, 0, gfs2_init_gl_aspace_once); + + if (!gfs2_glock_aspace_cachep) + goto fail_cachep2; + + gfs2_inode_cachep = kmem_cache_create("gfs2_inode", + sizeof(struct gfs2_inode), + 0, SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD| + SLAB_ACCOUNT, + gfs2_init_inode_once); + if (!gfs2_inode_cachep) + goto fail_cachep3; + + gfs2_bufdata_cachep = kmem_cache_create("gfs2_bufdata", + sizeof(struct gfs2_bufdata), + 0, 0, NULL); + if (!gfs2_bufdata_cachep) + goto fail_cachep4; + + gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd", + sizeof(struct gfs2_rgrpd), + 0, 0, NULL); + if (!gfs2_rgrpd_cachep) + goto fail_cachep5; + + gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", + sizeof(struct gfs2_quota_data), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (!gfs2_quotad_cachep) + goto fail_cachep6; + + gfs2_qadata_cachep = kmem_cache_create("gfs2_qadata", + sizeof(struct gfs2_qadata), + 0, 0, NULL); + if (!gfs2_qadata_cachep) + goto fail_cachep7; + + gfs2_trans_cachep = kmem_cache_create("gfs2_trans", + sizeof(struct gfs2_trans), + 0, 0, NULL); + if (!gfs2_trans_cachep) + goto fail_cachep8; + + error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd"); + if (error) + goto fail_shrinker; + + error = -ENOMEM; + gfs2_recovery_wq = 
alloc_workqueue("gfs2_recovery", + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0); + if (!gfs2_recovery_wq) + goto fail_wq1; + + gfs2_control_wq = alloc_workqueue("gfs2_control", + WQ_UNBOUND | WQ_FREEZABLE, 0); + if (!gfs2_control_wq) + goto fail_wq2; + + gfs2_freeze_wq = alloc_workqueue("gfs2_freeze", 0, 0); + + if (!gfs2_freeze_wq) + goto fail_wq3; + + gfs2_page_pool = mempool_create_page_pool(64, 0); + if (!gfs2_page_pool) + goto fail_mempool; + + gfs2_register_debugfs(); + error = register_filesystem(&gfs2_fs_type); + if (error) + goto fail_fs1; + + error = register_filesystem(&gfs2meta_fs_type); + if (error) + goto fail_fs2; + + + pr_info("GFS2 installed\n"); + + return 0; + +fail_fs2: + unregister_filesystem(&gfs2_fs_type); +fail_fs1: + mempool_destroy(gfs2_page_pool); +fail_mempool: + destroy_workqueue(gfs2_freeze_wq); +fail_wq3: + destroy_workqueue(gfs2_control_wq); +fail_wq2: + destroy_workqueue(gfs2_recovery_wq); +fail_wq1: + unregister_shrinker(&gfs2_qd_shrinker); +fail_shrinker: + kmem_cache_destroy(gfs2_trans_cachep); +fail_cachep8: + kmem_cache_destroy(gfs2_qadata_cachep); +fail_cachep7: + kmem_cache_destroy(gfs2_quotad_cachep); +fail_cachep6: + kmem_cache_destroy(gfs2_rgrpd_cachep); +fail_cachep5: + kmem_cache_destroy(gfs2_bufdata_cachep); +fail_cachep4: + kmem_cache_destroy(gfs2_inode_cachep); +fail_cachep3: + kmem_cache_destroy(gfs2_glock_aspace_cachep); +fail_cachep2: + kmem_cache_destroy(gfs2_glock_cachep); +fail_cachep1: + gfs2_glock_exit(); +fail_glock: + list_lru_destroy(&gfs2_qd_lru); +fail_lru: + gfs2_sys_uninit(); + return error; +} + +/** + * exit_gfs2_fs - Unregister the file system + * + */ + +static void __exit exit_gfs2_fs(void) +{ + unregister_shrinker(&gfs2_qd_shrinker); + gfs2_glock_exit(); + gfs2_unregister_debugfs(); + unregister_filesystem(&gfs2_fs_type); + unregister_filesystem(&gfs2meta_fs_type); + destroy_workqueue(gfs2_recovery_wq); + destroy_workqueue(gfs2_control_wq); + destroy_workqueue(gfs2_freeze_wq); + list_lru_destroy(&gfs2_qd_lru); + + rcu_barrier(); + + mempool_destroy(gfs2_page_pool); + kmem_cache_destroy(gfs2_trans_cachep); + kmem_cache_destroy(gfs2_qadata_cachep); + kmem_cache_destroy(gfs2_quotad_cachep); + kmem_cache_destroy(gfs2_rgrpd_cachep); + kmem_cache_destroy(gfs2_bufdata_cachep); + kmem_cache_destroy(gfs2_inode_cachep); + kmem_cache_destroy(gfs2_glock_aspace_cachep); + kmem_cache_destroy(gfs2_glock_cachep); + + gfs2_sys_uninit(); +} + +MODULE_DESCRIPTION("Global File System"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +module_init(init_gfs2_fs); +module_exit(exit_gfs2_fs); + diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c new file mode 100644 index 0000000000..924361fa51 --- /dev/null +++ b/fs/gfs2/meta_io.c @@ -0,0 +1,554 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 
+ */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/swap.h> +#include <linux/delay.h> +#include <linux/bio.h> +#include <linux/gfs2_ondisk.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "rgrp.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) +{ + struct buffer_head *bh, *head; + int nr_underway = 0; + blk_opf_t write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc); + + BUG_ON(!PageLocked(page)); + BUG_ON(!page_has_buffers(page)); + + head = page_buffers(page); + bh = head; + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from flusher thread and kswapd + * activity, but those code paths have their own higher-level + * throttling. + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(REQ_OP_WRITE | write_flags, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + + if (nr_underway == 0) + end_page_writeback(page); + + return 0; +} + +const struct address_space_operations gfs2_meta_aops = { + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, + .writepage = gfs2_aspace_writepage, + .release_folio = gfs2_release_folio, +}; + +const struct address_space_operations gfs2_rgrp_aops = { + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, + .writepage = gfs2_aspace_writepage, + .release_folio = gfs2_release_folio, +}; + +/** + * gfs2_getbuf - Get a buffer with a given address space + * @gl: the glock + * @blkno: the block number (filesystem scope) + * @create: 1 if the buffer should be created + * + * Returns: the buffer + */ + +struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) +{ + struct address_space *mapping = gfs2_glock2aspace(gl); + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct page *page; + struct buffer_head *bh; + unsigned int shift; + unsigned long index; + unsigned int bufnum; + + if (mapping == NULL) + mapping = &sdp->sd_aspace; + + shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; + index = blkno >> shift; /* convert block to page */ + bufnum = blkno - (index << shift); /* block buf index within page */ + + if (create) { + for (;;) { + page = grab_cache_page(mapping, index); + if (page) + break; + yield(); + } + if (!page_has_buffers(page)) + create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); + } else { + page = find_get_page_flags(mapping, index, + FGP_LOCK|FGP_ACCESSED); + if (!page) + return NULL; + if (!page_has_buffers(page)) { + bh = NULL; + goto 
out_unlock; + } + } + + /* Locate header for our buffer within our page */ + for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) + /* Do nothing */; + get_bh(bh); + + if (!buffer_mapped(bh)) + map_bh(bh, sdp->sd_vfs, blkno); + +out_unlock: + unlock_page(page); + put_page(page); + + return bh; +} + +static void meta_prep_new(struct buffer_head *bh) +{ + struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data; + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + unlock_buffer(bh); + + mh->mh_magic = cpu_to_be32(GFS2_MAGIC); +} + +/** + * gfs2_meta_new - Get a block + * @gl: The glock associated with this block + * @blkno: The block number + * + * Returns: The buffer + */ + +struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) +{ + struct buffer_head *bh; + bh = gfs2_getbuf(gl, blkno, CREATE); + meta_prep_new(bh); + return bh; +} + +static void gfs2_meta_read_endio(struct bio *bio) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + struct buffer_head *bh = page_buffers(page); + unsigned int len = bvec->bv_len; + + while (bh_offset(bh) < bvec->bv_offset) + bh = bh->b_this_page; + do { + struct buffer_head *next = bh->b_this_page; + len -= bh->b_size; + bh->b_end_io(bh, !bio->bi_status); + bh = next; + } while (bh && len); + } + bio_put(bio); +} + +/* + * Submit several consecutive buffer head I/O requests as a single bio I/O + * request. (See submit_bh_wbc.) + */ +static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num) +{ + while (num > 0) { + struct buffer_head *bh = *bhs; + struct bio *bio; + + bio = bio_alloc(bh->b_bdev, num, opf, GFP_NOIO); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + while (num > 0) { + bh = *bhs; + if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { + BUG_ON(bio->bi_iter.bi_size == 0); + break; + } + bhs++; + num--; + } + bio->bi_end_io = gfs2_meta_read_endio; + submit_bio(bio); + } +} + +/** + * gfs2_meta_read - Read a block from disk + * @gl: The glock covering the block + * @blkno: The block number + * @flags: flags + * @rahead: Do read-ahead + * @bhp: the place where the buffer is returned (NULL on failure) + * + * Returns: errno + */ + +int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + int rahead, struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct buffer_head *bh, *bhs[2]; + int num = 0; + + if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) { + *bhp = NULL; + return -EIO; + } + + *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + flags &= ~DIO_WAIT; + } else { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + bhs[num++] = bh; + } + + if (rahead) { + bh = gfs2_getbuf(gl, blkno + 1, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + } else { + bh->b_end_io = end_buffer_read_sync; + bhs[num++] = bh; + } + } + + gfs2_submit_bhs(REQ_OP_READ | REQ_META | REQ_PRIO, bhs, num); + if (!(flags & DIO_WAIT)) + return 0; + + bh = *bhp; + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) { + struct gfs2_trans *tr = current->journal_info; + if (tr && test_bit(TR_TOUCHED, &tr->tr_flags)) + gfs2_io_error_bh_wd(sdp, bh); + brelse(bh); + *bhp = NULL; + return -EIO; + } + + return 0; +} + +/** + * gfs2_meta_wait - Reread a block from disk + * @sdp: the filesystem + * @bh: The block to wait 
for + * + * Returns: errno + */ + +int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) +{ + if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) + return -EIO; + + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) { + struct gfs2_trans *tr = current->journal_info; + if (tr && test_bit(TR_TOUCHED, &tr->tr_flags)) + gfs2_io_error_bh_wd(sdp, bh); + return -EIO; + } + if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) + return -EIO; + + return 0; +} + +void gfs2_remove_from_journal(struct buffer_head *bh, int meta) +{ + struct address_space *mapping = bh->b_folio->mapping; + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + struct gfs2_bufdata *bd = bh->b_private; + struct gfs2_trans *tr = current->journal_info; + int was_pinned = 0; + + if (test_clear_buffer_pinned(bh)) { + trace_gfs2_pin(bd, 0); + atomic_dec(&sdp->sd_log_pinned); + list_del_init(&bd->bd_list); + if (meta == REMOVE_META) + tr->tr_num_buf_rm++; + else + tr->tr_num_databuf_rm++; + set_bit(TR_TOUCHED, &tr->tr_flags); + was_pinned = 1; + brelse(bh); + } + if (bd) { + if (bd->bd_tr) { + gfs2_trans_add_revoke(sdp, bd); + } else if (was_pinned) { + bh->b_private = NULL; + kmem_cache_free(gfs2_bufdata_cachep, bd); + } else if (!list_empty(&bd->bd_ail_st_list) && + !list_empty(&bd->bd_ail_gl_list)) { + gfs2_remove_from_ail(bd); + } + } + clear_buffer_dirty(bh); + clear_buffer_uptodate(bh); +} + +/** + * gfs2_ail1_wipe - remove deleted/freed buffers from the ail1 list + * @sdp: superblock + * @bstart: starting block address of buffers to remove + * @blen: length of buffers to be removed + * + * This function is called from gfs2_journal_wipe(), whose job is to remove + * buffers corresponding to deleted blocks from the journal. If we find any + * bufdata elements on the system ail1 list, they haven't been written to + * the journal yet. So we remove them.
+ */ +static void gfs2_ail1_wipe(struct gfs2_sbd *sdp, u64 bstart, u32 blen) +{ + struct gfs2_trans *tr, *s; + struct gfs2_bufdata *bd, *bs; + struct buffer_head *bh; + u64 end = bstart + blen; + + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + list_for_each_entry_safe(tr, s, &sdp->sd_ail1_list, tr_list) { + list_for_each_entry_safe(bd, bs, &tr->tr_ail1_list, + bd_ail_st_list) { + bh = bd->bd_bh; + if (bh->b_blocknr < bstart || bh->b_blocknr >= end) + continue; + + gfs2_remove_from_journal(bh, REMOVE_JDATA); + } + } + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); +} + +static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno) +{ + struct address_space *mapping = ip->i_inode.i_mapping; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct page *page; + struct buffer_head *bh; + unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; + unsigned long index = blkno >> shift; /* convert block to page */ + unsigned int bufnum = blkno - (index << shift); + + page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED); + if (!page) + return NULL; + if (!page_has_buffers(page)) { + unlock_page(page); + put_page(page); + return NULL; + } + /* Locate header for our buffer within our page */ + for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) + /* Do nothing */; + get_bh(bh); + unlock_page(page); + put_page(page); + return bh; +} + +/** + * gfs2_journal_wipe - make inode's buffers so they aren't dirty/pinned anymore + * @ip: the inode who owns the buffers + * @bstart: the first buffer in the run + * @blen: the number of buffers in the run + * + */ + +void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *bh; + int ty; + + if (!ip->i_gl) { + /* This can only happen during incomplete inode creation. 
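In that case the inode never acquired a glock, so its buffers presumably never entered the journal.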
*/ + BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + return; + } + + gfs2_ail1_wipe(sdp, bstart, blen); + while (blen) { + ty = REMOVE_META; + bh = gfs2_getbuf(ip->i_gl, bstart, NO_CREATE); + if (!bh && gfs2_is_jdata(ip)) { + bh = gfs2_getjdatabuf(ip, bstart); + ty = REMOVE_JDATA; + } + if (bh) { + lock_buffer(bh); + gfs2_log_lock(sdp); + spin_lock(&sdp->sd_ail_lock); + gfs2_remove_from_journal(bh, ty); + spin_unlock(&sdp->sd_ail_lock); + gfs2_log_unlock(sdp); + unlock_buffer(bh); + brelse(bh); + } + + bstart++; + blen--; + } +} + +/** + * gfs2_meta_buffer - Get a metadata buffer + * @ip: The GFS2 inode + * @mtype: The block type (GFS2_METATYPE_*) + * @num: The block number (device relative) of the buffer + * @bhp: the buffer is returned here + * + * Returns: errno + */ + +int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, + struct buffer_head **bhp) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_glock *gl = ip->i_gl; + struct buffer_head *bh; + int ret = 0; + int rahead = 0; + + if (num == ip->i_no_addr) + rahead = ip->i_rahead; + + ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh); + if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { + brelse(bh); + ret = -EIO; + } else { + *bhp = bh; + } + return ret; +} + +/** + * gfs2_meta_ra - start readahead on an extent of a file + * @gl: the glock the blocks belong to + * @dblock: the starting disk block + * @extlen: the number of blocks in the extent + * + * returns: the first buffer in the extent + */ + +struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct buffer_head *first_bh, *bh; + u32 max_ra = gfs2_tune_get(sdp, gt_max_readahead) >> + sdp->sd_sb.sb_bsize_shift; + + BUG_ON(!extlen); + + if (max_ra < 1) + max_ra = 1; + if (extlen > max_ra) + extlen = max_ra; + + first_bh = gfs2_getbuf(gl, dblock, CREATE); + + if (buffer_uptodate(first_bh)) + goto out; + bh_read_nowait(first_bh, REQ_META | REQ_PRIO); + + dblock++; + extlen--; + + while (extlen) { + bh = gfs2_getbuf(gl, dblock, CREATE); + + bh_readahead(bh, REQ_RAHEAD | REQ_META | REQ_PRIO); + brelse(bh); + dblock++; + extlen--; + if (!buffer_locked(first_bh) && buffer_uptodate(first_bh)) + goto out; + } + + wait_on_buffer(first_bh); +out: + return first_bh; +} + diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h new file mode 100644 index 0000000000..d0a58cdd43 --- /dev/null +++ b/fs/gfs2/meta_io.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#ifndef __DIO_DOT_H__ +#define __DIO_DOT_H__ + +#include <linux/buffer_head.h> +#include <linux/string.h> +#include "incore.h" + +static inline void gfs2_buffer_clear(struct buffer_head *bh) +{ + memset(bh->b_data, 0, bh->b_size); +} + +static inline void gfs2_buffer_clear_tail(struct buffer_head *bh, int head) +{ + BUG_ON(head > bh->b_size); + memset(bh->b_data + head, 0, bh->b_size - head); +} + +static inline void gfs2_buffer_copy_tail(struct buffer_head *to_bh, + int to_head, + struct buffer_head *from_bh, + int from_head) +{ + BUG_ON(from_head < to_head); + memcpy(to_bh->b_data + to_head, from_bh->b_data + from_head, + from_bh->b_size - from_head); + memset(to_bh->b_data + to_bh->b_size + to_head - from_head, + 0, from_head - to_head); +} + +extern const struct address_space_operations gfs2_meta_aops; +extern const struct address_space_operations gfs2_rgrp_aops; + +static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + if (mapping->a_ops == &gfs2_meta_aops) { + struct gfs2_glock_aspace *gla = + container_of(mapping, struct gfs2_glock_aspace, mapping); + return gla->glock.gl_name.ln_sbd; + } else if (mapping->a_ops == &gfs2_rgrp_aops) + return container_of(mapping, struct gfs2_sbd, sd_aspace); + else + return inode->i_sb->s_fs_info; +} + +extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); +extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, + int rahead, struct buffer_head **bhp); +extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); +extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, + int create); +enum { + REMOVE_JDATA = 0, + REMOVE_META = 1, +}; + +extern void gfs2_remove_from_journal(struct buffer_head *bh, int meta); +extern void gfs2_journal_wipe(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern int gfs2_meta_buffer(struct gfs2_inode *ip, u32 mtype, u64 num, + struct buffer_head **bhp); + +static inline int gfs2_meta_inode_buffer(struct gfs2_inode *ip, + struct buffer_head **bhp) +{ + return gfs2_meta_buffer(ip, GFS2_METATYPE_DI, ip->i_no_addr, bhp); +} + +struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen); + +#define buffer_busy(bh) \ +((bh)->b_state & ((1ul << BH_Dirty) | (1ul << BH_Lock) | (1ul << BH_Pinned))) + +#endif /* __DIO_DOT_H__ */ + diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c new file mode 100644 index 0000000000..dd64140ae6 --- /dev/null +++ b/fs/gfs2/ops_fstype.c @@ -0,0 +1,1828 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/kthread.h> +#include <linux/export.h> +#include <linux/namei.h> +#include <linux/mount.h> +#include <linux/gfs2_ondisk.h> +#include <linux/quotaops.h> +#include <linux/lockdep.h> +#include <linux/module.h> +#include <linux/backing-dev.h> +#include <linux/fs_parser.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "sys.h" +#include "util.h" +#include "log.h" +#include "quota.h" +#include "dir.h" +#include "meta_io.h" +#include "trace_gfs2.h" +#include "lops.h" + +#define DO 0 +#define UNDO 1 + +/** + * gfs2_tune_init - Fill a gfs2_tune structure with default values + * @gt: tune + * + */ + +static void gfs2_tune_init(struct gfs2_tune *gt) +{ + spin_lock_init(>->gt_spin); + + gt->gt_quota_warn_period = 10; + gt->gt_quota_scale_num = 1; + gt->gt_quota_scale_den = 1; + gt->gt_new_files_jdata = 0; + gt->gt_max_readahead = BIT(18); + gt->gt_complain_secs = 10; +} + +void free_sbd(struct gfs2_sbd *sdp) +{ + if (sdp->sd_lkstats) + free_percpu(sdp->sd_lkstats); + kfree(sdp); +} + +static struct gfs2_sbd *init_sbd(struct super_block *sb) +{ + struct gfs2_sbd *sdp; + struct address_space *mapping; + + sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); + if (!sdp) + return NULL; + + sdp->sd_vfs = sb; + sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats); + if (!sdp->sd_lkstats) + goto fail; + sb->s_fs_info = sdp; + + set_bit(SDF_NOJOURNALID, &sdp->sd_flags); + gfs2_tune_init(&sdp->sd_tune); + + init_waitqueue_head(&sdp->sd_kill_wait); + init_waitqueue_head(&sdp->sd_async_glock_wait); + atomic_set(&sdp->sd_glock_disposal, 0); + init_completion(&sdp->sd_locking_init); + init_completion(&sdp->sd_wdack); + spin_lock_init(&sdp->sd_statfs_spin); + + spin_lock_init(&sdp->sd_rindex_spin); + sdp->sd_rindex_tree.rb_node = NULL; + + INIT_LIST_HEAD(&sdp->sd_jindex_list); + spin_lock_init(&sdp->sd_jindex_spin); + mutex_init(&sdp->sd_jindex_mutex); + init_completion(&sdp->sd_journal_ready); + + INIT_LIST_HEAD(&sdp->sd_quota_list); + mutex_init(&sdp->sd_quota_mutex); + mutex_init(&sdp->sd_quota_sync_mutex); + init_waitqueue_head(&sdp->sd_quota_wait); + spin_lock_init(&sdp->sd_bitmap_lock); + + INIT_LIST_HEAD(&sdp->sd_sc_inodes_list); + + mapping = &sdp->sd_aspace; + + address_space_init_once(mapping); + mapping->a_ops = &gfs2_rgrp_aops; + mapping->host = sb->s_bdev->bd_inode; + mapping->flags = 0; + mapping_set_gfp_mask(mapping, GFP_NOFS); + mapping->private_data = NULL; + mapping->writeback_index = 0; + + spin_lock_init(&sdp->sd_log_lock); + atomic_set(&sdp->sd_log_pinned, 0); + INIT_LIST_HEAD(&sdp->sd_log_revokes); + INIT_LIST_HEAD(&sdp->sd_log_ordered); + spin_lock_init(&sdp->sd_ordered_lock); + + init_waitqueue_head(&sdp->sd_log_waitq); + init_waitqueue_head(&sdp->sd_logd_waitq); + spin_lock_init(&sdp->sd_ail_lock); + INIT_LIST_HEAD(&sdp->sd_ail1_list); + INIT_LIST_HEAD(&sdp->sd_ail2_list); + + init_rwsem(&sdp->sd_log_flush_lock); + atomic_set(&sdp->sd_log_in_flight, 0); + init_waitqueue_head(&sdp->sd_log_flush_wait); + mutex_init(&sdp->sd_freeze_mutex); + + return sdp; + +fail: + free_sbd(sdp); + return NULL; +} + +/** + * gfs2_check_sb - Check superblock + * @sdp: the filesystem + * @silent: Don't print a message if the 
check fails
+ *
+ * Checks that the version code of the FS is one that we understand how to
+ * read and that the sizes of the various on-disk structures have not
+ * changed.
+ */
+
+static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
+{
+	struct gfs2_sb_host *sb = &sdp->sd_sb;
+
+	if (sb->sb_magic != GFS2_MAGIC ||
+	    sb->sb_type != GFS2_METATYPE_SB) {
+		if (!silent)
+			pr_warn("not a GFS2 filesystem\n");
+		return -EINVAL;
+	}
+
+	if (sb->sb_fs_format < GFS2_FS_FORMAT_MIN ||
+	    sb->sb_fs_format > GFS2_FS_FORMAT_MAX ||
+	    sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
+		fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
+		return -EINVAL;
+	}
+
+	if (sb->sb_bsize < 512 || sb->sb_bsize > PAGE_SIZE ||
+	    (sb->sb_bsize & (sb->sb_bsize - 1))) {
+		pr_warn("Invalid block size\n");
+		return -EINVAL;
+	}
+	if (sb->sb_bsize_shift != ffs(sb->sb_bsize) - 1) {
+		pr_warn("Invalid block size shift\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static void end_bio_io_page(struct bio *bio)
+{
+	struct page *page = bio->bi_private;
+
+	if (!bio->bi_status)
+		SetPageUptodate(page);
+	else
+		pr_warn("error %d reading superblock\n", bio->bi_status);
+	unlock_page(page);
+}
+
+static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf)
+{
+	struct gfs2_sb_host *sb = &sdp->sd_sb;
+	struct super_block *s = sdp->sd_vfs;
+	const struct gfs2_sb *str = buf;
+
+	sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic);
+	sb->sb_type = be32_to_cpu(str->sb_header.mh_type);
+	sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
+	sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
+	sb->sb_bsize = be32_to_cpu(str->sb_bsize);
+	sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);
+	sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr);
+	sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino);
+	sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr);
+	sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino);
+
+	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
+	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
+	memcpy(&s->s_uuid, str->sb_uuid, 16);
+}
+
+/**
+ * gfs2_read_super - Read the gfs2 super block from disk
+ * @sdp: The GFS2 super block
+ * @sector: The location of the super block
+ * @silent: Don't print a message if the check fails
+ *
+ * This uses the bio functions to read the super block from disk
+ * because we want to be 100% sure that we never read cached data.
+ * A super block is read only twice during each GFS2 mount and is
+ * never written to by the filesystem. The first time it's read, no
+ * locks are held, and the only details which are looked at are those
+ * relating to the locking protocol. Once locking is up and working,
+ * the sb is read again under the lock to establish the location of
+ * the master directory (contains pointers to journals etc) and the
+ * root directory.
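+ *
+ * As a rough worked example (assuming the usual GFS2_SB_ADDR of 128
+ * basic 512-byte blocks): with a 4KiB block size, sd_fsb2bb_shift is
+ * 12 - 9 = 3, so callers pass sector 128 >> 3 = 16, and the bio below
+ * starts at sector 16 * (4096 >> 9) = 128, i.e. 64KiB into the device.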
+ * + * Returns: 0 on success or error + */ + +static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_sb *p; + struct page *page; + struct bio *bio; + + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) + return -ENOMEM; + + ClearPageUptodate(page); + ClearPageDirty(page); + lock_page(page); + + bio = bio_alloc(sb->s_bdev, 1, REQ_OP_READ | REQ_META, GFP_NOFS); + bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); + __bio_add_page(bio, page, PAGE_SIZE, 0); + + bio->bi_end_io = end_bio_io_page; + bio->bi_private = page; + submit_bio(bio); + wait_on_page_locked(page); + bio_put(bio); + if (!PageUptodate(page)) { + __free_page(page); + return -EIO; + } + p = kmap(page); + gfs2_sb_in(sdp, p); + kunmap(page); + __free_page(page); + return gfs2_check_sb(sdp, silent); +} + +/** + * gfs2_read_sb - Read super block + * @sdp: The GFS2 superblock + * @silent: Don't print message if mount fails + * + */ + +static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) +{ + u32 hash_blocks, ind_blocks, leaf_blocks; + u32 tmp_blocks; + unsigned int x; + int error; + + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent); + if (error) { + if (!silent) + fs_err(sdp, "can't read superblock\n"); + return error; + } + + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode)) / sizeof(u64); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_ldptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_log_descriptor)) / sizeof(u64); + sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); + sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / + sizeof(struct gfs2_quota_change); + sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) + * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */ + + /* + * We always keep at least one block reserved for revokes in + * transactions. This greatly simplifies allocating additional + * revoke blocks. 
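+	 *
+	 * (A sketch of the arithmetic: the reserved block is a log
+	 * descriptor holding sd_ldptrs = (sb_bsize -
+	 * sizeof(struct gfs2_log_descriptor)) / sizeof(u64) revoke entries,
+	 * so at least that many revokes can always be issued without first
+	 * allocating more journal blocks.)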
+	 */
+	atomic_set(&sdp->sd_log_revokes_available, sdp->sd_ldptrs);
+
+	/* Compute maximum reservation required to add an entry to a directory */
+
+	hash_blocks = DIV_ROUND_UP(sizeof(u64) * BIT(GFS2_DIR_MAX_DEPTH),
+				   sdp->sd_jbsize);
+
+	ind_blocks = 0;
+	for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) {
+		tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs);
+		ind_blocks += tmp_blocks;
+	}
+
+	leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH;
+
+	sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks;
+
+	sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize -
+				sizeof(struct gfs2_dinode);
+	sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs;
+	for (x = 2;; x++) {
+		u64 space, d;
+		u32 m;
+
+		space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs;
+		d = space;
+		m = do_div(d, sdp->sd_inptrs);
+
+		if (d != sdp->sd_heightsize[x - 1] || m)
+			break;
+		sdp->sd_heightsize[x] = space;
+	}
+	sdp->sd_max_height = x;
+	sdp->sd_heightsize[x] = ~0;
+	gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
+
+	sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
+				      sizeof(struct gfs2_leaf)) /
+				     GFS2_MIN_DIRENT_SIZE;
+	return 0;
+}
+
+static int init_names(struct gfs2_sbd *sdp, int silent)
+{
+	char *proto, *table;
+	int error = 0;
+
+	proto = sdp->sd_args.ar_lockproto;
+	table = sdp->sd_args.ar_locktable;
+
+	/* Try to autodetect */
+
+	if (!proto[0] || !table[0]) {
+		error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift, silent);
+		if (error)
+			return error;
+
+		if (!proto[0])
+			proto = sdp->sd_sb.sb_lockproto;
+		if (!table[0])
+			table = sdp->sd_sb.sb_locktable;
+	}
+
+	if (!table[0])
+		table = sdp->sd_vfs->s_id;
+
+	BUILD_BUG_ON(GFS2_LOCKNAME_LEN > GFS2_FSNAME_LEN);
+
+	strscpy(sdp->sd_proto_name, proto, GFS2_LOCKNAME_LEN);
+	strscpy(sdp->sd_table_name, table, GFS2_LOCKNAME_LEN);
+
+	table = sdp->sd_table_name;
+	while ((table = strchr(table, '/')))
+		*table = '_';
+
+	return error;
+}
+
+static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
+			int undo)
+{
+	int error = 0;
+
+	if (undo)
+		goto fail_trans;
+
+	error = gfs2_glock_nq_num(sdp,
+				  GFS2_MOUNT_LOCK, &gfs2_nondisk_glops,
+				  LM_ST_EXCLUSIVE,
+				  LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID,
+				  mount_gh);
+	if (error) {
+		fs_err(sdp, "can't acquire mount glock: %d\n", error);
+		goto fail;
+	}
+
+	error = gfs2_glock_nq_num(sdp,
+				  GFS2_LIVE_LOCK, &gfs2_nondisk_glops,
+				  LM_ST_SHARED,
+				  LM_FLAG_NOEXP | GL_EXACT | GL_NOPID,
+				  &sdp->sd_live_gh);
+	if (error) {
+		fs_err(sdp, "can't acquire live glock: %d\n", error);
+		goto fail_mount;
+	}
+
+	error = gfs2_glock_get(sdp, GFS2_RENAME_LOCK, &gfs2_nondisk_glops,
+			       CREATE, &sdp->sd_rename_gl);
+	if (error) {
+		fs_err(sdp, "can't create rename glock: %d\n", error);
+		goto fail_live;
+	}
+
+	error = gfs2_glock_get(sdp, GFS2_FREEZE_LOCK, &gfs2_freeze_glops,
+			       CREATE, &sdp->sd_freeze_gl);
+	if (error) {
+		fs_err(sdp, "can't create freeze glock: %d\n", error);
+		goto fail_rename;
+	}
+
+	return 0;
+
+fail_trans:
+	gfs2_glock_put(sdp->sd_freeze_gl);
+fail_rename:
+	gfs2_glock_put(sdp->sd_rename_gl);
+fail_live:
+	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+fail_mount:
+	gfs2_glock_dq_uninit(mount_gh);
+fail:
+	return error;
+}
+
+static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr,
+			    u64 no_addr, const char *name)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct dentry *dentry;
+	struct inode *inode;
+
+	inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0,
+				  GFS2_BLKST_FREE /* ignore */);
+	if (IS_ERR(inode)) {
+		fs_err(sdp, "can't read in %s inode: %ld\n", name,
PTR_ERR(inode)); + return PTR_ERR(inode); + } + dentry = d_make_root(inode); + if (!dentry) { + fs_err(sdp, "can't alloc %s dentry\n", name); + return -ENOMEM; + } + *dptr = dentry; + return 0; +} + +static int init_sb(struct gfs2_sbd *sdp, int silent) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_holder sb_gh; + u64 no_addr; + int ret; + + ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, + LM_ST_SHARED, 0, &sb_gh); + if (ret) { + fs_err(sdp, "can't acquire superblock glock: %d\n", ret); + return ret; + } + + ret = gfs2_read_sb(sdp, silent); + if (ret) { + fs_err(sdp, "can't read superblock: %d\n", ret); + goto out; + } + + switch(sdp->sd_sb.sb_fs_format) { + case GFS2_FS_FORMAT_MAX: + sb->s_xattr = gfs2_xattr_handlers_max; + break; + + case GFS2_FS_FORMAT_MIN: + sb->s_xattr = gfs2_xattr_handlers_min; + break; + + default: + BUG(); + } + + /* Set up the buffer cache and SB for real */ + if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { + ret = -EINVAL; + fs_err(sdp, "FS block size (%u) is too small for device " + "block size (%u)\n", + sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev)); + goto out; + } + if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { + ret = -EINVAL; + fs_err(sdp, "FS block size (%u) is too big for machine " + "page size (%u)\n", + sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); + goto out; + } + sb_set_blocksize(sb, sdp->sd_sb.sb_bsize); + + /* Get the root inode */ + no_addr = sdp->sd_sb.sb_root_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root"); + if (ret) + goto out; + + /* Get the master inode */ + no_addr = sdp->sd_sb.sb_master_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master"); + if (ret) { + dput(sdp->sd_root_dir); + goto out; + } + sb->s_root = dget(sdp->sd_args.ar_meta ? 
sdp->sd_master_dir : sdp->sd_root_dir); +out: + gfs2_glock_dq_uninit(&sb_gh); + return ret; +} + +static void gfs2_others_may_mount(struct gfs2_sbd *sdp) +{ + char *message = "FIRSTMOUNT=Done"; + char *envp[] = { message, NULL }; + + fs_info(sdp, "first mount done, others may mount\n"); + + if (sdp->sd_lockstruct.ls_ops->lm_first_done) + sdp->sd_lockstruct.ls_ops->lm_first_done(sdp); + + kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); +} + +/** + * gfs2_jindex_hold - Grab a lock on the jindex + * @sdp: The GFS2 superblock + * @ji_gh: the holder for the jindex glock + * + * Returns: errno + */ + +static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) +{ + struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); + struct qstr name; + char buf[20]; + struct gfs2_jdesc *jd; + int error; + + name.name = buf; + + mutex_lock(&sdp->sd_jindex_mutex); + + for (;;) { + struct gfs2_inode *jip; + + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); + if (error) + break; + + name.len = sprintf(buf, "journal%u", sdp->sd_journals); + name.hash = gfs2_disk_hash(name.name, name.len); + + error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); + if (error == -ENOENT) { + error = 0; + break; + } + + gfs2_glock_dq_uninit(ji_gh); + + if (error) + break; + + error = -ENOMEM; + jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); + if (!jd) + break; + + INIT_LIST_HEAD(&jd->extent_list); + INIT_LIST_HEAD(&jd->jd_revoke_list); + + INIT_WORK(&jd->jd_work, gfs2_recover_func); + jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); + if (IS_ERR_OR_NULL(jd->jd_inode)) { + if (!jd->jd_inode) + error = -ENOENT; + else + error = PTR_ERR(jd->jd_inode); + kfree(jd); + break; + } + + d_mark_dontcache(jd->jd_inode); + spin_lock(&sdp->sd_jindex_spin); + jd->jd_jid = sdp->sd_journals++; + jip = GFS2_I(jd->jd_inode); + jd->jd_no_addr = jip->i_no_addr; + list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); + spin_unlock(&sdp->sd_jindex_spin); + } + + mutex_unlock(&sdp->sd_jindex_mutex); + + return error; +} + +/** + * init_statfs - look up and initialize master and local (per node) statfs inodes + * @sdp: The GFS2 superblock + * + * This should be called after the jindex is initialized in init_journal() and + * before gfs2_journal_recovery() is called because we need to be able to write + * to these inodes during recovery. + * + * Returns: errno + */ +static int init_statfs(struct gfs2_sbd *sdp) +{ + int error = 0; + struct inode *master = d_inode(sdp->sd_master_dir); + struct inode *pn = NULL; + char buf[30]; + struct gfs2_jdesc *jd; + struct gfs2_inode *ip; + + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); + if (IS_ERR(sdp->sd_statfs_inode)) { + error = PTR_ERR(sdp->sd_statfs_inode); + fs_err(sdp, "can't read in statfs inode: %d\n", error); + goto out; + } + if (sdp->sd_args.ar_spectator) + goto out; + + pn = gfs2_lookup_simple(master, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + goto put_statfs; + } + + /* For each jid, lookup the corresponding local statfs inode in the + * per_node metafs directory and save it in the sdp->sd_sc_inodes_list. 
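Each entry pairs a journal id (si_jid) with the matching statfs_changeN inode (si_sc_inode), so the local statfs file for journal N can later be found by walking this list.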
*/ + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + struct local_statfs_inode *lsi = + kmalloc(sizeof(struct local_statfs_inode), GFP_NOFS); + if (!lsi) { + error = -ENOMEM; + goto free_local; + } + sprintf(buf, "statfs_change%u", jd->jd_jid); + lsi->si_sc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(lsi->si_sc_inode)) { + error = PTR_ERR(lsi->si_sc_inode); + fs_err(sdp, "can't find local \"sc\" file#%u: %d\n", + jd->jd_jid, error); + kfree(lsi); + goto free_local; + } + lsi->si_jid = jd->jd_jid; + if (jd->jd_jid == sdp->sd_jdesc->jd_jid) + sdp->sd_sc_inode = lsi->si_sc_inode; + + list_add_tail(&lsi->si_list, &sdp->sd_sc_inodes_list); + } + + iput(pn); + pn = NULL; + ip = GFS2_I(sdp->sd_sc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID, + &sdp->sd_sc_gh); + if (error) { + fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); + goto free_local; + } + /* read in the local statfs buffer - other nodes don't change it. */ + error = gfs2_meta_inode_buffer(ip, &sdp->sd_sc_bh); + if (error) { + fs_err(sdp, "Cannot read in local statfs: %d\n", error); + goto unlock_sd_gh; + } + return 0; + +unlock_sd_gh: + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); +free_local: + free_local_statfs_inodes(sdp); + iput(pn); +put_statfs: + iput(sdp->sd_statfs_inode); +out: + return error; +} + +/* Uninitialize and free up memory used by the list of statfs inodes */ +static void uninit_statfs(struct gfs2_sbd *sdp) +{ + if (!sdp->sd_args.ar_spectator) { + brelse(sdp->sd_sc_bh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + free_local_statfs_inodes(sdp); + } + iput(sdp->sd_statfs_inode); +} + +static int init_journal(struct gfs2_sbd *sdp, int undo) +{ + struct inode *master = d_inode(sdp->sd_master_dir); + struct gfs2_holder ji_gh; + struct gfs2_inode *ip; + int error = 0; + + gfs2_holder_mark_uninitialized(&ji_gh); + if (undo) + goto fail_statfs; + + sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); + if (IS_ERR(sdp->sd_jindex)) { + fs_err(sdp, "can't lookup journal index: %d\n", error); + return PTR_ERR(sdp->sd_jindex); + } + + /* Load in the journal index special file */ + + error = gfs2_jindex_hold(sdp, &ji_gh); + if (error) { + fs_err(sdp, "can't read journal index: %d\n", error); + goto fail; + } + + error = -EUSERS; + if (!gfs2_jindex_size(sdp)) { + fs_err(sdp, "no journals!\n"); + goto fail_jindex; + } + + atomic_set(&sdp->sd_log_blks_needed, 0); + if (sdp->sd_args.ar_spectator) { + sdp->sd_jdesc = gfs2_jdesc_find(sdp, 0); + atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); + } else { + if (sdp->sd_lockstruct.ls_jid >= gfs2_jindex_size(sdp)) { + fs_err(sdp, "can't mount journal #%u\n", + sdp->sd_lockstruct.ls_jid); + fs_err(sdp, "there are only %u journals (0 - %u)\n", + gfs2_jindex_size(sdp), + gfs2_jindex_size(sdp) - 1); + goto fail_jindex; + } + sdp->sd_jdesc = gfs2_jdesc_find(sdp, sdp->sd_lockstruct.ls_jid); + + error = gfs2_glock_nq_num(sdp, sdp->sd_lockstruct.ls_jid, + &gfs2_journal_glops, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | GL_NOCACHE | GL_NOPID, + &sdp->sd_journal_gh); + if (error) { + fs_err(sdp, "can't acquire journal glock: %d\n", error); + goto fail_jindex; + } + + ip = GFS2_I(sdp->sd_jdesc->jd_inode); + sdp->sd_jinode_gl = ip->i_gl; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT | + GL_NOCACHE | GL_NOPID, + &sdp->sd_jinode_gh); + if (error) { + fs_err(sdp, "can't acquire journal 
inode glock: %d\n", + error); + goto fail_journal_gh; + } + + error = gfs2_jdesc_check(sdp->sd_jdesc); + if (error) { + fs_err(sdp, "my journal (%u) is bad: %d\n", + sdp->sd_jdesc->jd_jid, error); + goto fail_jinode_gh; + } + atomic_set(&sdp->sd_log_blks_free, sdp->sd_jdesc->jd_blocks); + atomic_set(&sdp->sd_log_thresh1, 2*sdp->sd_jdesc->jd_blocks/5); + atomic_set(&sdp->sd_log_thresh2, 4*sdp->sd_jdesc->jd_blocks/5); + + /* Map the extents for this journal's blocks */ + gfs2_map_journal_extents(sdp, sdp->sd_jdesc); + } + trace_gfs2_log_blocks(sdp, atomic_read(&sdp->sd_log_blks_free)); + + /* Lookup statfs inodes here so journal recovery can use them. */ + error = init_statfs(sdp); + if (error) + goto fail_jinode_gh; + + if (sdp->sd_lockstruct.ls_first) { + unsigned int x; + for (x = 0; x < sdp->sd_journals; x++) { + struct gfs2_jdesc *jd = gfs2_jdesc_find(sdp, x); + + if (sdp->sd_args.ar_spectator) { + error = check_journal_clean(sdp, jd, true); + if (error) + goto fail_statfs; + continue; + } + error = gfs2_recover_journal(jd, true); + if (error) { + fs_err(sdp, "error recovering journal %u: %d\n", + x, error); + goto fail_statfs; + } + } + + gfs2_others_may_mount(sdp); + } else if (!sdp->sd_args.ar_spectator) { + error = gfs2_recover_journal(sdp->sd_jdesc, true); + if (error) { + fs_err(sdp, "error recovering my journal: %d\n", error); + goto fail_statfs; + } + } + + sdp->sd_log_idle = 1; + set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); + gfs2_glock_dq_uninit(&ji_gh); + INIT_WORK(&sdp->sd_freeze_work, gfs2_freeze_func); + return 0; + +fail_statfs: + uninit_statfs(sdp); +fail_jinode_gh: + /* A withdraw may have done dq/uninit so now we need to check it */ + if (!sdp->sd_args.ar_spectator && + gfs2_holder_initialized(&sdp->sd_jinode_gh)) + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); +fail_journal_gh: + if (!sdp->sd_args.ar_spectator && + gfs2_holder_initialized(&sdp->sd_journal_gh)) + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); +fail_jindex: + gfs2_jindex_free(sdp); + if (gfs2_holder_initialized(&ji_gh)) + gfs2_glock_dq_uninit(&ji_gh); +fail: + iput(sdp->sd_jindex); + return error; +} + +static struct lock_class_key gfs2_quota_imutex_key; + +static int init_inodes(struct gfs2_sbd *sdp, int undo) +{ + int error = 0; + struct inode *master = d_inode(sdp->sd_master_dir); + + if (undo) + goto fail_qinode; + + error = init_journal(sdp, undo); + complete_all(&sdp->sd_journal_ready); + if (error) + goto fail; + + /* Read in the resource index inode */ + sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); + if (IS_ERR(sdp->sd_rindex)) { + error = PTR_ERR(sdp->sd_rindex); + fs_err(sdp, "can't get resource index inode: %d\n", error); + goto fail_journal; + } + sdp->sd_rindex_uptodate = 0; + + /* Read in the quota inode */ + sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota"); + if (IS_ERR(sdp->sd_quota_inode)) { + error = PTR_ERR(sdp->sd_quota_inode); + fs_err(sdp, "can't get quota file inode: %d\n", error); + goto fail_rindex; + } + /* + * i_rwsem on quota files is special. Since this inode is hidden system + * file, we are safe to define locking ourselves. 
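+	 * The lockdep_set_class() call below gives this i_rwsem a private
+	 * lockdep key (gfs2_quota_imutex_key), so lock dependency checking
+	 * does not apply the ordering rules of ordinary inodes to it.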
+ */ + lockdep_set_class(&sdp->sd_quota_inode->i_rwsem, + &gfs2_quota_imutex_key); + + error = gfs2_rindex_update(sdp); + if (error) + goto fail_qinode; + + return 0; + +fail_qinode: + iput(sdp->sd_quota_inode); +fail_rindex: + gfs2_clear_rgrpd(sdp); + iput(sdp->sd_rindex); +fail_journal: + init_journal(sdp, UNDO); +fail: + return error; +} + +static int init_per_node(struct gfs2_sbd *sdp, int undo) +{ + struct inode *pn = NULL; + char buf[30]; + int error = 0; + struct gfs2_inode *ip; + struct inode *master = d_inode(sdp->sd_master_dir); + + if (sdp->sd_args.ar_spectator) + return 0; + + if (undo) + goto fail_qc_gh; + + pn = gfs2_lookup_simple(master, "per_node"); + if (IS_ERR(pn)) { + error = PTR_ERR(pn); + fs_err(sdp, "can't find per_node directory: %d\n", error); + return error; + } + + sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); + sdp->sd_qc_inode = gfs2_lookup_simple(pn, buf); + if (IS_ERR(sdp->sd_qc_inode)) { + error = PTR_ERR(sdp->sd_qc_inode); + fs_err(sdp, "can't find local \"qc\" file: %d\n", error); + goto fail_ut_i; + } + + iput(pn); + pn = NULL; + + ip = GFS2_I(sdp->sd_qc_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_NOPID, + &sdp->sd_qc_gh); + if (error) { + fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); + goto fail_qc_i; + } + + return 0; + +fail_qc_gh: + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); +fail_qc_i: + iput(sdp->sd_qc_inode); +fail_ut_i: + iput(pn); + return error; +} + +static const match_table_t nolock_tokens = { + { Opt_jid, "jid=%d", }, + { Opt_err, NULL }, +}; + +static const struct lm_lockops nolock_ops = { + .lm_proto_name = "lock_nolock", + .lm_put_lock = gfs2_glock_free, + .lm_tokens = &nolock_tokens, +}; + +/** + * gfs2_lm_mount - mount a locking protocol + * @sdp: the filesystem + * @silent: if 1, don't complain if the FS isn't a GFS2 fs + * + * Returns: errno + */ + +static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) +{ + const struct lm_lockops *lm; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + struct gfs2_args *args = &sdp->sd_args; + const char *proto = sdp->sd_proto_name; + const char *table = sdp->sd_table_name; + char *o, *options; + int ret; + + if (!strcmp("lock_nolock", proto)) { + lm = &nolock_ops; + sdp->sd_args.ar_localflocks = 1; +#ifdef CONFIG_GFS2_FS_LOCKING_DLM + } else if (!strcmp("lock_dlm", proto)) { + lm = &gfs2_dlm_ops; +#endif + } else { + pr_info("can't find protocol %s\n", proto); + return -ENOENT; + } + + fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table); + + ls->ls_ops = lm; + ls->ls_first = 1; + + for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { + substring_t tmp[MAX_OPT_ARGS]; + int token, option; + + if (!o || !*o) + continue; + + token = match_token(o, *lm->lm_tokens, tmp); + switch (token) { + case Opt_jid: + ret = match_int(&tmp[0], &option); + if (ret || option < 0) + goto hostdata_error; + if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + ls->ls_jid = option; + break; + case Opt_id: + case Opt_nodir: + /* Obsolete, but left for backward compat purposes */ + break; + case Opt_first: + ret = match_int(&tmp[0], &option); + if (ret || (option != 0 && option != 1)) + goto hostdata_error; + ls->ls_first = option; + break; + case Opt_err: + default: +hostdata_error: + fs_info(sdp, "unknown hostdata (%s)\n", o); + return -EINVAL; + } + } + + if (lm->lm_mount == NULL) { + fs_info(sdp, "Now mounting FS (format %u)...\n", sdp->sd_sb.sb_fs_format); + complete_all(&sdp->sd_locking_init); + return 0; + } + ret = lm->lm_mount(sdp, 
table); + if (ret == 0) + fs_info(sdp, "Joined cluster. Now mounting FS (format %u)...\n", + sdp->sd_sb.sb_fs_format); + complete_all(&sdp->sd_locking_init); + return ret; +} + +void gfs2_lm_unmount(struct gfs2_sbd *sdp) +{ + const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops; + if (likely(!gfs2_withdrawn(sdp)) && lm->lm_unmount) + lm->lm_unmount(sdp); +} + +static int wait_on_journal(struct gfs2_sbd *sdp) +{ + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + return 0; + + return wait_on_bit(&sdp->sd_flags, SDF_NOJOURNALID, TASK_INTERRUPTIBLE) + ? -EINTR : 0; +} + +void gfs2_online_uevent(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + sprintf(ro, "RDONLY=%d", sb_rdonly(sb)); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); +} + +static int init_threads(struct gfs2_sbd *sdp) +{ + struct task_struct *p; + int error = 0; + + p = kthread_create(gfs2_logd, sdp, "gfs2_logd/%s", sdp->sd_fsname); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't create logd thread: %d\n", error); + return error; + } + get_task_struct(p); + sdp->sd_logd_process = p; + + p = kthread_create(gfs2_quotad, sdp, "gfs2_quotad/%s", sdp->sd_fsname); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't create quotad thread: %d\n", error); + goto fail; + } + get_task_struct(p); + sdp->sd_quotad_process = p; + + wake_up_process(sdp->sd_logd_process); + wake_up_process(sdp->sd_quotad_process); + return 0; + +fail: + kthread_stop(sdp->sd_logd_process); + put_task_struct(sdp->sd_logd_process); + sdp->sd_logd_process = NULL; + return error; +} + +void gfs2_destroy_threads(struct gfs2_sbd *sdp) +{ + if (sdp->sd_logd_process) { + kthread_stop(sdp->sd_logd_process); + put_task_struct(sdp->sd_logd_process); + sdp->sd_logd_process = NULL; + } + if (sdp->sd_quotad_process) { + kthread_stop(sdp->sd_quotad_process); + put_task_struct(sdp->sd_quotad_process); + sdp->sd_quotad_process = NULL; + } +} + +/** + * gfs2_fill_super - Read in superblock + * @sb: The VFS superblock + * @fc: Mount options and flags + * + * Returns: -errno + */ +static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) +{ + struct gfs2_args *args = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; + struct gfs2_sbd *sdp; + struct gfs2_holder mount_gh; + int error; + + sdp = init_sbd(sb); + if (!sdp) { + pr_warn("can't alloc struct gfs2_sbd\n"); + return -ENOMEM; + } + sdp->sd_args = *args; + + if (sdp->sd_args.ar_spectator) { + sb->s_flags |= SB_RDONLY; + set_bit(SDF_RORECOVERY, &sdp->sd_flags); + } + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= SB_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + + sb->s_flags |= SB_NOSEC; + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_d_op = &gfs2_dops; + sb->s_export_op = &gfs2_export_ops; + sb->s_qcop = &gfs2_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; + + /* Set up the buffer cache and fill in some fake block size values + to allow us to read-in the on-disk superblock. 
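These are recomputed from the on-disk values by gfs2_read_sb() once the real superblock has been read in.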
*/ + sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, GFS2_BASIC_BLOCK); + sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); + + sdp->sd_tune.gt_logd_secs = sdp->sd_args.ar_commit; + sdp->sd_tune.gt_quota_quantum = sdp->sd_args.ar_quota_quantum; + if (sdp->sd_args.ar_statfs_quantum) { + sdp->sd_tune.gt_statfs_slow = 0; + sdp->sd_tune.gt_statfs_quantum = sdp->sd_args.ar_statfs_quantum; + } else { + sdp->sd_tune.gt_statfs_slow = 1; + sdp->sd_tune.gt_statfs_quantum = 30; + } + + error = init_names(sdp, silent); + if (error) + goto fail_free; + + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); + + sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s", + WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname); + error = -ENOMEM; + if (!sdp->sd_delete_wq) + goto fail_free; + + error = gfs2_sys_fs_add(sdp); + if (error) + goto fail_delete_wq; + + gfs2_create_debugfs_file(sdp); + + error = gfs2_lm_mount(sdp, silent); + if (error) + goto fail_debug; + + error = init_locking(sdp, &mount_gh, DO); + if (error) + goto fail_lm; + + error = init_sb(sdp, silent); + if (error) + goto fail_locking; + + /* Turn rgrplvb on by default if fs format is recent enough */ + if (!sdp->sd_args.ar_got_rgrplvb && sdp->sd_sb.sb_fs_format > 1801) + sdp->sd_args.ar_rgrplvb = 1; + + error = wait_on_journal(sdp); + if (error) + goto fail_sb; + + /* + * If user space has failed to join the cluster or some similar + * failure has occurred, then the journal id will contain a + * negative (error) number. This will then be returned to the + * caller (of the mount syscall). We do this even for spectator + * mounts (which just write a jid of 0 to indicate "ok" even though + * the jid is unused in the spectator case) + */ + if (sdp->sd_lockstruct.ls_jid < 0) { + error = sdp->sd_lockstruct.ls_jid; + sdp->sd_lockstruct.ls_jid = 0; + goto fail_sb; + } + + if (sdp->sd_args.ar_spectator) + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.s", + sdp->sd_table_name); + else + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.%u", + sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); + + error = init_inodes(sdp, DO); + if (error) + goto fail_sb; + + error = init_per_node(sdp, DO); + if (error) + goto fail_inodes; + + error = gfs2_statfs_init(sdp); + if (error) { + fs_err(sdp, "can't initialize statfs subsystem: %d\n", error); + goto fail_per_node; + } + + if (!sb_rdonly(sb)) { + error = init_threads(sdp); + if (error) + goto fail_per_node; + } + + error = gfs2_freeze_lock_shared(sdp); + if (error) + goto fail_per_node; + + if (!sb_rdonly(sb)) + error = gfs2_make_fs_rw(sdp); + + if (error) { + gfs2_freeze_unlock(&sdp->sd_freeze_gh); + gfs2_destroy_threads(sdp); + fs_err(sdp, "can't make FS RW: %d\n", error); + goto fail_per_node; + } + gfs2_glock_dq_uninit(&mount_gh); + gfs2_online_uevent(sdp); + return 0; + +fail_per_node: + init_per_node(sdp, UNDO); +fail_inodes: + init_inodes(sdp, UNDO); +fail_sb: + if (sdp->sd_root_dir) + dput(sdp->sd_root_dir); + if (sdp->sd_master_dir) + dput(sdp->sd_master_dir); + if (sb->s_root) + dput(sb->s_root); + sb->s_root = NULL; +fail_locking: + init_locking(sdp, &mount_gh, UNDO); +fail_lm: + complete_all(&sdp->sd_journal_ready); + gfs2_gl_hash_clear(sdp); + gfs2_lm_unmount(sdp); +fail_debug: + gfs2_delete_debugfs_file(sdp); + gfs2_sys_fs_del(sdp); +fail_delete_wq: + destroy_workqueue(sdp->sd_delete_wq); +fail_free: + free_sbd(sdp); + sb->s_fs_info = NULL; + return 
error; +} + +/** + * gfs2_get_tree - Get the GFS2 superblock and root directory + * @fc: The filesystem context + * + * Returns: 0 or -errno on error + */ +static int gfs2_get_tree(struct fs_context *fc) +{ + struct gfs2_args *args = fc->fs_private; + struct gfs2_sbd *sdp; + int error; + + error = get_tree_bdev(fc, gfs2_fill_super); + if (error) + return error; + + sdp = fc->root->d_sb->s_fs_info; + dput(fc->root); + if (args->ar_meta) + fc->root = dget(sdp->sd_master_dir); + else + fc->root = dget(sdp->sd_root_dir); + return 0; +} + +static void gfs2_fc_free(struct fs_context *fc) +{ + struct gfs2_args *args = fc->fs_private; + + kfree(args); +} + +enum gfs2_param { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_upgrade, + Opt_acl, + Opt_quota, + Opt_quota_flag, + Opt_suiddir, + Opt_data, + Opt_meta, + Opt_discard, + Opt_commit, + Opt_errors, + Opt_statfs_quantum, + Opt_statfs_percent, + Opt_quota_quantum, + Opt_barrier, + Opt_rgrplvb, + Opt_loccookie, +}; + +static const struct constant_table gfs2_param_quota[] = { + {"off", GFS2_QUOTA_OFF}, + {"account", GFS2_QUOTA_ACCOUNT}, + {"on", GFS2_QUOTA_ON}, + {"quiet", GFS2_QUOTA_QUIET}, + {} +}; + +enum opt_data { + Opt_data_writeback = GFS2_DATA_WRITEBACK, + Opt_data_ordered = GFS2_DATA_ORDERED, +}; + +static const struct constant_table gfs2_param_data[] = { + {"writeback", Opt_data_writeback }, + {"ordered", Opt_data_ordered }, + {} +}; + +enum opt_errors { + Opt_errors_withdraw = GFS2_ERRORS_WITHDRAW, + Opt_errors_panic = GFS2_ERRORS_PANIC, +}; + +static const struct constant_table gfs2_param_errors[] = { + {"withdraw", Opt_errors_withdraw }, + {"panic", Opt_errors_panic }, + {} +}; + +static const struct fs_parameter_spec gfs2_fs_parameters[] = { + fsparam_string ("lockproto", Opt_lockproto), + fsparam_string ("locktable", Opt_locktable), + fsparam_string ("hostdata", Opt_hostdata), + fsparam_flag ("spectator", Opt_spectator), + fsparam_flag ("norecovery", Opt_spectator), + fsparam_flag ("ignore_local_fs", Opt_ignore_local_fs), + fsparam_flag ("localflocks", Opt_localflocks), + fsparam_flag ("localcaching", Opt_localcaching), + fsparam_flag_no("debug", Opt_debug), + fsparam_flag ("upgrade", Opt_upgrade), + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("suiddir", Opt_suiddir), + fsparam_enum ("data", Opt_data, gfs2_param_data), + fsparam_flag ("meta", Opt_meta), + fsparam_flag_no("discard", Opt_discard), + fsparam_s32 ("commit", Opt_commit), + fsparam_enum ("errors", Opt_errors, gfs2_param_errors), + fsparam_s32 ("statfs_quantum", Opt_statfs_quantum), + fsparam_s32 ("statfs_percent", Opt_statfs_percent), + fsparam_s32 ("quota_quantum", Opt_quota_quantum), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag_no("rgrplvb", Opt_rgrplvb), + fsparam_flag_no("loccookie", Opt_loccookie), + /* quota can be a flag or an enum so it gets special treatment */ + fsparam_flag_no("quota", Opt_quota_flag), + fsparam_enum("quota", Opt_quota, gfs2_param_quota), + {} +}; + +/* Parse a single mount parameter */ +static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct gfs2_args *args = fc->fs_private; + struct fs_parse_result result; + int o; + + o = fs_parse(fc, gfs2_fs_parameters, param, &result); + if (o < 0) + return o; + + switch (o) { + case Opt_lockproto: + strscpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + strscpy(args->ar_locktable, param->string, 
GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + strscpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + /* Retained for backwards compat only */ + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + /* Retained for backwards compat only */ + break; + case Opt_debug: + if (result.boolean && args->ar_errors == GFS2_ERRORS_PANIC) + return invalfc(fc, "-o debug and -o errors=panic are mutually exclusive"); + args->ar_debug = result.boolean; + break; + case Opt_upgrade: + /* Retained for backwards compat only */ + break; + case Opt_acl: + args->ar_posix_acl = result.boolean; + break; + case Opt_quota_flag: + args->ar_quota = result.negated ? GFS2_QUOTA_OFF : GFS2_QUOTA_ON; + break; + case Opt_quota: + args->ar_quota = result.int_32; + break; + case Opt_suiddir: + args->ar_suiddir = result.boolean; + break; + case Opt_data: + /* The uint_32 result maps directly to GFS2_DATA_* */ + args->ar_data = result.uint_32; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = result.boolean; + break; + case Opt_commit: + if (result.int_32 <= 0) + return invalfc(fc, "commit mount option requires a positive numeric argument"); + args->ar_commit = result.int_32; + break; + case Opt_statfs_quantum: + if (result.int_32 < 0) + return invalfc(fc, "statfs_quantum mount option requires a non-negative numeric argument"); + args->ar_statfs_quantum = result.int_32; + break; + case Opt_quota_quantum: + if (result.int_32 <= 0) + return invalfc(fc, "quota_quantum mount option requires a positive numeric argument"); + args->ar_quota_quantum = result.int_32; + break; + case Opt_statfs_percent: + if (result.int_32 < 0 || result.int_32 > 100) + return invalfc(fc, "statfs_percent mount option requires a numeric argument between 0 and 100"); + args->ar_statfs_percent = result.int_32; + break; + case Opt_errors: + if (args->ar_debug && result.uint_32 == GFS2_ERRORS_PANIC) + return invalfc(fc, "-o debug and -o errors=panic are mutually exclusive"); + args->ar_errors = result.uint_32; + break; + case Opt_barrier: + args->ar_nobarrier = result.boolean; + break; + case Opt_rgrplvb: + args->ar_rgrplvb = result.boolean; + args->ar_got_rgrplvb = 1; + break; + case Opt_loccookie: + args->ar_loccookie = result.boolean; + break; + default: + return invalfc(fc, "invalid mount option: %s", param->key); + } + return 0; +} + +static int gfs2_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args *oldargs = &sdp->sd_args; + struct gfs2_args *newargs = fc->fs_private; + struct gfs2_tune *gt = &sdp->sd_tune; + int error = 0; + + sync_filesystem(sb); + + spin_lock(>->gt_spin); + oldargs->ar_commit = gt->gt_logd_secs; + oldargs->ar_quota_quantum = gt->gt_quota_quantum; + if (gt->gt_statfs_slow) + oldargs->ar_statfs_quantum = 0; + else + oldargs->ar_statfs_quantum = gt->gt_statfs_quantum; + spin_unlock(>->gt_spin); + + if (strcmp(newargs->ar_lockproto, oldargs->ar_lockproto)) { + errorfc(fc, "reconfiguration of locking protocol not allowed"); + return -EINVAL; + } + if (strcmp(newargs->ar_locktable, oldargs->ar_locktable)) { + errorfc(fc, "reconfiguration of lock table not allowed"); + return -EINVAL; + } + if (strcmp(newargs->ar_hostdata, oldargs->ar_hostdata)) { + errorfc(fc, "reconfiguration of host data not allowed"); + return -EINVAL; + } + if (newargs->ar_spectator != 
oldargs->ar_spectator) { + errorfc(fc, "reconfiguration of spectator mode not allowed"); + return -EINVAL; + } + if (newargs->ar_localflocks != oldargs->ar_localflocks) { + errorfc(fc, "reconfiguration of localflocks not allowed"); + return -EINVAL; + } + if (newargs->ar_meta != oldargs->ar_meta) { + errorfc(fc, "switching between gfs2 and gfs2meta not allowed"); + return -EINVAL; + } + if (oldargs->ar_spectator) + fc->sb_flags |= SB_RDONLY; + + if ((sb->s_flags ^ fc->sb_flags) & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { + gfs2_make_fs_ro(sdp); + } else { + error = gfs2_make_fs_rw(sdp); + if (error) + errorfc(fc, "unable to remount read-write"); + } + } + sdp->sd_args = *newargs; + + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + else + clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); + spin_lock(>->gt_spin); + gt->gt_logd_secs = newargs->ar_commit; + gt->gt_quota_quantum = newargs->ar_quota_quantum; + if (newargs->ar_statfs_quantum) { + gt->gt_statfs_slow = 0; + gt->gt_statfs_quantum = newargs->ar_statfs_quantum; + } + else { + gt->gt_statfs_slow = 1; + gt->gt_statfs_quantum = 30; + } + spin_unlock(>->gt_spin); + + gfs2_online_uevent(sdp); + return error; +} + +static const struct fs_context_operations gfs2_context_ops = { + .free = gfs2_fc_free, + .parse_param = gfs2_parse_param, + .get_tree = gfs2_get_tree, + .reconfigure = gfs2_reconfigure, +}; + +/* Set up the filesystem mount context */ +static int gfs2_init_fs_context(struct fs_context *fc) +{ + struct gfs2_args *args; + + args = kmalloc(sizeof(*args), GFP_KERNEL); + if (args == NULL) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct gfs2_sbd *sdp = fc->root->d_sb->s_fs_info; + + *args = sdp->sd_args; + } else { + memset(args, 0, sizeof(*args)); + args->ar_quota = GFS2_QUOTA_DEFAULT; + args->ar_data = GFS2_DATA_DEFAULT; + args->ar_commit = 30; + args->ar_statfs_quantum = 30; + args->ar_quota_quantum = 60; + args->ar_errors = GFS2_ERRORS_DEFAULT; + } + fc->fs_private = args; + fc->ops = &gfs2_context_ops; + return 0; +} + +static int set_meta_super(struct super_block *s, struct fs_context *fc) +{ + return -EINVAL; +} + +static int test_meta_super(struct super_block *s, struct fs_context *fc) +{ + return (fc->sget_key == s->s_bdev); +} + +static int gfs2_meta_get_tree(struct fs_context *fc) +{ + struct super_block *s; + struct gfs2_sbd *sdp; + struct path path; + int error; + + if (!fc->source || !*fc->source) + return -EINVAL; + + error = kern_path(fc->source, LOOKUP_FOLLOW, &path); + if (error) { + pr_warn("path_lookup on %s returned error %d\n", + fc->source, error); + return error; + } + fc->fs_type = &gfs2_fs_type; + fc->sget_key = path.dentry->d_sb->s_bdev; + s = sget_fc(fc, test_meta_super, set_meta_super); + path_put(&path); + if (IS_ERR(s)) { + pr_warn("gfs2 mount does not exist\n"); + return PTR_ERR(s); + } + if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { + deactivate_locked_super(s); + return -EBUSY; + } + sdp = s->s_fs_info; + fc->root = dget(sdp->sd_master_dir); + return 0; +} + +static const struct fs_context_operations gfs2_meta_context_ops = { + .free = gfs2_fc_free, + .get_tree = gfs2_meta_get_tree, +}; + +static int gfs2_meta_init_fs_context(struct fs_context *fc) +{ + int ret = gfs2_init_fs_context(fc); + + if (ret) + return ret; + + fc->ops = &gfs2_meta_context_ops; + return 0; +} + +/** + * gfs2_evict_inodes - evict inodes cooperatively + * @sb: the 
superblock + * + * When evicting an inode with a zero link count, we are trying to upgrade the + * inode's iopen glock from SH to EX mode in order to determine if we can + * delete the inode. The other nodes are supposed to evict the inode from + * their caches if they can, and to poke the inode's inode glock if they cannot + * do so. Either behavior allows gfs2_upgrade_iopen_glock() to proceed + * quickly, but if the other nodes are not cooperating, the lock upgrading + * attempt will time out. Since inodes are evicted sequentially, this can add + * up quickly. + * + * Function evict_inodes() tries to keep the s_inode_list_lock list locked over + * a long time, which prevents other inodes from being evicted concurrently. + * This precludes the cooperative behavior we are looking for. This special + * version of evict_inodes() avoids that. + * + * Modeled after drop_pagecache_sb(). + */ +static void gfs2_evict_inodes(struct super_block *sb) +{ + struct inode *inode, *toput_inode = NULL; + struct gfs2_sbd *sdp = sb->s_fs_info; + + set_bit(SDF_EVICTING, &sdp->sd_flags); + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) && + !need_resched()) { + spin_unlock(&inode->i_lock); + continue; + } + atomic_inc(&inode->i_count); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + + iput(toput_inode); + toput_inode = inode; + + cond_resched(); + spin_lock(&sb->s_inode_list_lock); + } + spin_unlock(&sb->s_inode_list_lock); + iput(toput_inode); +} + +static void gfs2_kill_sb(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + if (sdp == NULL) { + kill_block_super(sb); + return; + } + + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SYNC | GFS2_LFC_KILL_SB); + dput(sdp->sd_root_dir); + dput(sdp->sd_master_dir); + sdp->sd_root_dir = NULL; + sdp->sd_master_dir = NULL; + shrink_dcache_sb(sb); + + gfs2_evict_inodes(sb); + + /* + * Flush and then drain the delete workqueue here (via + * destroy_workqueue()) to ensure that any delete work that + * may be running will also see the SDF_KILL flag. + */ + set_bit(SDF_KILL, &sdp->sd_flags); + gfs2_flush_delete_work(sdp); + destroy_workqueue(sdp->sd_delete_wq); + + kill_block_super(sb); +} + +struct file_system_type gfs2_fs_type = { + .name = "gfs2", + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = gfs2_init_fs_context, + .parameters = gfs2_fs_parameters, + .kill_sb = gfs2_kill_sb, + .owner = THIS_MODULE, +}; +MODULE_ALIAS_FS("gfs2"); + +struct file_system_type gfs2meta_fs_type = { + .name = "gfs2meta", + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = gfs2_meta_init_fs_context, + .owner = THIS_MODULE, +}; +MODULE_ALIAS_FS("gfs2meta"); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c new file mode 100644 index 0000000000..f689847bab --- /dev/null +++ b/fs/gfs2/quota.c @@ -0,0 +1,1798 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. + */ + +/* + * Quota change tags are associated with each transaction that allocates or + * deallocates space. Those changes are accumulated locally to each node (in a + * per-node file) and then are periodically synced to the quota file. 
This
+ * avoids the bottleneck of constantly touching the quota file, but introduces
+ * fuzziness in the current usage value of IDs that are being used on different
+ * nodes in the cluster simultaneously. So, it is possible for a user on
+ * multiple nodes to overrun their quota, but that overrun is controllable.
+ * Since quota tags are part of transactions, there is no need for a quota check
+ * program to be run on node crashes or anything like that.
+ *
+ * There are a couple of knobs that let the administrator manage the quota
+ * fuzziness. "quota_quantum" sets the maximum time a quota change can be
+ * sitting on one node before being synced to the quota file. (The default is
+ * 60 seconds.) Another knob, "quota_scale", controls how quickly the frequency
+ * of quota file syncs increases as the user moves closer to their limit. The
+ * more frequent the syncs, the more accurate the quota enforcement, but that
+ * means that there is more contention between the nodes for the quota file.
+ * The default value is one. This sets the maximum theoretical quota overrun
+ * (with an infinite number of nodes and infinite bandwidth) to twice the
+ * user's limit. (In practice, the maximum overrun you see should be much
+ * less.) A "quota_scale" number greater than one makes quota syncs more
+ * frequent and reduces the maximum overrun. Numbers less than one (but
+ * greater than zero) make quota syncs less frequent.
+ *
+ * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of
+ * the quota file, so it is not constantly being read.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/sort.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/quota.h>
+#include <linux/dqblk_xfs.h>
+#include <linux/lockref.h>
+#include <linux/list_lru.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
+#include <linux/bit_spinlock.h>
+#include <linux/jhash.h>
+#include <linux/vmalloc.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "bmap.h"
+#include "glock.h"
+#include "glops.h"
+#include "log.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "inode.h"
+#include "util.h"
+
+#define GFS2_QD_HASH_SHIFT      12
+#define GFS2_QD_HASH_SIZE       BIT(GFS2_QD_HASH_SHIFT)
+#define GFS2_QD_HASH_MASK       (GFS2_QD_HASH_SIZE - 1)
+
+#define QC_CHANGE 0
+#define QC_SYNC 1
+
+/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
+/*                     -> sd_bitmap_lock                              */
+static DEFINE_SPINLOCK(qd_lock);
+struct list_lru gfs2_qd_lru;
+
+static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
+
+static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
+				 const struct kqid qid)
+{
+	unsigned int h;
+
+	h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
+	h = jhash(&qid, sizeof(struct kqid), h);
+
+	return h & GFS2_QD_HASH_MASK;
+}
+
+static inline void spin_lock_bucket(unsigned int hash)
+{
+	hlist_bl_lock(&qd_hash_table[hash]);
+}
+
+static inline void spin_unlock_bucket(unsigned int hash)
+{
+	hlist_bl_unlock(&qd_hash_table[hash]);
+}
+
+static void gfs2_qd_dealloc(struct rcu_head *rcu)
+{
+	struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
+	struct gfs2_sbd *sdp = qd->qd_sbd;
+
+	kmem_cache_free(gfs2_quotad_cachep, qd);
+	if
(atomic_dec_and_test(&sdp->sd_quota_count)) + wake_up(&sdp->sd_kill_wait); +} + +static void gfs2_qd_dispose(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + + spin_lock(&qd_lock); + list_del(&qd->qd_list); + spin_unlock(&qd_lock); + + spin_lock_bucket(qd->qd_hash); + hlist_bl_del_rcu(&qd->qd_hlist); + spin_unlock_bucket(qd->qd_hash); + + if (!gfs2_withdrawn(sdp)) { + gfs2_assert_warn(sdp, !qd->qd_change); + gfs2_assert_warn(sdp, !qd->qd_slot_ref); + gfs2_assert_warn(sdp, !qd->qd_bh_count); + } + + gfs2_glock_put(qd->qd_gl); + call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); +} + +static void gfs2_qd_list_dispose(struct list_head *list) +{ + struct gfs2_quota_data *qd; + + while (!list_empty(list)) { + qd = list_first_entry(list, struct gfs2_quota_data, qd_lru); + list_del(&qd->qd_lru); + + gfs2_qd_dispose(qd); + } +} + + +static enum lru_status gfs2_qd_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *dispose = arg; + struct gfs2_quota_data *qd = + list_entry(item, struct gfs2_quota_data, qd_lru); + enum lru_status status; + + if (!spin_trylock(&qd->qd_lockref.lock)) + return LRU_SKIP; + + status = LRU_SKIP; + if (qd->qd_lockref.count == 0) { + lockref_mark_dead(&qd->qd_lockref); + list_lru_isolate_move(lru, &qd->qd_lru, dispose); + status = LRU_REMOVED; + } + + spin_unlock(&qd->qd_lockref.lock); + return status; +} + +static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + LIST_HEAD(dispose); + unsigned long freed; + + if (!(sc->gfp_mask & __GFP_FS)) + return SHRINK_STOP; + + freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, + gfs2_qd_isolate, &dispose); + + gfs2_qd_list_dispose(&dispose); + + return freed; +} + +static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); +} + +struct shrinker gfs2_qd_shrinker = { + .count_objects = gfs2_qd_shrink_count, + .scan_objects = gfs2_qd_shrink_scan, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + + +static u64 qd2index(struct gfs2_quota_data *qd) +{ + struct kqid qid = qd->qd_id; + return (2 * (u64)from_kqid(&init_user_ns, qid)) + + ((qid.type == USRQUOTA) ? 
0 : 1); +} + +static u64 qd2offset(struct gfs2_quota_data *qd) +{ + return qd2index(qd) * sizeof(struct gfs2_quota); +} + +static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) +{ + struct gfs2_quota_data *qd; + int error; + + qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); + if (!qd) + return NULL; + + qd->qd_sbd = sdp; + qd->qd_lockref.count = 0; + spin_lock_init(&qd->qd_lockref.lock); + qd->qd_id = qid; + qd->qd_slot = -1; + INIT_LIST_HEAD(&qd->qd_lru); + qd->qd_hash = hash; + + error = gfs2_glock_get(sdp, qd2index(qd), + &gfs2_quota_glops, CREATE, &qd->qd_gl); + if (error) + goto fail; + + return qd; + +fail: + kmem_cache_free(gfs2_quotad_cachep, qd); + return NULL; +} + +static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, + const struct gfs2_sbd *sdp, + struct kqid qid) +{ + struct gfs2_quota_data *qd; + struct hlist_bl_node *h; + + hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) { + if (!qid_eq(qd->qd_id, qid)) + continue; + if (qd->qd_sbd != sdp) + continue; + if (lockref_get_not_dead(&qd->qd_lockref)) { + list_lru_del(&gfs2_qd_lru, &qd->qd_lru); + return qd; + } + } + + return NULL; +} + + +static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, + struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd, *new_qd; + unsigned int hash = gfs2_qd_hash(sdp, qid); + + rcu_read_lock(); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + rcu_read_unlock(); + + if (qd) + return 0; + + new_qd = qd_alloc(hash, sdp, qid); + if (!new_qd) + return -ENOMEM; + + spin_lock(&qd_lock); + spin_lock_bucket(hash); + *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); + if (qd == NULL) { + new_qd->qd_lockref.count++; + *qdp = new_qd; + list_add(&new_qd->qd_list, &sdp->sd_quota_list); + hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); + atomic_inc(&sdp->sd_quota_count); + } + spin_unlock_bucket(hash); + spin_unlock(&qd_lock); + + if (qd) { + gfs2_glock_put(new_qd->qd_gl); + kmem_cache_free(gfs2_quotad_cachep, new_qd); + } + + return 0; +} + + +static void qd_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + gfs2_assert(sdp, !__lockref_is_dead(&qd->qd_lockref)); + lockref_get(&qd->qd_lockref); +} + +static void qd_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp; + + if (lockref_put_or_lock(&qd->qd_lockref)) + return; + + BUG_ON(__lockref_is_dead(&qd->qd_lockref)); + sdp = qd->qd_sbd; + if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { + lockref_mark_dead(&qd->qd_lockref); + spin_unlock(&qd->qd_lockref.lock); + + gfs2_qd_dispose(qd); + return; + } + + qd->qd_lockref.count = 0; + list_lru_add(&gfs2_qd_lru, &qd->qd_lru); + spin_unlock(&qd->qd_lockref.lock); +} + +static int slot_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + unsigned int bit; + int error = 0; + + spin_lock(&sdp->sd_bitmap_lock); + if (qd->qd_slot_ref == 0) { + bit = find_first_zero_bit(sdp->sd_quota_bitmap, + sdp->sd_quota_slots); + if (bit >= sdp->sd_quota_slots) { + error = -ENOSPC; + goto out; + } + set_bit(bit, sdp->sd_quota_bitmap); + qd->qd_slot = bit; + } + qd->qd_slot_ref++; +out: + spin_unlock(&sdp->sd_bitmap_lock); + return error; +} + +static void slot_hold(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + + spin_lock(&sdp->sd_bitmap_lock); + gfs2_assert(sdp, qd->qd_slot_ref); + qd->qd_slot_ref++; + spin_unlock(&sdp->sd_bitmap_lock); +} + +static void slot_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + + 
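+	/* Each pending quota change in the per-node file occupies a slot;
+	 * drop one slot reference and release the bitmap slot when the
+	 * last reference goes away. */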
spin_lock(&sdp->sd_bitmap_lock); + gfs2_assert(sdp, qd->qd_slot_ref); + if (!--qd->qd_slot_ref) { + BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); + qd->qd_slot = -1; + } + spin_unlock(&sdp->sd_bitmap_lock); +} + +static int bh_get(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + struct inode *inode = sdp->sd_qc_inode; + struct gfs2_inode *ip = GFS2_I(inode); + unsigned int block, offset; + struct buffer_head *bh; + struct iomap iomap = { }; + int error; + + mutex_lock(&sdp->sd_quota_mutex); + + if (qd->qd_bh_count++) { + mutex_unlock(&sdp->sd_quota_mutex); + return 0; + } + + block = qd->qd_slot / sdp->sd_qc_per_block; + offset = qd->qd_slot % sdp->sd_qc_per_block; + + error = gfs2_iomap_get(inode, + (loff_t)block << inode->i_blkbits, + i_blocksize(inode), &iomap); + if (error) + goto fail; + error = -ENOENT; + if (iomap.type != IOMAP_MAPPED) + goto fail; + + error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits, + DIO_WAIT, 0, &bh); + if (error) + goto fail; + error = -EIO; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) + goto fail_brelse; + + qd->qd_bh = bh; + qd->qd_bh_qc = (struct gfs2_quota_change *) + (bh->b_data + sizeof(struct gfs2_meta_header) + + offset * sizeof(struct gfs2_quota_change)); + + mutex_unlock(&sdp->sd_quota_mutex); + + return 0; + +fail_brelse: + brelse(bh); +fail: + qd->qd_bh_count--; + mutex_unlock(&sdp->sd_quota_mutex); + return error; +} + +static void bh_put(struct gfs2_quota_data *qd) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_assert(sdp, qd->qd_bh_count); + if (!--qd->qd_bh_count) { + brelse(qd->qd_bh); + qd->qd_bh = NULL; + qd->qd_bh_qc = NULL; + } + mutex_unlock(&sdp->sd_quota_mutex); +} + +static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, + u64 *sync_gen) +{ + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags) || + (sync_gen && (qd->qd_sync_gen >= *sync_gen))) + return 0; + + /* + * If qd_change is 0 it means a pending quota change was negated. + * We should not sync it, but we still have a qd reference and slot + * reference taken by gfs2_quota_change -> do_qc that need to be put. 
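+	 * Putting them here, once the negated change has been noticed,
+	 * keeps the qd and slot reference counts balanced.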
+ */ + if (!qd->qd_change && test_and_clear_bit(QDF_CHANGE, &qd->qd_flags)) { + slot_put(qd); + qd_put(qd); + return 0; + } + + if (!lockref_get_not_dead(&qd->qd_lockref)) + return 0; + + list_move_tail(&qd->qd_list, &sdp->sd_quota_list); + set_bit(QDF_LOCKED, &qd->qd_flags); + qd->qd_change_sync = qd->qd_change; + slot_hold(qd); + return 1; +} + +static int qd_bh_get_or_undo(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) +{ + int error; + + error = bh_get(qd); + if (!error) + return 0; + + clear_bit(QDF_LOCKED, &qd->qd_flags); + slot_put(qd); + qd_put(qd); + return error; +} + +static int qd_fish(struct gfs2_sbd *sdp, struct gfs2_quota_data **qdp) +{ + struct gfs2_quota_data *qd = NULL, *iter; + int error; + + *qdp = NULL; + + if (sb_rdonly(sdp->sd_vfs)) + return 0; + + spin_lock(&qd_lock); + + list_for_each_entry(iter, &sdp->sd_quota_list, qd_list) { + if (qd_check_sync(sdp, iter, &sdp->sd_quota_sync_gen)) { + qd = iter; + break; + } + } + + spin_unlock(&qd_lock); + + if (qd) { + error = qd_bh_get_or_undo(sdp, qd); + if (error) + return error; + *qdp = qd; + } + + return 0; +} + +static void qdsb_put(struct gfs2_quota_data *qd) +{ + bh_put(qd); + slot_put(qd); + qd_put(qd); +} + +static void qd_unlock(struct gfs2_quota_data *qd) +{ + gfs2_assert_warn(qd->qd_sbd, test_bit(QDF_LOCKED, &qd->qd_flags)); + clear_bit(QDF_LOCKED, &qd->qd_flags); + qdsb_put(qd); +} + +static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid, + struct gfs2_quota_data **qdp) +{ + int error; + + error = qd_get(sdp, qid, qdp); + if (error) + return error; + + error = slot_get(*qdp); + if (error) + goto fail; + + error = bh_get(*qdp); + if (error) + goto fail_slot; + + return 0; + +fail_slot: + slot_put(*qdp); +fail: + qd_put(*qdp); + return error; +} + +/** + * gfs2_qa_get - make sure we have a quota allocations data structure, + * if necessary + * @ip: the inode for this reservation + */ +int gfs2_qa_get(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct inode *inode = &ip->i_inode; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + spin_lock(&inode->i_lock); + if (ip->i_qadata == NULL) { + struct gfs2_qadata *tmp; + + spin_unlock(&inode->i_lock); + tmp = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS); + if (!tmp) + return -ENOMEM; + + spin_lock(&inode->i_lock); + if (ip->i_qadata == NULL) + ip->i_qadata = tmp; + else + kmem_cache_free(gfs2_qadata_cachep, tmp); + } + ip->i_qadata->qa_ref++; + spin_unlock(&inode->i_lock); + return 0; +} + +void gfs2_qa_put(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + + spin_lock(&inode->i_lock); + if (ip->i_qadata && --ip->i_qadata->qa_ref == 0) { + kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata); + ip->i_qadata = NULL; + } + spin_unlock(&inode->i_lock); +} + +int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_quota_data **qd; + int error; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + error = gfs2_qa_get(ip); + if (error) + return error; + + qd = ip->i_qadata->qa_qd; + + if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) || + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) { + error = -EIO; + gfs2_qa_put(ip); + goto out; + } + + error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd); + if (error) + goto out_unhold; + ip->i_qadata->qa_qd_num++; + qd++; + + error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd); + if (error) + goto out_unhold; + ip->i_qadata->qa_qd_num++; + qd++; + + if 
(!uid_eq(uid, NO_UID_QUOTA_CHANGE) && + !uid_eq(uid, ip->i_inode.i_uid)) { + error = qdsb_get(sdp, make_kqid_uid(uid), qd); + if (error) + goto out_unhold; + ip->i_qadata->qa_qd_num++; + qd++; + } + + if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) && + !gid_eq(gid, ip->i_inode.i_gid)) { + error = qdsb_get(sdp, make_kqid_gid(gid), qd); + if (error) + goto out_unhold; + ip->i_qadata->qa_qd_num++; + qd++; + } + +out_unhold: + if (error) + gfs2_quota_unhold(ip); +out: + return error; +} + +void gfs2_quota_unhold(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u32 x; + + if (ip->i_qadata == NULL) + return; + + gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); + + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qdsb_put(ip->i_qadata->qa_qd[x]); + ip->i_qadata->qa_qd[x] = NULL; + } + ip->i_qadata->qa_qd_num = 0; + gfs2_qa_put(ip); +} + +static int sort_qd(const void *a, const void *b) +{ + const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a; + const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; + + if (qid_lt(qd_a->qd_id, qd_b->qd_id)) + return -1; + if (qid_lt(qd_b->qd_id, qd_a->qd_id)) + return 1; + return 0; +} + +static void do_qc(struct gfs2_quota_data *qd, s64 change, int qc_type) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + struct gfs2_quota_change *qc = qd->qd_bh_qc; + s64 x; + + mutex_lock(&sdp->sd_quota_mutex); + gfs2_trans_add_meta(ip->i_gl, qd->qd_bh); + + if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { + qc->qc_change = 0; + qc->qc_flags = 0; + if (qd->qd_id.type == USRQUOTA) + qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); + qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id)); + } + + x = be64_to_cpu(qc->qc_change) + change; + qc->qc_change = cpu_to_be64(x); + + spin_lock(&qd_lock); + qd->qd_change = x; + spin_unlock(&qd_lock); + + if (qc_type == QC_CHANGE) { + if (!test_and_set_bit(QDF_CHANGE, &qd->qd_flags)) { + qd_hold(qd); + slot_hold(qd); + } + } else { + gfs2_assert_warn(sdp, test_bit(QDF_CHANGE, &qd->qd_flags)); + clear_bit(QDF_CHANGE, &qd->qd_flags); + qc->qc_flags = 0; + qc->qc_id = 0; + slot_put(qd); + qd_put(qd); + } + + if (change < 0) /* Reset quiet flag if we freed some blocks */ + clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); + mutex_unlock(&sdp->sd_quota_mutex); +} + +static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, + unsigned off, void *buf, unsigned bytes) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct inode *inode = &ip->i_inode; + struct address_space *mapping = inode->i_mapping; + struct page *page; + struct buffer_head *bh; + u64 blk; + unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; + unsigned to_write = bytes, pg_off = off; + + blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift); + boff = off % bsize; + + page = grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + if (!page_has_buffers(page)) + create_empty_buffers(page, bsize, 0); + + bh = page_buffers(page); + for(;;) { + /* Find the beginning block within the page */ + if (pg_off >= ((bnum * bsize) + bsize)) { + bh = bh->b_this_page; + bnum++; + blk++; + continue; + } + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, blk, bh, 1); + if (!buffer_mapped(bh)) + goto unlock_out; + /* If it's a newly allocated disk block, zero it */ + if (buffer_new(bh)) + zero_user(page, bnum * bsize, bh->b_size); + } + if (PageUptodate(page)) + set_buffer_uptodate(bh); + if (bh_read(bh, REQ_META | REQ_PRIO) < 0) + goto unlock_out; 
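+		/* Buffer is mapped and uptodate; journal it so the quota
+		 * update becomes part of the current transaction. */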
+ gfs2_trans_add_data(ip->i_gl, bh); + + /* If we need to write to the next block as well */ + if (to_write > (bsize - boff)) { + pg_off += (bsize - boff); + to_write -= (bsize - boff); + boff = pg_off % bsize; + continue; + } + break; + } + + /* Write to the page, now that we have setup the buffer(s) */ + memcpy_to_page(page, off, buf, bytes); + flush_dcache_page(page); + unlock_page(page); + put_page(page); + + return 0; + +unlock_out: + unlock_page(page); + put_page(page); + return -EIO; +} + +static int gfs2_write_disk_quota(struct gfs2_sbd *sdp, struct gfs2_quota *qp, + loff_t loc) +{ + unsigned long pg_beg; + unsigned pg_off, nbytes, overflow = 0; + int error; + void *ptr; + + nbytes = sizeof(struct gfs2_quota); + + pg_beg = loc >> PAGE_SHIFT; + pg_off = offset_in_page(loc); + + /* If the quota straddles a page boundary, split the write in two */ + if ((pg_off + nbytes) > PAGE_SIZE) + overflow = (pg_off + nbytes) - PAGE_SIZE; + + ptr = qp; + error = gfs2_write_buf_to_page(sdp, pg_beg, pg_off, ptr, + nbytes - overflow); + /* If there's an overflow, write the remaining bytes to the next page */ + if (!error && overflow) + error = gfs2_write_buf_to_page(sdp, pg_beg + 1, 0, + ptr + nbytes - overflow, + overflow); + return error; +} + +/** + * gfs2_adjust_quota - adjust record of current block usage + * @sdp: The superblock + * @loc: Offset of the entry in the quota file + * @change: The amount of usage change to record + * @qd: The quota data + * @fdq: The updated limits to record + * + * This function was mostly borrowed from gfs2_block_truncate_page which was + * in turn mostly borrowed from ext3 + * + * Returns: 0 or -ve on error + */ + +static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc, + s64 change, struct gfs2_quota_data *qd, + struct qc_dqblk *fdq) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct inode *inode = &ip->i_inode; + struct gfs2_quota q; + int err; + u64 size; + + if (gfs2_is_stuffed(ip)) { + err = gfs2_unstuff_dinode(ip); + if (err) + return err; + } + + memset(&q, 0, sizeof(struct gfs2_quota)); + err = gfs2_internal_read(ip, (char *)&q, &loc, sizeof(q)); + if (err < 0) + return err; + + loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */ + be64_add_cpu(&q.qu_value, change); + if (((s64)be64_to_cpu(q.qu_value)) < 0) + q.qu_value = 0; /* Never go negative on quota usage */ + qd->qd_qb.qb_value = q.qu_value; + if (fdq) { + if (fdq->d_fieldmask & QC_SPC_SOFT) { + q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift); + qd->qd_qb.qb_warn = q.qu_warn; + } + if (fdq->d_fieldmask & QC_SPC_HARD) { + q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift); + qd->qd_qb.qb_limit = q.qu_limit; + } + if (fdq->d_fieldmask & QC_SPACE) { + q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift); + qd->qd_qb.qb_value = q.qu_value; + } + } + + err = gfs2_write_disk_quota(sdp, &q, loc); + if (!err) { + size = loc + sizeof(struct gfs2_quota); + if (size > inode->i_size) + i_size_write(inode, size); + inode->i_mtime = inode_set_ctime_current(inode); + mark_inode_dirty(inode); + set_bit(QDF_REFRESH, &qd->qd_flags); + } + + return err; +} + +static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) +{ + struct gfs2_sbd *sdp = (*qda)->qd_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_alloc_parms ap = { .aflags = 0, }; + unsigned int data_blocks, ind_blocks; + struct gfs2_holder *ghs, i_gh; + unsigned int qx, x; + struct gfs2_quota_data *qd; + unsigned 
reserved; + loff_t offset; + unsigned int nalloc = 0, blocks; + int error; + + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + + ghs = kmalloc_array(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); + if (!ghs) + return -ENOMEM; + + sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); + inode_lock(&ip->i_inode); + for (qx = 0; qx < num_qd; qx++) { + error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &ghs[qx]); + if (error) + goto out_dq; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out_dq; + + for (x = 0; x < num_qd; x++) { + offset = qd2offset(qda[x]); + if (gfs2_write_alloc_required(ip, offset, + sizeof(struct gfs2_quota))) + nalloc++; + } + + /* + * 1 blk for unstuffing inode if stuffed. We add this extra + * block to the reservation unconditionally. If the inode + * doesn't need unstuffing, the block will be released to the + * rgrp since it won't be allocated during the transaction + */ + /* +3 in the end for unstuffing block, inode size update block + * and another block in case quota straddles page boundary and + * two blocks need to be updated instead of 1 */ + blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; + + reserved = 1 + (nalloc * (data_blocks + ind_blocks)); + ap.target = reserved; + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_alloc; + + if (nalloc) + blocks += gfs2_rg_blocks(ip, reserved) + nalloc * ind_blocks + RES_STATFS; + + error = gfs2_trans_begin(sdp, blocks, 0); + if (error) + goto out_ipres; + + for (x = 0; x < num_qd; x++) { + qd = qda[x]; + offset = qd2offset(qd); + error = gfs2_adjust_quota(sdp, offset, qd->qd_change_sync, qd, + NULL); + if (error) + goto out_end_trans; + + do_qc(qd, -qd->qd_change_sync, QC_SYNC); + set_bit(QDF_REFRESH, &qd->qd_flags); + } + +out_end_trans: + gfs2_trans_end(sdp); +out_ipres: + gfs2_inplace_release(ip); +out_alloc: + gfs2_glock_dq_uninit(&i_gh); +out_dq: + while (qx--) + gfs2_glock_dq_uninit(&ghs[qx]); + inode_unlock(&ip->i_inode); + kfree(ghs); + gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, + GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC); + if (!error) { + for (x = 0; x < num_qd; x++) + qda[x]->qd_sync_gen = sdp->sd_quota_sync_gen; + } + return error; +} + +static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota q; + struct gfs2_quota_lvb *qlvb; + loff_t pos; + int error; + + memset(&q, 0, sizeof(struct gfs2_quota)); + pos = qd2offset(qd); + error = gfs2_internal_read(ip, (char *)&q, &pos, sizeof(q)); + if (error < 0) + return error; + + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; + qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); + qlvb->__pad = 0; + qlvb->qb_limit = q.qu_limit; + qlvb->qb_warn = q.qu_warn; + qlvb->qb_value = q.qu_value; + qd->qd_qb = *qlvb; + + return 0; +} + +static int do_glock(struct gfs2_quota_data *qd, int force_refresh, + struct gfs2_holder *q_gh) +{ + struct gfs2_sbd *sdp = qd->qd_sbd; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_holder i_gh; + int error; + + gfs2_assert_warn(sdp, sdp == qd->qd_gl->gl_name.ln_sbd); +restart: + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); + if (error) + return error; + + if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) + force_refresh = FORCE; + + qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; + + if (force_refresh || qd->qd_qb.qb_magic != 
cpu_to_be32(GFS2_MAGIC)) {
+		gfs2_glock_dq_uninit(q_gh);
+		error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE,
+					   GL_NOCACHE, q_gh);
+		if (error)
+			return error;
+
+		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+		if (error)
+			goto fail;
+
+		error = update_qd(sdp, qd);
+		if (error)
+			goto fail_gunlock;
+
+		gfs2_glock_dq_uninit(&i_gh);
+		gfs2_glock_dq_uninit(q_gh);
+		force_refresh = 0;
+		goto restart;
+	}
+
+	return 0;
+
+fail_gunlock:
+	gfs2_glock_dq_uninit(&i_gh);
+fail:
+	gfs2_glock_dq_uninit(q_gh);
+	return error;
+}
+
+int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_quota_data *qd;
+	u32 x;
+	int error;
+
+	if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON &&
+	    sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+		return 0;
+
+	error = gfs2_quota_hold(ip, uid, gid);
+	if (error)
+		return error;
+
+	sort(ip->i_qadata->qa_qd, ip->i_qadata->qa_qd_num,
+	     sizeof(struct gfs2_quota_data *), sort_qd, NULL);
+
+	for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+		qd = ip->i_qadata->qa_qd[x];
+		error = do_glock(qd, NO_FORCE, &ip->i_qadata->qa_qd_ghs[x]);
+		if (error)
+			break;
+	}
+
+	if (!error)
+		set_bit(GIF_QD_LOCKED, &ip->i_flags);
+	else {
+		while (x--)
+			gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]);
+		gfs2_quota_unhold(ip);
+	}
+
+	return error;
+}
+
+static bool need_sync(struct gfs2_quota_data *qd)
+{
+	struct gfs2_sbd *sdp = qd->qd_sbd;
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	s64 value;
+	unsigned int num, den;
+
+	if (!qd->qd_qb.qb_limit)
+		return false;
+
+	spin_lock(&qd_lock);
+	value = qd->qd_change;
+	spin_unlock(&qd_lock);
+
+	spin_lock(&gt->gt_spin);
+	num = gt->gt_quota_scale_num;
+	den = gt->gt_quota_scale_den;
+	spin_unlock(&gt->gt_spin);
+
+	if (value <= 0)
+		return false;
+	else if ((s64)be64_to_cpu(qd->qd_qb.qb_value) >=
+		 (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+		return false;
+	else {
+		value *= gfs2_jindex_size(sdp) * num;
+		value = div_s64(value, den);
+		value += (s64)be64_to_cpu(qd->qd_qb.qb_value);
+		if (value < (s64)be64_to_cpu(qd->qd_qb.qb_limit))
+			return false;
+	}
+
+	return true;
+}
+
+void gfs2_quota_unlock(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_quota_data *qda[2 * GFS2_MAXQUOTAS];
+	unsigned int count = 0;
+	u32 x;
+	int found;
+
+	if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
+		return;
+
+	for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+		struct gfs2_quota_data *qd;
+		bool sync;
+
+		qd = ip->i_qadata->qa_qd[x];
+		sync = need_sync(qd);
+
+		gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]);
+		if (!sync)
+			continue;
+
+		spin_lock(&qd_lock);
+		found = qd_check_sync(sdp, qd, NULL);
+		spin_unlock(&qd_lock);
+
+		if (!found)
+			continue;
+
+		if (!qd_bh_get_or_undo(sdp, qd))
+			qda[count++] = qd;
+	}
+
+	if (count) {
+		do_sync(count, qda);
+		for (x = 0; x < count; x++)
+			qd_unlock(qda[x]);
+	}
+
+	gfs2_quota_unhold(ip);
+}
+
+#define MAX_LINE 256
+
+static int print_message(struct gfs2_quota_data *qd, char *type)
+{
+	struct gfs2_sbd *sdp = qd->qd_sbd;
+
+	if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET)
+		fs_info(sdp, "quota %s for %s %u\n",
+			type,
+			(qd->qd_id.type == USRQUOTA) ? "user" : "group",
+			from_kqid(&init_user_ns, qd->qd_id));
+
+	return 0;
+}
+
+/**
+ * gfs2_quota_check - check if allocating new blocks will exceed quota
+ * @ip: The inode for which this check is being performed
+ * @uid: The uid to check against
+ * @gid: The gid to check against
+ * @ap: The allocation parameters.
ap->target contains the requested + * blocks. ap->min_target, if set, contains the minimum blks + * requested. + * + * Returns: 0 on success. + * min_req = ap->min_target ? ap->min_target : ap->target; + * quota must allow at least min_req blks for success and + * ap->allowed is set to the number of blocks allowed + * + * -EDQUOT otherwise, quota violation. ap->allowed is set to number + * of blocks available. + */ +int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, + struct gfs2_alloc_parms *ap) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_quota_data *qd; + s64 value, warn, limit; + u32 x; + int error = 0; + + ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ + if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) + return 0; + + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qd = ip->i_qadata->qa_qd[x]; + + if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) || + qid_eq(qd->qd_id, make_kqid_gid(gid)))) + continue; + + warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn); + limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit); + value = (s64)be64_to_cpu(qd->qd_qb.qb_value); + spin_lock(&qd_lock); + value += qd->qd_change; + spin_unlock(&qd_lock); + + if (limit > 0 && (limit - value) < ap->allowed) + ap->allowed = limit - value; + /* If we can't meet the target */ + if (limit && limit < (value + (s64)ap->target)) { + /* If no min_target specified or we don't meet + * min_target, return -EDQUOT */ + if (!ap->min_target || ap->min_target > ap->allowed) { + if (!test_and_set_bit(QDF_QMSG_QUIET, + &qd->qd_flags)) { + print_message(qd, "exceeded"); + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, + QUOTA_NL_BHARDWARN); + } + error = -EDQUOT; + break; + } + } else if (warn && warn < value && + time_after_eq(jiffies, qd->qd_last_warn + + gfs2_tune_get(sdp, gt_quota_warn_period) + * HZ)) { + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); + error = print_message(qd, "warning"); + qd->qd_last_warn = jiffies; + } + } + return error; +} + +void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + kuid_t uid, kgid_t gid) +{ + struct gfs2_quota_data *qd; + u32 x; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + if ((sdp->sd_args.ar_quota != GFS2_QUOTA_ON && + sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) || + gfs2_assert_warn(sdp, change)) + return; + if (ip->i_diskflags & GFS2_DIF_SYSTEM) + return; + + if (gfs2_assert_withdraw(sdp, ip->i_qadata && + ip->i_qadata->qa_ref > 0)) + return; + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qd = ip->i_qadata->qa_qd[x]; + + if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || + qid_eq(qd->qd_id, make_kqid_gid(gid))) { + do_qc(qd, change, QC_CHANGE); + } + } +} + +static bool qd_changed(struct gfs2_sbd *sdp) +{ + struct gfs2_quota_data *qd; + bool changed = false; + + spin_lock(&qd_lock); + list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { + if (test_bit(QDF_LOCKED, &qd->qd_flags) || + !test_bit(QDF_CHANGE, &qd->qd_flags)) + continue; + + changed = true; + break; + } + spin_unlock(&qd_lock); + return changed; +} + +int gfs2_quota_sync(struct super_block *sb, int type) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_quota_data **qda; + unsigned int max_qd = PAGE_SIZE / sizeof(struct gfs2_holder); + unsigned int num_qd; + unsigned int x; + int error = 0; + + if (!qd_changed(sdp)) + return 0; + + qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); + if (!qda) + return -ENOMEM; + + mutex_lock(&sdp->sd_quota_sync_mutex); + sdp->sd_quota_sync_gen++; + + do { + num_qd = 0; + + for (;;) { + 
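+			/* Collect changed quota entries, at most max_qd per
+			 * do_sync() batch. */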
error = qd_fish(sdp, qda + num_qd); + if (error || !qda[num_qd]) + break; + if (++num_qd == max_qd) + break; + } + + if (num_qd) { + if (!error) + error = do_sync(num_qd, qda); + + for (x = 0; x < num_qd; x++) + qd_unlock(qda[x]); + } + } while (!error && num_qd == max_qd); + + mutex_unlock(&sdp->sd_quota_sync_mutex); + kfree(qda); + + return error; +} + +int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) +{ + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + error = qd_get(sdp, qid, &qd); + if (error) + return error; + + error = do_glock(qd, FORCE, &q_gh); + if (!error) + gfs2_glock_dq_uninit(&q_gh); + + qd_put(qd); + return error; +} + +int gfs2_quota_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); + u64 size = i_size_read(sdp->sd_qc_inode); + unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; + unsigned int x, slot = 0; + unsigned int found = 0; + unsigned int hash; + unsigned int bm_size; + u64 dblock; + u32 extlen = 0; + int error; + + if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20)) + return -EIO; + + sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; + bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); + bm_size *= sizeof(unsigned long); + error = -ENOMEM; + sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); + if (sdp->sd_quota_bitmap == NULL) + sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | + __GFP_ZERO); + if (!sdp->sd_quota_bitmap) + return error; + + for (x = 0; x < blocks; x++) { + struct buffer_head *bh; + const struct gfs2_quota_change *qc; + unsigned int y; + + if (!extlen) { + extlen = 32; + error = gfs2_get_extent(&ip->i_inode, x, &dblock, &extlen); + if (error) + goto fail; + } + error = -EIO; + bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); + if (!bh) + goto fail; + if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) { + brelse(bh); + goto fail; + } + + qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header)); + for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; + y++, slot++) { + struct gfs2_quota_data *qd; + s64 qc_change = be64_to_cpu(qc->qc_change); + u32 qc_flags = be32_to_cpu(qc->qc_flags); + enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ? 
+						USRQUOTA : GRPQUOTA;
+			struct kqid qc_id = make_kqid(&init_user_ns, qtype,
+						      be32_to_cpu(qc->qc_id));
+			qc++;
+			if (!qc_change)
+				continue;
+
+			hash = gfs2_qd_hash(sdp, qc_id);
+			qd = qd_alloc(hash, sdp, qc_id);
+			if (qd == NULL) {
+				brelse(bh);
+				goto fail;
+			}
+
+			set_bit(QDF_CHANGE, &qd->qd_flags);
+			qd->qd_change = qc_change;
+			qd->qd_slot = slot;
+			qd->qd_slot_ref = 1;
+
+			spin_lock(&qd_lock);
+			BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
+			list_add(&qd->qd_list, &sdp->sd_quota_list);
+			atomic_inc(&sdp->sd_quota_count);
+			spin_unlock(&qd_lock);
+
+			spin_lock_bucket(hash);
+			hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
+			spin_unlock_bucket(hash);
+
+			found++;
+		}
+
+		brelse(bh);
+		dblock++;
+		extlen--;
+	}
+
+	if (found)
+		fs_info(sdp, "found %u quota changes\n", found);
+
+	return 0;
+
+fail:
+	gfs2_quota_cleanup(sdp);
+	return error;
+}
+
+void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
+{
+	struct gfs2_quota_data *qd;
+	LIST_HEAD(dispose);
+	int count;
+
+	BUG_ON(!test_bit(SDF_NORECOVERY, &sdp->sd_flags) &&
+		test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
+
+	spin_lock(&qd_lock);
+	list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
+		spin_lock(&qd->qd_lockref.lock);
+		if (qd->qd_lockref.count != 0) {
+			spin_unlock(&qd->qd_lockref.lock);
+			continue;
+		}
+		lockref_mark_dead(&qd->qd_lockref);
+		spin_unlock(&qd->qd_lockref.lock);
+
+		list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+		list_add(&qd->qd_lru, &dispose);
+	}
+	spin_unlock(&qd_lock);
+
+	gfs2_qd_list_dispose(&dispose);
+
+	wait_event_timeout(sdp->sd_kill_wait,
+		(count = atomic_read(&sdp->sd_quota_count)) == 0,
+		HZ * 60);
+
+	if (count != 0)
+		fs_err(sdp, "%d left-over quota data objects\n", count);
+
+	kvfree(sdp->sd_quota_bitmap);
+	sdp->sd_quota_bitmap = NULL;
+}
+
+static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
+{
+	if (error == 0 || error == -EROFS)
+		return;
+	if (!gfs2_withdrawn(sdp)) {
+		if (!cmpxchg(&sdp->sd_log_error, 0, error))
+			fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
+		wake_up(&sdp->sd_logd_waitq);
+	}
+}
+
+static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg,
+			       int (*fxn)(struct super_block *sb, int type),
+			       unsigned long t, unsigned long *timeo,
+			       unsigned int *new_timeo)
+{
+	if (t >= *timeo) {
+		int error = fxn(sdp->sd_vfs, 0);
+		quotad_error(sdp, msg, error);
+		*timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ;
+	} else {
+		*timeo -= t;
+	}
+}
+
+void gfs2_wake_up_statfs(struct gfs2_sbd *sdp)
+{
+	if (!sdp->sd_statfs_force_sync) {
+		sdp->sd_statfs_force_sync = 1;
+		wake_up(&sdp->sd_quota_wait);
+	}
+}
+
+/**
+ * gfs2_quotad - Write cached quota changes into the quota file
+ * @data: Pointer to GFS2 superblock
+ *
+ */
+
+int gfs2_quotad(void *data)
+{
+	struct gfs2_sbd *sdp = data;
+	struct gfs2_tune *tune = &sdp->sd_tune;
+	unsigned long statfs_timeo = 0;
+	unsigned long quotad_timeo = 0;
+	unsigned long t = 0;
+
+	while (!kthread_should_stop()) {
+		if (gfs2_withdrawn(sdp))
+			break;
+
+		/* Update the master statfs file */
+		if (sdp->sd_statfs_force_sync) {
+			int error = gfs2_statfs_sync(sdp->sd_vfs, 0);
+			quotad_error(sdp, "statfs", error);
+			statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ;
+		} else
+			quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t,
+					   &statfs_timeo,
+					   &tune->gt_statfs_quantum);
+
+		/* Update quota file */
+		quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
+				   &quotad_timeo, &tune->gt_quota_quantum);
+
+		try_to_freeze();
+
+		t = min(quotad_timeo, statfs_timeo);
+
+		t =
wait_event_interruptible_timeout(sdp->sd_quota_wait, + sdp->sd_statfs_force_sync || + gfs2_withdrawn(sdp) || + kthread_should_stop(), + t); + + if (sdp->sd_statfs_force_sync) + t = 0; + } + + return 0; +} + +static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + memset(state, 0, sizeof(*state)); + + switch (sdp->sd_args.ar_quota) { + case GFS2_QUOTA_QUIET: + fallthrough; + case GFS2_QUOTA_ON: + state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; + state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; + fallthrough; + case GFS2_QUOTA_ACCOUNT: + state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED | + QCI_SYSFILE; + state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED | + QCI_SYSFILE; + break; + case GFS2_QUOTA_OFF: + break; + } + if (sdp->sd_quota_inode) { + state->s_state[USRQUOTA].ino = + GFS2_I(sdp->sd_quota_inode)->i_no_addr; + state->s_state[USRQUOTA].blocks = sdp->sd_quota_inode->i_blocks; + } + state->s_state[USRQUOTA].nextents = 1; /* unsupported */ + state->s_state[GRPQUOTA] = state->s_state[USRQUOTA]; + state->s_incoredqs = list_lru_count(&gfs2_qd_lru); + return 0; +} + +static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, + struct qc_dqblk *fdq) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_quota_lvb *qlvb; + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh; + int error; + + memset(fdq, 0, sizeof(*fdq)); + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return -ESRCH; /* Crazy XFS error code */ + + if ((qid.type != USRQUOTA) && + (qid.type != GRPQUOTA)) + return -EINVAL; + + error = qd_get(sdp, qid, &qd); + if (error) + return error; + error = do_glock(qd, FORCE, &q_gh); + if (error) + goto out; + + qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; + fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift; + fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift; + fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift; + + gfs2_glock_dq_uninit(&q_gh); +out: + qd_put(qd); + return error; +} + +/* GFS2 only supports a subset of the XFS fields */ +#define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE) + +static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, + struct qc_dqblk *fdq) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); + struct gfs2_quota_data *qd; + struct gfs2_holder q_gh, i_gh; + unsigned int data_blocks, ind_blocks; + unsigned int blocks = 0; + int alloc_required; + loff_t offset; + int error; + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return -ESRCH; /* Crazy XFS error code */ + + if ((qid.type != USRQUOTA) && + (qid.type != GRPQUOTA)) + return -EINVAL; + + if (fdq->d_fieldmask & ~GFS2_FIELDMASK) + return -EINVAL; + + error = qd_get(sdp, qid, &qd); + if (error) + return error; + + error = gfs2_qa_get(ip); + if (error) + goto out_put; + + inode_lock(&ip->i_inode); + error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); + if (error) + goto out_unlockput; + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (error) + goto out_q; + + /* Check for existing entry, if none then alloc new blocks */ + error = update_qd(sdp, qd); + if (error) + goto out_i; + + /* If nothing has changed, this is a no-op */ + if ((fdq->d_fieldmask & QC_SPC_SOFT) && + ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) + fdq->d_fieldmask ^= QC_SPC_SOFT; + + if ((fdq->d_fieldmask & 
QC_SPC_HARD) && + ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) + fdq->d_fieldmask ^= QC_SPC_HARD; + + if ((fdq->d_fieldmask & QC_SPACE) && + ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value))) + fdq->d_fieldmask ^= QC_SPACE; + + if (fdq->d_fieldmask == 0) + goto out_i; + + offset = qd2offset(qd); + alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); + if (gfs2_is_stuffed(ip)) + alloc_required = 1; + if (alloc_required) { + struct gfs2_alloc_parms ap = { .aflags = 0, }; + gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), + &data_blocks, &ind_blocks); + blocks = 1 + data_blocks + ind_blocks; + ap.target = blocks; + error = gfs2_inplace_reserve(ip, &ap); + if (error) + goto out_i; + blocks += gfs2_rg_blocks(ip, blocks); + } + + /* Some quotas span block boundaries and can update two blocks, + adding an extra block to the transaction to handle such quotas */ + error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0); + if (error) + goto out_release; + + /* Apply changes */ + error = gfs2_adjust_quota(sdp, offset, 0, qd, fdq); + if (!error) + clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); + + gfs2_trans_end(sdp); +out_release: + if (alloc_required) + gfs2_inplace_release(ip); +out_i: + gfs2_glock_dq_uninit(&i_gh); +out_q: + gfs2_glock_dq_uninit(&q_gh); +out_unlockput: + gfs2_qa_put(ip); + inode_unlock(&ip->i_inode); +out_put: + qd_put(qd); + return error; +} + +const struct quotactl_ops gfs2_quotactl_ops = { + .quota_sync = gfs2_quota_sync, + .get_state = gfs2_quota_get_state, + .get_dqblk = gfs2_get_dqblk, + .set_dqblk = gfs2_set_dqblk, +}; + +void __init gfs2_quota_hash_init(void) +{ + unsigned i; + + for(i = 0; i < GFS2_QD_HASH_SIZE; i++) + INIT_HLIST_BL_HEAD(&qd_hash_table[i]); +} diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h new file mode 100644 index 0000000000..1429945215 --- /dev/null +++ b/fs/gfs2/quota.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#ifndef __QUOTA_DOT_H__ +#define __QUOTA_DOT_H__ + +#include <linux/list_lru.h> + +struct gfs2_inode; +struct gfs2_sbd; + +#define NO_UID_QUOTA_CHANGE INVALID_UID +#define NO_GID_QUOTA_CHANGE INVALID_GID + +extern int gfs2_qa_get(struct gfs2_inode *ip); +extern void gfs2_qa_put(struct gfs2_inode *ip); +extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +extern void gfs2_quota_unhold(struct gfs2_inode *ip); + +extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +extern void gfs2_quota_unlock(struct gfs2_inode *ip); + +extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, + struct gfs2_alloc_parms *ap); +extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + kuid_t uid, kgid_t gid); + +extern int gfs2_quota_sync(struct super_block *sb, int type); +extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid); + +extern int gfs2_quota_init(struct gfs2_sbd *sdp); +extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +extern int gfs2_quotad(void *data); + +extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); + +static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, + struct gfs2_alloc_parms *ap) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int ret; + + ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ + if (capable(CAP_SYS_RESOURCE) || + sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (ret) + return ret; + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON && + sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) + return 0; + ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap); + if (ret) + gfs2_quota_unlock(ip); + return ret; +} + +extern const struct quotactl_ops gfs2_quotactl_ops; +extern struct shrinker gfs2_qd_shrinker; +extern struct list_lru gfs2_qd_lru; +extern void __init gfs2_quota_hash_init(void); + +#endif /* __QUOTA_DOT_H__ */ diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c new file mode 100644 index 0000000000..5aae02669a --- /dev/null +++ b/fs/gfs2/recovery.c @@ -0,0 +1,582 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/crc32c.h> +#include <linux/ktime.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "glock.h" +#include "glops.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "recovery.h" +#include "super.h" +#include "util.h" +#include "dir.h" + +struct workqueue_struct *gfs2_recovery_wq; + +int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_glock *gl = ip->i_gl; + u64 dblock; + u32 extlen; + int error; + + extlen = 32; + error = gfs2_get_extent(&ip->i_inode, blk, &dblock, &extlen); + if (error) + return error; + if (!dblock) { + gfs2_consist_inode(ip); + return -EIO; + } + + *bh = gfs2_meta_ra(gl, dblock, extlen); + + return error; +} + +int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) +{ + struct list_head *head = &jd->jd_revoke_list; + struct gfs2_revoke_replay *rr = NULL, *iter; + + list_for_each_entry(iter, head, rr_list) { + if (iter->rr_blkno == blkno) { + rr = iter; + break; + } + } + + if (rr) { + rr->rr_where = where; + return 0; + } + + rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS); + if (!rr) + return -ENOMEM; + + rr->rr_blkno = blkno; + rr->rr_where = where; + list_add(&rr->rr_list, head); + + return 1; +} + +int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where) +{ + struct gfs2_revoke_replay *rr = NULL, *iter; + int wrap, a, b, revoke; + + list_for_each_entry(iter, &jd->jd_revoke_list, rr_list) { + if (iter->rr_blkno == blkno) { + rr = iter; + break; + } + } + + if (!rr) + return 0; + + wrap = (rr->rr_where < jd->jd_replay_tail); + a = (jd->jd_replay_tail < where); + b = (where < rr->rr_where); + revoke = (wrap) ? 
(a || b) : (a && b);
+
+	return revoke;
+}
+
+void gfs2_revoke_clean(struct gfs2_jdesc *jd)
+{
+	struct list_head *head = &jd->jd_revoke_list;
+	struct gfs2_revoke_replay *rr;
+
+	while (!list_empty(head)) {
+		rr = list_first_entry(head, struct gfs2_revoke_replay, rr_list);
+		list_del(&rr->rr_list);
+		kfree(rr);
+	}
+}
+
+int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh,
+		     unsigned int blkno, struct gfs2_log_header_host *head)
+{
+	u32 hash, crc;
+
+	if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) ||
+	    lh->lh_header.mh_type != cpu_to_be32(GFS2_METATYPE_LH) ||
+	    (blkno && be32_to_cpu(lh->lh_blkno) != blkno))
+		return 1;
+
+	hash = crc32(~0, lh, LH_V1_SIZE - 4);
+	hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */
+
+	if (be32_to_cpu(lh->lh_hash) != hash)
+		return 1;
+
+	crc = crc32c(~0, (void *)lh + LH_V1_SIZE + 4,
+		     sdp->sd_sb.sb_bsize - LH_V1_SIZE - 4);
+
+	if ((lh->lh_crc != 0 && be32_to_cpu(lh->lh_crc) != crc))
+		return 1;
+
+	head->lh_sequence = be64_to_cpu(lh->lh_sequence);
+	head->lh_flags = be32_to_cpu(lh->lh_flags);
+	head->lh_tail = be32_to_cpu(lh->lh_tail);
+	head->lh_blkno = be32_to_cpu(lh->lh_blkno);
+
+	head->lh_local_total = be64_to_cpu(lh->lh_local_total);
+	head->lh_local_free = be64_to_cpu(lh->lh_local_free);
+	head->lh_local_dinodes = be64_to_cpu(lh->lh_local_dinodes);
+
+	return 0;
+}
+
+/**
+ * get_log_header - read the log header for a given segment
+ * @jd: the journal
+ * @blk: the block to look at
+ * @head: the log header to return
+ *
+ * Read the log header for a given segment in a given journal. Do a few
+ * sanity checks on it.
+ *
+ * Returns: 0 on success,
+ *          1 if the header was invalid or incomplete,
+ *          errno on error
+ */
+
+static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
+			  struct gfs2_log_header_host *head)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct buffer_head *bh;
+	int error;
+
+	error = gfs2_replay_read_block(jd, blk, &bh);
+	if (error)
+		return error;
+
+	error = __get_log_header(sdp, (const struct gfs2_log_header *)bh->b_data,
+				 blk, head);
+	brelse(bh);
+
+	return error;
+}
+
+/**
+ * foreach_descriptor - go through the active part of the log
+ * @jd: the journal
+ * @start: the first log header in the active region
+ * @end: the last log header (don't process the contents of this entry)
+ * @pass: iteration number (foreach_descriptor() is called in a for() loop)
+ *
+ * Call a given function once for every log descriptor in the active
+ * portion of the log.
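+ * Log headers encountered inside the active region are validated and then
+ * skipped; only the descriptor blocks between them are scanned.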
+ *
+ * Returns: errno
+ */
+
+static int foreach_descriptor(struct gfs2_jdesc *jd, u32 start,
+			      unsigned int end, int pass)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	struct buffer_head *bh;
+	struct gfs2_log_descriptor *ld;
+	int error = 0;
+	u32 length;
+	__be64 *ptr;
+	unsigned int offset = sizeof(struct gfs2_log_descriptor);
+	offset += sizeof(__be64) - 1;
+	offset &= ~(sizeof(__be64) - 1);
+
+	while (start != end) {
+		error = gfs2_replay_read_block(jd, start, &bh);
+		if (error)
+			return error;
+		if (gfs2_meta_check(sdp, bh)) {
+			brelse(bh);
+			return -EIO;
+		}
+		ld = (struct gfs2_log_descriptor *)bh->b_data;
+		length = be32_to_cpu(ld->ld_length);
+
+		if (be32_to_cpu(ld->ld_header.mh_type) == GFS2_METATYPE_LH) {
+			struct gfs2_log_header_host lh;
+			error = get_log_header(jd, start, &lh);
+			if (!error) {
+				gfs2_replay_incr_blk(jd, &start);
+				brelse(bh);
+				continue;
+			}
+			if (error == 1) {
+				gfs2_consist_inode(GFS2_I(jd->jd_inode));
+				error = -EIO;
+			}
+			brelse(bh);
+			return error;
+		} else if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_LD)) {
+			brelse(bh);
+			return -EIO;
+		}
+		ptr = (__be64 *)(bh->b_data + offset);
+		error = lops_scan_elements(jd, start, ld, ptr, pass);
+		if (error) {
+			brelse(bh);
+			return error;
+		}
+
+		while (length--)
+			gfs2_replay_incr_blk(jd, &start);
+
+		brelse(bh);
+	}
+
+	return 0;
+}
+
+/**
+ * clean_journal - mark a dirty journal as being clean
+ * @jd: the journal
+ * @head: the journal head to start from
+ */
+
+static void clean_journal(struct gfs2_jdesc *jd,
+			  struct gfs2_log_header_host *head)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
+	u32 lblock = head->lh_blkno;
+
+	gfs2_replay_incr_blk(jd, &lblock);
+	gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, lblock,
+			      GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY,
+			      REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC);
+	if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) {
+		sdp->sd_log_flush_head = lblock;
+		gfs2_log_incr_head(sdp);
+	}
+}
+
+static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
+			       unsigned int message)
+{
+	char env_jid[20];
+	char env_status[20];
+	char *envp[] = { env_jid, env_status, NULL };
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+
+	ls->ls_recover_jid_done = jid;
+	ls->ls_recover_jid_status = message;
+	sprintf(env_jid, "JID=%u", jid);
+	sprintf(env_status, "RECOVERY=%s",
+		message == LM_RD_SUCCESS ? "Done" : "Failed");
+	kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
+
+	if (sdp->sd_lockstruct.ls_ops->lm_recovery_result)
+		sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
+}
+
+/**
+ * update_statfs_inode - Update the master statfs inode or zero out the local
+ *                       statfs inode for a given journal.
+ * @jd: The journal
+ * @head: If NULL, @inode is the local statfs inode and we need to zero it out.
+ *        Otherwise, @head contains the statfs change info that needs to be
+ *        synced to the master statfs inode (pointed to by @inode).
+ * @inode: statfs inode to update.
+ */ +static int update_statfs_inode(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct inode *inode) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_inode *ip; + struct buffer_head *bh; + struct gfs2_statfs_change_host sc; + int error = 0; + + BUG_ON(!inode); + ip = GFS2_I(inode); + + error = gfs2_meta_inode_buffer(ip, &bh); + if (error) + goto out; + + spin_lock(&sdp->sd_statfs_spin); + + if (head) { /* Update the master statfs inode */ + gfs2_statfs_change_in(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + sc.sc_total += head->lh_local_total; + sc.sc_free += head->lh_local_free; + sc.sc_dinodes += head->lh_local_dinodes; + gfs2_statfs_change_out(&sc, bh->b_data + sizeof(struct gfs2_dinode)); + + fs_info(sdp, "jid=%u: Updated master statfs Total:%lld, " + "Free:%lld, Dinodes:%lld after change " + "[%+lld,%+lld,%+lld]\n", jd->jd_jid, sc.sc_total, + sc.sc_free, sc.sc_dinodes, head->lh_local_total, + head->lh_local_free, head->lh_local_dinodes); + } else { /* Zero out the local statfs inode */ + memset(bh->b_data + sizeof(struct gfs2_dinode), 0, + sizeof(struct gfs2_statfs_change)); + /* If it's our own journal, reset any in-memory changes too */ + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { + memset(&sdp->sd_statfs_local, 0, + sizeof(struct gfs2_statfs_change_host)); + } + } + spin_unlock(&sdp->sd_statfs_spin); + + mark_buffer_dirty(bh); + brelse(bh); + gfs2_inode_metasync(ip->i_gl); + +out: + return error; +} + +/** + * recover_local_statfs - Update the master and local statfs changes for this + * journal. + * + * Previously, statfs updates would be read in from the local statfs inode and + * synced to the master statfs inode during recovery. + * + * We now use the statfs updates in the journal head to update the master statfs + * inode instead of reading in from the local statfs inode. To preserve backward + * compatibility with kernels that can't do this, we still need to keep the + * local statfs inode up to date by writing changes to it. At some point in the + * future, we can do away with the local statfs inodes altogether and keep the + * statfs changes solely in the journal. + * + * @jd: the journal + * @head: the journal head + * + * Returns: errno + */ +static void recover_local_statfs(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head) +{ + int error; + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + + if (!head->lh_local_total && !head->lh_local_free + && !head->lh_local_dinodes) /* No change */ + goto zero_local; + + /* First update the master statfs inode with the changes we + * found in the journal. */ + error = update_statfs_inode(jd, head, sdp->sd_statfs_inode); + if (error) + goto out; + +zero_local: + /* Zero out the local statfs inode so any changes in there + * are not re-recovered. 
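+	 * Older kernels that still read the local statfs inodes will then
+	 * see no stale deltas.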
*/ + error = update_statfs_inode(jd, NULL, + find_local_statfs_inode(sdp, jd->jd_jid)); +out: + return; +} + +void gfs2_recover_func(struct work_struct *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_log_header_host head; + struct gfs2_holder j_gh, ji_gh; + ktime_t t_start, t_jlck, t_jhd, t_tlck, t_rep; + int ro = 0; + unsigned int pass; + int error = 0; + int jlocked = 0; + + if (gfs2_withdrawn(sdp)) { + fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n", + jd->jd_jid); + goto fail; + } + t_start = ktime_get(); + if (sdp->sd_args.ar_spectator) + goto fail; + if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { + fs_info(sdp, "jid=%u: Trying to acquire journal glock...\n", + jd->jd_jid); + jlocked = 1; + /* Acquire the journal glock so we can do recovery */ + + error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, + LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | LM_FLAG_TRY | GL_NOCACHE, + &j_gh); + switch (error) { + case 0: + break; + + case GLR_TRYFAILED: + fs_info(sdp, "jid=%u: Busy\n", jd->jd_jid); + error = 0; + goto fail; + + default: + goto fail; + } + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_NOCACHE, &ji_gh); + if (error) + goto fail_gunlock_j; + } else { + fs_info(sdp, "jid=%u, already locked for use\n", jd->jd_jid); + } + + t_jlck = ktime_get(); + fs_info(sdp, "jid=%u: Looking at journal...\n", jd->jd_jid); + + error = gfs2_jdesc_check(jd); + if (error) + goto fail_gunlock_ji; + + error = gfs2_find_jhead(jd, &head, true); + if (error) + goto fail_gunlock_ji; + t_jhd = ktime_get(); + fs_info(sdp, "jid=%u: Journal head lookup took %lldms\n", jd->jd_jid, + ktime_ms_delta(t_jhd, t_jlck)); + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + mutex_lock(&sdp->sd_freeze_mutex); + + if (test_bit(SDF_FROZEN, &sdp->sd_flags)) { + mutex_unlock(&sdp->sd_freeze_mutex); + fs_warn(sdp, "jid=%u: Can't replay: filesystem " + "is frozen\n", jd->jd_jid); + goto fail_gunlock_ji; + } + + if (test_bit(SDF_RORECOVERY, &sdp->sd_flags)) { + ro = 1; + } else if (test_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags)) { + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + ro = 1; + } else { + if (sb_rdonly(sdp->sd_vfs)) { + /* check if device itself is read-only */ + ro = bdev_read_only(sdp->sd_vfs->s_bdev); + if (!ro) { + fs_info(sdp, "recovery required on " + "read-only filesystem.\n"); + fs_info(sdp, "write access will be " + "enabled during recovery.\n"); + } + } + } + + if (ro) { + fs_warn(sdp, "jid=%u: Can't replay: read-only block " + "device\n", jd->jd_jid); + error = -EROFS; + goto fail_gunlock_nofreeze; + } + + t_tlck = ktime_get(); + fs_info(sdp, "jid=%u: Replaying journal...0x%x to 0x%x\n", + jd->jd_jid, head.lh_tail, head.lh_blkno); + + /* We take the sd_log_flush_lock here primarily to prevent log + * flushes and simultaneous journal replays from stomping on + * each other wrt jd_log_bio. 
*/ + down_read(&sdp->sd_log_flush_lock); + for (pass = 0; pass < 2; pass++) { + lops_before_scan(jd, &head, pass); + error = foreach_descriptor(jd, head.lh_tail, + head.lh_blkno, pass); + lops_after_scan(jd, error, pass); + if (error) { + up_read(&sdp->sd_log_flush_lock); + goto fail_gunlock_nofreeze; + } + } + + recover_local_statfs(jd, &head); + clean_journal(jd, &head); + up_read(&sdp->sd_log_flush_lock); + + mutex_unlock(&sdp->sd_freeze_mutex); + t_rep = ktime_get(); + fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, " + "jhead:%lldms, tlck:%lldms, replay:%lldms]\n", + jd->jd_jid, ktime_ms_delta(t_rep, t_start), + ktime_ms_delta(t_jlck, t_start), + ktime_ms_delta(t_jhd, t_jlck), + ktime_ms_delta(t_tlck, t_jhd), + ktime_ms_delta(t_rep, t_tlck)); + } + + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); + + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); + gfs2_glock_dq_uninit(&j_gh); + } + + fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); + goto done; + +fail_gunlock_nofreeze: + mutex_unlock(&sdp->sd_freeze_mutex); +fail_gunlock_ji: + if (jlocked) { + gfs2_glock_dq_uninit(&ji_gh); +fail_gunlock_j: + gfs2_glock_dq_uninit(&j_gh); + } + + fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); +fail: + jd->jd_recover_error = error; + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); +done: + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_atomic(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); +} + +int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) +{ + int rv; + + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; + + /* we have JDF_RECOVERY, queue should always succeed */ + rv = queue_work(gfs2_recovery_wq, &jd->jd_work); + BUG_ON(!rv); + + if (wait) + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + TASK_UNINTERRUPTIBLE); + + return wait ? jd->jd_recover_error : 0; +} + diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h new file mode 100644 index 0000000000..7a0c9d0b75 --- /dev/null +++ b/fs/gfs2/recovery.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +#ifndef __RECOVERY_DOT_H__ +#define __RECOVERY_DOT_H__ + +#include "incore.h" + +extern struct workqueue_struct *gfs2_recovery_wq; + +static inline void gfs2_replay_incr_blk(struct gfs2_jdesc *jd, u32 *blk) +{ + if (++*blk == jd->jd_blocks) + *blk = 0; +} + +extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, + struct buffer_head **bh); + +extern int gfs2_revoke_add(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +extern int gfs2_revoke_check(struct gfs2_jdesc *jd, u64 blkno, unsigned int where); +extern void gfs2_revoke_clean(struct gfs2_jdesc *jd); + +extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); +extern void gfs2_recover_func(struct work_struct *work); +extern int __get_log_header(struct gfs2_sbd *sdp, + const struct gfs2_log_header *lh, unsigned int blkno, + struct gfs2_log_header_host *head); + +#endif /* __RECOVERY_DOT_H__ */ + diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c new file mode 100644 index 0000000000..307b952a41 --- /dev/null +++ b/fs/gfs2/rgrp.c @@ -0,0 +1,2770 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/gfs2_ondisk.h> +#include <linux/prefetch.h> +#include <linux/blkdev.h> +#include <linux/rbtree.h> +#include <linux/random.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "lops.h" +#include "meta_io.h" +#include "quota.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "util.h" +#include "log.h" +#include "inode.h" +#include "trace_gfs2.h" +#include "dir.h" + +#define BFITNOENT ((u32)~0) +#define NO_BLOCK ((u64)~0) + +struct gfs2_rbm { + struct gfs2_rgrpd *rgd; + u32 offset; /* The offset is bitmap relative */ + int bii; /* Bitmap index */ +}; + +static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) +{ + return rbm->rgd->rd_bits + rbm->bii; +} + +static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) +{ + BUG_ON(rbm->offset >= rbm->rgd->rd_data); + return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + + rbm->offset; +} + +/* + * These routines are used by the resource group routines (rgrp.c) + * to keep track of block allocation. Each block is represented by two + * bits. So, each byte represents GFS2_NBBY (i.e. 4) blocks. + * + * 0 = Free + * 1 = Used (not metadata) + * 2 = Unlinked (still in use) inode + * 3 = Used (metadata) + */ + +struct gfs2_extent { + struct gfs2_rbm rbm; + u32 len; +}; + +static const char valid_change[16] = { + /* current */ + /* n */ 0, 1, 1, 1, + /* e */ 1, 0, 0, 0, + /* w */ 0, 0, 0, 1, + 1, 0, 0, 0 +}; + +static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, + struct gfs2_blkreserv *rs, bool nowrap); + + +/** + * gfs2_setbit - Set a bit in the bitmaps + * @rbm: The position of the bit to set + * @do_clone: Also set the clone bitmap, if it exists + * @new_state: the new state of the block + * + */ + +static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone, + unsigned char new_state) +{ + unsigned char *byte1, *byte2, *end, cur_state; + struct gfs2_bitmap *bi = rbm_bi(rbm); + unsigned int buflen = bi->bi_bytes; + const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE; + + byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY); + end = bi->bi_bh->b_data + bi->bi_offset + buflen; + + BUG_ON(byte1 >= end); + + cur_state = (*byte1 >> bit) & GFS2_BIT_MASK; + + if (unlikely(!valid_change[new_state * 4 + cur_state])) { + struct gfs2_sbd *sdp = rbm->rgd->rd_sbd; + + fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n", + rbm->offset, cur_state, new_state); + fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n", + (unsigned long long)rbm->rgd->rd_addr, bi->bi_start, + (unsigned long long)bi->bi_bh->b_blocknr); + fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n", + bi->bi_offset, bi->bi_bytes, + (unsigned long long)gfs2_rbm_to_block(rbm)); + dump_stack(); + gfs2_consist_rgrpd(rbm->rgd); + return; + } + *byte1 ^= (cur_state ^ new_state) << bit; + + if (do_clone && bi->bi_clone) { + byte2 = bi->bi_clone + bi->bi_offset + (rbm->offset / GFS2_NBBY); + cur_state = (*byte2 >> bit) & GFS2_BIT_MASK; + *byte2 ^= (cur_state ^ new_state) << bit; + } +} + +/** + * gfs2_testbit - test a bit in the bitmaps + * @rbm: The bit to test + * @use_clone: If true, test the clone bitmap, not the official bitmap. 
+ *
+ * Some callers like gfs2_unaligned_extlen need to test the clone bitmaps,
+ * not the "real" bitmaps, to avoid allocating recently freed blocks.
+ *
+ * Returns: The two bit block state of the requested bit
+ */
+
+static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
+{
+	struct gfs2_bitmap *bi = rbm_bi(rbm);
+	const u8 *buffer;
+	const u8 *byte;
+	unsigned int bit;
+
+	if (use_clone && bi->bi_clone)
+		buffer = bi->bi_clone;
+	else
+		buffer = bi->bi_bh->b_data;
+	buffer += bi->bi_offset;
+	byte = buffer + (rbm->offset / GFS2_NBBY);
+	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
+
+	return (*byte >> bit) & GFS2_BIT_MASK;
+}
+
+/**
+ * gfs2_bit_search
+ * @ptr: Pointer to bitmap data
+ * @mask: Mask to use (normally 0x55555.... but adjusted for search start)
+ * @state: The state we are searching for
+ *
+ * We xor the bitmap data with a pattern which is the bitwise opposite
+ * of what we are looking for; this gives rise to a pattern of ones
+ * wherever there is a match. Since we have two bits per entry, we
+ * take this pattern, shift it down by one place and then AND it with
+ * the original. All the even bit positions (0,2,4, etc) then represent
+ * successful matches, so we mask with 0x55555..... to remove the unwanted
+ * odd bit positions.
+ *
+ * This allows searching of a whole u64 at once (32 blocks) with a
+ * single test (on 64 bit arches).
+ */
+
+static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
+{
+	u64 tmp;
+	static const u64 search[] = {
+		[0] = 0xffffffffffffffffULL,
+		[1] = 0xaaaaaaaaaaaaaaaaULL,
+		[2] = 0x5555555555555555ULL,
+		[3] = 0x0000000000000000ULL,
+	};
+	tmp = le64_to_cpu(*ptr) ^ search[state];
+	tmp &= (tmp >> 1);
+	tmp &= mask;
+	return tmp;
+}
+
+/**
+ * rs_cmp - multi-block reservation range compare
+ * @start: start of the new reservation
+ * @len: number of blocks in the new reservation
+ * @rs: existing reservation to compare against
+ *
+ * returns: 1 if the block range is beyond the reach of the reservation
+ *         -1 if the block range is before the start of the reservation
+ *          0 if the block range overlaps with the reservation
+ */
+static inline int rs_cmp(u64 start, u32 len, struct gfs2_blkreserv *rs)
+{
+	if (start >= rs->rs_start + rs->rs_requested)
+		return 1;
+	if (rs->rs_start >= start + len)
+		return -1;
+	return 0;
+}
+
+/**
+ * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
+ *       a block in a given allocation state.
+ * @buf: the buffer that holds the bitmaps
+ * @len: the length (in bytes) of the buffer
+ * @goal: start search at this block's bit-pair (within @buf)
+ * @state: GFS2_BLKST_XXX the state of the block we're looking for.
+ *
+ * Scope of @goal and returned block number is only within this bitmap buffer,
+ * not entire rgrp or filesystem. @buf will be offset from the actual
+ * beginning of a bitmap block buffer, skipping any header structures, but
+ * headers are always a multiple of 64 bits long so that the buffer is
+ * always aligned to a 64 bit boundary.
+ *
+ * The size of the buffer is in bytes, but it is assumed that it is
+ * always ok to read a complete multiple of 64 bits at the end
+ * of the block in case the end is not aligned to a natural boundary.
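+ *
+ * For example (illustrative numbers): a @goal of 40 starts the scan at
+ * u64 word 40 >> 5 == 1, with the low (40 << 1) & 63 == 16 mask bits
+ * cleared so that earlier bit-pairs in that word are ignored; a return
+ * value of 45 then names the 45th bit-pair of @buf, which the caller
+ * must still translate into a filesystem block address.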
+ *
+ * Return: the block number (bitmap buffer scope) that was found
+ */
+
+static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
+		       u32 goal, u8 state)
+{
+	u32 spoint = (goal << 1) & ((8*sizeof(u64)) - 1);
+	const __le64 *ptr = ((__le64 *)buf) + (goal >> 5);
+	const __le64 *end = (__le64 *)(buf + ALIGN(len, sizeof(u64)));
+	u64 tmp;
+	u64 mask = 0x5555555555555555ULL;
+	u32 bit;
+
+	/* Mask off bits we don't care about at the start of the search */
+	mask <<= spoint;
+	tmp = gfs2_bit_search(ptr, mask, state);
+	ptr++;
+	while(tmp == 0 && ptr < end) {
+		tmp = gfs2_bit_search(ptr, 0x5555555555555555ULL, state);
+		ptr++;
+	}
+	/* Mask off any bits which are more than len bytes from the start */
+	if (ptr == end && (len & (sizeof(u64) - 1)))
+		tmp &= (((u64)~0) >> (64 - 8*(len & (sizeof(u64) - 1))));
+	/* Didn't find anything, so return */
+	if (tmp == 0)
+		return BFITNOENT;
+	ptr--;
+	bit = __ffs64(tmp);
+	bit /= 2;	/* two bits per entry in the bitmap */
+	return (((const unsigned char *)ptr - buf) * GFS2_NBBY) + bit;
+}
+
+/**
+ * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
+ * @rbm: The rbm with rgd already set correctly
+ * @block: The block number (filesystem relative)
+ *
+ * This sets the bi and offset members of an rbm based on a
+ * resource group and a filesystem relative block number. The
+ * resource group must be set in the rbm on entry, the bi and
+ * offset members will be set by this function.
+ *
+ * Returns: 0 on success, or an error code
+ */
+
+static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
+{
+	if (!rgrp_contains_block(rbm->rgd, block))
+		return -E2BIG;
+	rbm->bii = 0;
+	rbm->offset = block - rbm->rgd->rd_data0;
+	/* Check if the block is within the first bitmap block */
+	if (rbm->offset < rbm_bi(rbm)->bi_blocks)
+		return 0;
+
+	/* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */
+	rbm->offset += (sizeof(struct gfs2_rgrp) -
+			sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
+	rbm->bii = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
+	rbm->offset -= rbm->bii * rbm->rgd->rd_sbd->sd_blocks_per_bitmap;
+	return 0;
+}
+
+/**
+ * gfs2_rbm_add - add a number of blocks to an rbm
+ * @rbm: The rbm with rgd already set correctly
+ * @blocks: The number of blocks to add to @rbm
+ *
+ * This function takes an existing rbm structure and adds a number of blocks to
+ * it.
+ *
+ * Returns: True if the new rbm would point past the end of the rgrp.
+ */
+
+static bool gfs2_rbm_add(struct gfs2_rbm *rbm, u32 blocks)
+{
+	struct gfs2_rgrpd *rgd = rbm->rgd;
+	struct gfs2_bitmap *bi = rgd->rd_bits + rbm->bii;
+
+	if (rbm->offset + blocks < bi->bi_blocks) {
+		rbm->offset += blocks;
+		return false;
+	}
+	blocks -= bi->bi_blocks - rbm->offset;
+
+	for(;;) {
+		bi++;
+		if (bi == rgd->rd_bits + rgd->rd_length)
+			return true;
+		if (blocks < bi->bi_blocks) {
+			rbm->offset = blocks;
+			rbm->bii = bi - rgd->rd_bits;
+			return false;
+		}
+		blocks -= bi->bi_blocks;
+	}
+}
+
+/**
+ * gfs2_unaligned_extlen - Look for free blocks which are not byte aligned
+ * @rbm: Position to search (value/result)
+ * @n_unaligned: Number of unaligned blocks to check
+ * @len: Decremented for each block found (terminate on zero)
+ *
+ * Returns: true if a non-free block is encountered or the end of the resource
+ *          group is reached.
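+ *
+ * (Illustrative use: gfs2_free_extlen() below calls this for the ragged
+ * start of a scan; at rbm.offset == 5, i.e. offset & 3 == 1, it checks
+ * 4 - 1 == 3 blocks one by one so that the rest of the scan is byte
+ * aligned and can use memchr_inv().)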
+ */ + +static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len) +{ + u32 n; + u8 res; + + for (n = 0; n < n_unaligned; n++) { + res = gfs2_testbit(rbm, true); + if (res != GFS2_BLKST_FREE) + return true; + (*len)--; + if (*len == 0) + return true; + if (gfs2_rbm_add(rbm, 1)) + return true; + } + + return false; +} + +/** + * gfs2_free_extlen - Return extent length of free blocks + * @rrbm: Starting position + * @len: Max length to check + * + * Starting at the block specified by the rbm, see how many free blocks + * there are, not reading more than len blocks ahead. This can be done + * using memchr_inv when the blocks are byte aligned, but has to be done + * on a block by block basis in case of unaligned blocks. Also this + * function can cope with bitmap boundaries (although it must stop on + * a resource group boundary) + * + * Returns: Number of free blocks in the extent + */ + +static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) +{ + struct gfs2_rbm rbm = *rrbm; + u32 n_unaligned = rbm.offset & 3; + u32 size = len; + u32 bytes; + u32 chunk_size; + u8 *ptr, *start, *end; + u64 block; + struct gfs2_bitmap *bi; + + if (n_unaligned && + gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len)) + goto out; + + n_unaligned = len & 3; + /* Start is now byte aligned */ + while (len > 3) { + bi = rbm_bi(&rbm); + start = bi->bi_bh->b_data; + if (bi->bi_clone) + start = bi->bi_clone; + start += bi->bi_offset; + end = start + bi->bi_bytes; + BUG_ON(rbm.offset & 3); + start += (rbm.offset / GFS2_NBBY); + bytes = min_t(u32, len / GFS2_NBBY, (end - start)); + ptr = memchr_inv(start, 0, bytes); + chunk_size = ((ptr == NULL) ? bytes : (ptr - start)); + chunk_size *= GFS2_NBBY; + BUG_ON(len < chunk_size); + len -= chunk_size; + block = gfs2_rbm_to_block(&rbm); + if (gfs2_rbm_from_block(&rbm, block + chunk_size)) { + n_unaligned = 0; + break; + } + if (ptr) { + n_unaligned = 3; + break; + } + n_unaligned = len & 3; + } + + /* Deal with any bits left over at the end */ + if (n_unaligned) + gfs2_unaligned_extlen(&rbm, n_unaligned, &len); +out: + return size - len; +} + +/** + * gfs2_bitcount - count the number of bits in a certain state + * @rgd: the resource group descriptor + * @buffer: the buffer that holds the bitmaps + * @buflen: the length (in bytes) of the buffer + * @state: the state of the block we're looking for + * + * Returns: The number of bits + */ + +static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer, + unsigned int buflen, u8 state) +{ + const u8 *byte = buffer; + const u8 *end = buffer + buflen; + const u8 state1 = state << 2; + const u8 state2 = state << 4; + const u8 state3 = state << 6; + u32 count = 0; + + for (; byte < end; byte++) { + if (((*byte) & 0x03) == state) + count++; + if (((*byte) & 0x0C) == state1) + count++; + if (((*byte) & 0x30) == state2) + count++; + if (((*byte) & 0xC0) == state3) + count++; + } + + return count; +} + +/** + * gfs2_rgrp_verify - Verify that a resource group is consistent + * @rgd: the rgrp + * + */ + +void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi = NULL; + u32 length = rgd->rd_length; + u32 count[4], tmp; + int buf, x; + + memset(count, 0, 4 * sizeof(u32)); + + /* Count # blocks in each of 4 possible allocation states */ + for (buf = 0; buf < length; buf++) { + bi = rgd->rd_bits + buf; + for (x = 0; x < 4; x++) + count[x] += gfs2_bitcount(rgd, + bi->bi_bh->b_data + + bi->bi_offset, + bi->bi_bytes, x); + } + + if (count[0] != 
rgd->rd_free) { + gfs2_lm(sdp, "free data mismatch: %u != %u\n", + count[0], rgd->rd_free); + gfs2_consist_rgrpd(rgd); + return; + } + + tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; + if (count[1] != tmp) { + gfs2_lm(sdp, "used data mismatch: %u != %u\n", + count[1], tmp); + gfs2_consist_rgrpd(rgd); + return; + } + + if (count[2] + count[3] != rgd->rd_dinodes) { + gfs2_lm(sdp, "used metadata mismatch: %u != %u\n", + count[2] + count[3], rgd->rd_dinodes); + gfs2_consist_rgrpd(rgd); + return; + } +} + +/** + * gfs2_blk2rgrpd - Find resource group for a given data/meta block number + * @sdp: The GFS2 superblock + * @blk: The data block number + * @exact: True if this needs to be an exact match + * + * The @exact argument should be set to true by most callers. The exception + * is when we need to match blocks which are not represented by the rgrp + * bitmap, but which are part of the rgrp (i.e. padding blocks) which are + * there for alignment purposes. Another way of looking at it is that @exact + * matches only valid data/metadata blocks, but with @exact false, it will + * match any block within the extent of the rgrp. + * + * Returns: The resource group, or NULL if not found + */ + +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact) +{ + struct rb_node *n, *next; + struct gfs2_rgrpd *cur; + + spin_lock(&sdp->sd_rindex_spin); + n = sdp->sd_rindex_tree.rb_node; + while (n) { + cur = rb_entry(n, struct gfs2_rgrpd, rd_node); + next = NULL; + if (blk < cur->rd_addr) + next = n->rb_left; + else if (blk >= cur->rd_data0 + cur->rd_data) + next = n->rb_right; + if (next == NULL) { + spin_unlock(&sdp->sd_rindex_spin); + if (exact) { + if (blk < cur->rd_addr) + return NULL; + if (blk >= cur->rd_data0 + cur->rd_data) + return NULL; + } + return cur; + } + n = next; + } + spin_unlock(&sdp->sd_rindex_spin); + + return NULL; +} + +/** + * gfs2_rgrpd_get_first - get the first Resource Group in the filesystem + * @sdp: The GFS2 superblock + * + * Returns: The first rgrp in the filesystem + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp) +{ + const struct rb_node *n; + struct gfs2_rgrpd *rgd; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_first(&sdp->sd_rindex_tree); + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + + return rgd; +} + +/** + * gfs2_rgrpd_get_next - get the next RG + * @rgd: the resource group descriptor + * + * Returns: The next rgrp + */ + +struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + const struct rb_node *n; + + spin_lock(&sdp->sd_rindex_spin); + n = rb_next(&rgd->rd_node); + if (n == NULL) + n = rb_first(&sdp->sd_rindex_tree); + + if (unlikely(&rgd->rd_node == n)) { + spin_unlock(&sdp->sd_rindex_spin); + return NULL; + } + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + spin_unlock(&sdp->sd_rindex_spin); + return rgd; +} + +void check_and_update_goal(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL) + ip->i_goal = ip->i_no_addr; +} + +void gfs2_free_clones(struct gfs2_rgrpd *rgd) +{ + int x; + + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + kfree(bi->bi_clone); + bi->bi_clone = NULL; + } +} + +static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs, + const char *fs_id_buf) +{ + struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res); + + gfs2_print_dbg(seq, "%s B: n:%llu s:%llu 
f:%u\n",
+		       fs_id_buf,
+		       (unsigned long long)ip->i_no_addr,
+		       (unsigned long long)rs->rs_start,
+		       rs->rs_requested);
+}
+
+/**
+ * __rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+static void __rs_deltree(struct gfs2_blkreserv *rs)
+{
+	struct gfs2_rgrpd *rgd;
+
+	if (!gfs2_rs_active(rs))
+		return;
+
+	rgd = rs->rs_rgd;
+	trace_gfs2_rs(rs, TRACE_RS_TREEDEL);
+	rb_erase(&rs->rs_node, &rgd->rd_rstree);
+	RB_CLEAR_NODE(&rs->rs_node);
+
+	if (rs->rs_requested) {
+		/* return requested blocks to the rgrp */
+		BUG_ON(rs->rs_rgd->rd_requested < rs->rs_requested);
+		rs->rs_rgd->rd_requested -= rs->rs_requested;
+
+		/* The rgrp extent failure point is likely not to increase;
+		   it will only do so if the freed blocks are somehow
+		   contiguous with a span of free blocks that follows. Still,
+		   it will force the number to be recalculated later. */
+		rgd->rd_extfail_pt += rs->rs_requested;
+		rs->rs_requested = 0;
+	}
+}
+
+/**
+ * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
+ * @rs: The reservation to remove
+ *
+ */
+void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
+{
+	struct gfs2_rgrpd *rgd;
+
+	rgd = rs->rs_rgd;
+	if (rgd) {
+		spin_lock(&rgd->rd_rsspin);
+		__rs_deltree(rs);
+		BUG_ON(rs->rs_requested);
+		spin_unlock(&rgd->rd_rsspin);
+	}
+}
+
+/**
+ * gfs2_rs_delete - delete a multi-block reservation
+ * @ip: The inode for this reservation
+ *
+ */
+void gfs2_rs_delete(struct gfs2_inode *ip)
+{
+	struct inode *inode = &ip->i_inode;
+
+	down_write(&ip->i_rw_mutex);
+	if (atomic_read(&inode->i_writecount) <= 1)
+		gfs2_rs_deltree(&ip->i_res);
+	up_write(&ip->i_rw_mutex);
+}
+
+/**
+ * return_all_reservations - return all reserved blocks back to the rgrp.
+ * @rgd: the rgrp that needs its space back
+ *
+ * We previously reserved a bunch of blocks for allocation. Now we need to
+ * give them back. This leaves the reservation structures intact, but removes
+ * all of their corresponding "no-fly zones".
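+ * (In other words: each gfs2_blkreserv stays allocated and attached to
+ * its inode; __rs_deltree() above merely unlinks rs_node from rd_rstree
+ * and gives the requested blocks back to the rgrp's rd_requested count.)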
+ */ +static void return_all_reservations(struct gfs2_rgrpd *rgd) +{ + struct rb_node *n; + struct gfs2_blkreserv *rs; + + spin_lock(&rgd->rd_rsspin); + while ((n = rb_first(&rgd->rd_rstree))) { + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + __rs_deltree(rs); + } + spin_unlock(&rgd->rd_rsspin); +} + +void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) +{ + struct rb_node *n; + struct gfs2_rgrpd *rgd; + struct gfs2_glock *gl; + + while ((n = rb_first(&sdp->sd_rindex_tree))) { + rgd = rb_entry(n, struct gfs2_rgrpd, rd_node); + gl = rgd->rd_gl; + + rb_erase(n, &sdp->sd_rindex_tree); + + if (gl) { + if (gl->gl_state != LM_ST_UNLOCKED) { + gfs2_glock_cb(gl, LM_ST_UNLOCKED); + flush_delayed_work(&gl->gl_work); + } + gfs2_rgrp_brelse(rgd); + glock_clear_object(gl, rgd); + gfs2_glock_put(gl); + } + + gfs2_free_clones(rgd); + return_all_reservations(rgd); + kfree(rgd->rd_bits); + rgd->rd_bits = NULL; + kmem_cache_free(gfs2_rgrpd_cachep, rgd); + } +} + +/** + * compute_bitstructs - Compute the bitmap sizes + * @rgd: The resource group descriptor + * + * Calculates bitmap descriptors, one for each block that contains bitmap data + * + * Returns: errno + */ + +static int compute_bitstructs(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct gfs2_bitmap *bi; + u32 length = rgd->rd_length; /* # blocks in hdr & bitmap */ + u32 bytes_left, bytes; + int x; + + if (!length) + return -EINVAL; + + rgd->rd_bits = kcalloc(length, sizeof(struct gfs2_bitmap), GFP_NOFS); + if (!rgd->rd_bits) + return -ENOMEM; + + bytes_left = rgd->rd_bitbytes; + + for (x = 0; x < length; x++) { + bi = rgd->rd_bits + x; + + bi->bi_flags = 0; + /* small rgrp; bitmap stored completely in header block */ + if (length == 1) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_bytes = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; + /* header block */ + } else if (x == 0) { + bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp); + bi->bi_offset = sizeof(struct gfs2_rgrp); + bi->bi_start = 0; + bi->bi_bytes = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; + /* last block */ + } else if (x + 1 == length) { + bytes = bytes_left; + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_bitbytes - bytes_left; + bi->bi_bytes = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; + /* other blocks */ + } else { + bytes = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header); + bi->bi_offset = sizeof(struct gfs2_meta_header); + bi->bi_start = rgd->rd_bitbytes - bytes_left; + bi->bi_bytes = bytes; + bi->bi_blocks = bytes * GFS2_NBBY; + } + + bytes_left -= bytes; + } + + if (bytes_left) { + gfs2_consist_rgrpd(rgd); + return -EIO; + } + bi = rgd->rd_bits + (length - 1); + if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) { + gfs2_lm(sdp, + "ri_addr = %llu\n" + "ri_length = %u\n" + "ri_data0 = %llu\n" + "ri_data = %u\n" + "ri_bitbytes = %u\n" + "start=%u len=%u offset=%u\n", + (unsigned long long)rgd->rd_addr, + rgd->rd_length, + (unsigned long long)rgd->rd_data0, + rgd->rd_data, + rgd->rd_bitbytes, + bi->bi_start, bi->bi_bytes, bi->bi_offset); + gfs2_consist_rgrpd(rgd); + return -EIO; + } + + return 0; +} + +/** + * gfs2_ri_total - Total up the file system space, according to the rindex. 
+ * @sdp: the filesystem + * + */ +u64 gfs2_ri_total(struct gfs2_sbd *sdp) +{ + u64 total_data = 0; + struct inode *inode = sdp->sd_rindex; + struct gfs2_inode *ip = GFS2_I(inode); + char buf[sizeof(struct gfs2_rindex)]; + int error, rgrps; + + for (rgrps = 0;; rgrps++) { + loff_t pos = rgrps * sizeof(struct gfs2_rindex); + + if (pos + sizeof(struct gfs2_rindex) > i_size_read(inode)) + break; + error = gfs2_internal_read(ip, buf, &pos, + sizeof(struct gfs2_rindex)); + if (error != sizeof(struct gfs2_rindex)) + break; + total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data); + } + return total_data; +} + +static int rgd_insert(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*newn) { + struct gfs2_rgrpd *cur = rb_entry(*newn, struct gfs2_rgrpd, + rd_node); + + parent = *newn; + if (rgd->rd_addr < cur->rd_addr) + newn = &((*newn)->rb_left); + else if (rgd->rd_addr > cur->rd_addr) + newn = &((*newn)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&rgd->rd_node, parent, newn); + rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree); + sdp->sd_rgrps++; + return 0; +} + +/** + * read_rindex_entry - Pull in a new resource index entry from the disk + * @ip: Pointer to the rindex inode + * + * Returns: 0 on success, > 0 on EOF, error code otherwise + */ + +static int read_rindex_entry(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex); + struct gfs2_rindex buf; + int error; + struct gfs2_rgrpd *rgd; + + if (pos >= i_size_read(&ip->i_inode)) + return 1; + + error = gfs2_internal_read(ip, (char *)&buf, &pos, + sizeof(struct gfs2_rindex)); + + if (error != sizeof(struct gfs2_rindex)) + return (error == 0) ? 1 : error; + + rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS); + error = -ENOMEM; + if (!rgd) + return error; + + rgd->rd_sbd = sdp; + rgd->rd_addr = be64_to_cpu(buf.ri_addr); + rgd->rd_length = be32_to_cpu(buf.ri_length); + rgd->rd_data0 = be64_to_cpu(buf.ri_data0); + rgd->rd_data = be32_to_cpu(buf.ri_data); + rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); + spin_lock_init(&rgd->rd_rsspin); + mutex_init(&rgd->rd_mutex); + + error = gfs2_glock_get(sdp, rgd->rd_addr, + &gfs2_rgrp_glops, CREATE, &rgd->rd_gl); + if (error) + goto fail; + + error = compute_bitstructs(rgd); + if (error) + goto fail_glock; + + rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; + rgd->rd_flags &= ~GFS2_RDF_PREFERRED; + if (rgd->rd_data > sdp->sd_max_rg_data) + sdp->sd_max_rg_data = rgd->rd_data; + spin_lock(&sdp->sd_rindex_spin); + error = rgd_insert(rgd); + spin_unlock(&sdp->sd_rindex_spin); + if (!error) { + glock_set_object(rgd->rd_gl, rgd); + return 0; + } + + error = 0; /* someone else read in the rgrp; free it and ignore it */ +fail_glock: + gfs2_glock_put(rgd->rd_gl); + +fail: + kfree(rgd->rd_bits); + rgd->rd_bits = NULL; + kmem_cache_free(gfs2_rgrpd_cachep, rgd); + return error; +} + +/** + * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use + * @sdp: the GFS2 superblock + * + * The purpose of this function is to select a subset of the resource groups + * and mark them as PREFERRED. We do it in such a way that each node prefers + * to use a unique set of rgrps to minimize glock contention. 
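+ *
+ * For example (illustrative layout): with sd_journals == 3, the node
+ * with journal ID 1 skips one rgrp and then marks every third rgrp as
+ * PREFERRED, so three nodes each favour a disjoint third of the
+ * resource groups.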
+ */ +static void set_rgrp_preferences(struct gfs2_sbd *sdp) +{ + struct gfs2_rgrpd *rgd, *first; + int i; + + /* Skip an initial number of rgrps, based on this node's journal ID. + That should start each node out on its own set. */ + rgd = gfs2_rgrpd_get_first(sdp); + for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++) + rgd = gfs2_rgrpd_get_next(rgd); + first = rgd; + + do { + rgd->rd_flags |= GFS2_RDF_PREFERRED; + for (i = 0; i < sdp->sd_journals; i++) { + rgd = gfs2_rgrpd_get_next(rgd); + if (!rgd || rgd == first) + break; + } + } while (rgd && rgd != first); +} + +/** + * gfs2_ri_update - Pull in a new resource index from the disk + * @ip: pointer to the rindex inode + * + * Returns: 0 on successful update, error code otherwise + */ + +static int gfs2_ri_update(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; + + do { + error = read_rindex_entry(ip); + } while (error == 0); + + if (error < 0) + return error; + + if (RB_EMPTY_ROOT(&sdp->sd_rindex_tree)) { + fs_err(sdp, "no resource groups found in the file system.\n"); + return -ENOENT; + } + set_rgrp_preferences(sdp); + + sdp->sd_rindex_uptodate = 1; + return 0; +} + +/** + * gfs2_rindex_update - Update the rindex if required + * @sdp: The GFS2 superblock + * + * We grab a lock on the rindex inode to make sure that it doesn't + * change whilst we are performing an operation. We keep this lock + * for quite long periods of time compared to other locks. This + * doesn't matter, since it is shared and it is very, very rarely + * accessed in the exclusive mode (i.e. only when expanding the filesystem). + * + * This makes sure that we're using the latest copy of the resource index + * special file, which might have been updated if someone expanded the + * filesystem (via gfs2_grow utility), which adds new resource groups. 
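+ *
+ * (For example, gfs2_fitrim() below calls this before walking the rgrp
+ * tree so that rgrps added by a recent gfs2_grow are visible to the
+ * trim pass.)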
+ *
+ * Returns: 0 on success, error code otherwise
+ */
+
+int gfs2_rindex_update(struct gfs2_sbd *sdp)
+{
+	struct gfs2_inode *ip = GFS2_I(sdp->sd_rindex);
+	struct gfs2_glock *gl = ip->i_gl;
+	struct gfs2_holder ri_gh;
+	int error = 0;
+	int unlock_required = 0;
+
+	/* Read new copy from disk if we don't have the latest */
+	if (!sdp->sd_rindex_uptodate) {
+		if (!gfs2_glock_is_locked_by_me(gl)) {
+			error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh);
+			if (error)
+				return error;
+			unlock_required = 1;
+		}
+		if (!sdp->sd_rindex_uptodate)
+			error = gfs2_ri_update(ip);
+		if (unlock_required)
+			gfs2_glock_dq_uninit(&ri_gh);
+	}
+
+	return error;
+}
+
+static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
+{
+	const struct gfs2_rgrp *str = buf;
+	u32 rg_flags;
+
+	rg_flags = be32_to_cpu(str->rg_flags);
+	rg_flags &= ~GFS2_RDF_MASK;
+	rgd->rd_flags &= GFS2_RDF_MASK;
+	rgd->rd_flags |= rg_flags;
+	rgd->rd_free = be32_to_cpu(str->rg_free);
+	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
+	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
+	/* rd_data0, rd_data and rd_bitbytes already set from rindex */
+}
+
+static void gfs2_rgrp_ondisk2lvb(struct gfs2_rgrp_lvb *rgl, const void *buf)
+{
+	const struct gfs2_rgrp *str = buf;
+
+	rgl->rl_magic = cpu_to_be32(GFS2_MAGIC);
+	rgl->rl_flags = str->rg_flags;
+	rgl->rl_free = str->rg_free;
+	rgl->rl_dinodes = str->rg_dinodes;
+	rgl->rl_igeneration = str->rg_igeneration;
+	rgl->__pad = 0UL;
+}
+
+static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
+{
+	struct gfs2_rgrpd *next = gfs2_rgrpd_get_next(rgd);
+	struct gfs2_rgrp *str = buf;
+	u32 crc;
+
+	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
+	str->rg_free = cpu_to_be32(rgd->rd_free);
+	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
+	if (next == NULL)
+		str->rg_skip = 0;
+	else if (next->rd_addr > rgd->rd_addr)
+		str->rg_skip = cpu_to_be32(next->rd_addr - rgd->rd_addr);
+	str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration);
+	str->rg_data0 = cpu_to_be64(rgd->rd_data0);
+	str->rg_data = cpu_to_be32(rgd->rd_data);
+	str->rg_bitbytes = cpu_to_be32(rgd->rd_bitbytes);
+	str->rg_crc = 0;
+	crc = gfs2_disk_hash(buf, sizeof(struct gfs2_rgrp));
+	str->rg_crc = cpu_to_be32(crc);
+
+	memset(&str->rg_reserved, 0, sizeof(str->rg_reserved));
+	gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, buf);
+}
+
+static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
+	struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	int valid = 1;
+
+	if (rgl->rl_flags != str->rg_flags) {
+		fs_warn(sdp, "GFS2: rgd: %llu lvb flag mismatch %u/%u",
+			(unsigned long long)rgd->rd_addr,
+			be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
+		valid = 0;
+	}
+	if (rgl->rl_free != str->rg_free) {
+		fs_warn(sdp, "GFS2: rgd: %llu lvb free mismatch %u/%u",
+			(unsigned long long)rgd->rd_addr,
+			be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
+		valid = 0;
+	}
+	if (rgl->rl_dinodes != str->rg_dinodes) {
+		fs_warn(sdp, "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
+			(unsigned long long)rgd->rd_addr,
+			be32_to_cpu(rgl->rl_dinodes),
+			be32_to_cpu(str->rg_dinodes));
+		valid = 0;
+	}
+	if (rgl->rl_igeneration != str->rg_igeneration) {
+		fs_warn(sdp, "GFS2: rgd: %llu lvb igen mismatch %llu/%llu",
+			(unsigned long long)rgd->rd_addr,
+			(unsigned long long)be64_to_cpu(rgl->rl_igeneration),
+			(unsigned long long)be64_to_cpu(str->rg_igeneration));
+		valid = 0;
+	}
+	return valid;
+}
+
+static u32 count_unlinked(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_bitmap *bi;
+	const u32 length = rgd->rd_length;
+	const u8 *buffer = NULL;
+	u32 i, goal, count = 0;
+
+	for (i = 0, bi = rgd->rd_bits; i < length; i++, bi++) {
+		goal = 0;
+		buffer = bi->bi_bh->b_data + bi->bi_offset;
+		WARN_ON(!buffer_uptodate(bi->bi_bh));
+		while (goal < bi->bi_blocks) {
+			goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
+					   GFS2_BLKST_UNLINKED);
+			if (goal == BFITNOENT)
+				break;
+			count++;
+			goal++;
+		}
+	}
+
+	return count;
+}
+
+static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_bitmap *bi;
+	int x;
+
+	if (rgd->rd_free) {
+		for (x = 0; x < rgd->rd_length; x++) {
+			bi = rgd->rd_bits + x;
+			clear_bit(GBF_FULL, &bi->bi_flags);
+		}
+	} else {
+		for (x = 0; x < rgd->rd_length; x++) {
+			bi = rgd->rd_bits + x;
+			set_bit(GBF_FULL, &bi->bi_flags);
+		}
+	}
+}
+
+/**
+ * gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps
+ * @gl: the glock protecting the rgrpd to read in
+ *
+ * Read in all of a Resource Group's header and bitmap blocks.
+ * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
+ *
+ * Returns: errno
+ */
+
+int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl)
+{
+	struct gfs2_rgrpd *rgd = gl->gl_object;
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	unsigned int length = rgd->rd_length;
+	struct gfs2_bitmap *bi;
+	unsigned int x, y;
+	int error;
+
+	if (rgd->rd_bits[0].bi_bh != NULL)
+		return 0;
+
+	for (x = 0; x < length; x++) {
+		bi = rgd->rd_bits + x;
+		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
+		if (error)
+			goto fail;
+	}
+
+	for (y = length; y--;) {
+		bi = rgd->rd_bits + y;
+		error = gfs2_meta_wait(sdp, bi->bi_bh);
+		if (error)
+			goto fail;
+		if (gfs2_metatype_check(sdp, bi->bi_bh, y ? GFS2_METATYPE_RB :
+					      GFS2_METATYPE_RG)) {
+			error = -EIO;
+			goto fail;
+		}
+	}
+
+	gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
+	rgrp_set_bitmap_flags(rgd);
+	rgd->rd_flags |= GFS2_RDF_CHECK;
+	rgd->rd_free_clone = rgd->rd_free;
+	GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
+	/* max out the rgrp allocation failure point */
+	rgd->rd_extfail_pt = rgd->rd_free;
+	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
+		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
+		gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
+				     rgd->rd_bits[0].bi_bh->b_data);
+	} else if (sdp->sd_args.ar_rgrplvb) {
+		if (!gfs2_rgrp_lvb_valid(rgd)) {
+			gfs2_consist_rgrpd(rgd);
+			error = -EIO;
+			goto fail;
+		}
+		if (rgd->rd_rgl->rl_unlinked == 0)
+			rgd->rd_flags &= ~GFS2_RDF_CHECK;
+	}
+	return 0;
+
+fail:
+	while (x--) {
+		bi = rgd->rd_bits + x;
+		brelse(bi->bi_bh);
+		bi->bi_bh = NULL;
+		gfs2_assert_warn(sdp, !bi->bi_clone);
+	}
+	return error;
+}
+
+static int update_rgrp_lvb(struct gfs2_rgrpd *rgd, struct gfs2_holder *gh)
+{
+	u32 rl_flags;
+
+	if (!test_bit(GLF_INSTANTIATE_NEEDED, &gh->gh_gl->gl_flags))
+		return 0;
+
+	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
+		return gfs2_instantiate(gh);
+
+	rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
+	rl_flags &= ~GFS2_RDF_MASK;
+	rgd->rd_flags &= GFS2_RDF_MASK;
+	rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
+	if (rgd->rd_rgl->rl_unlinked == 0)
+		rgd->rd_flags &= ~GFS2_RDF_CHECK;
+	rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
+	rgrp_set_bitmap_flags(rgd);
+	rgd->rd_free_clone = rgd->rd_free;
+	GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved);
+	/* max out the rgrp allocation failure point */
+	rgd->rd_extfail_pt = rgd->rd_free;
+	rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes);
+	rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration);
+	return 0;
+}
+
+/**
+ * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_go_instantiate()
+ * @rgd: The resource group
+ *
+ */
+
+void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
+{
+	int x, length = rgd->rd_length;
+
+	for (x = 0; x < length; x++) {
+		struct gfs2_bitmap *bi = rgd->rd_bits + x;
+		if (bi->bi_bh) {
+			brelse(bi->bi_bh);
+			bi->bi_bh = NULL;
+		}
+	}
+	set_bit(GLF_INSTANTIATE_NEEDED, &rgd->rd_gl->gl_flags);
+}
+
+int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
+			    struct buffer_head *bh,
+			    const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
+{
+	struct super_block *sb = sdp->sd_vfs;
+	u64 blk;
+	sector_t start = 0;
+	sector_t nr_blks = 0;
+	int rv = -EIO;
+	unsigned int x;
+	u32 trimmed = 0;
+	u8 diff;
+
+	for (x = 0; x < bi->bi_bytes; x++) {
+		const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
+		clone += bi->bi_offset;
+		clone += x;
+		if (bh) {
+			const u8 *orig = bh->b_data + bi->bi_offset + x;
+			diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1));
+		} else {
+			diff = ~(*clone | (*clone >> 1));
+		}
+		diff &= 0x55;
+		if (diff == 0)
+			continue;
+		blk = offset + ((bi->bi_start + x) * GFS2_NBBY);
+		while(diff) {
+			if (diff & 1) {
+				if (nr_blks == 0)
+					goto start_new_extent;
+				if ((start + nr_blks) != blk) {
+					if (nr_blks >= minlen) {
+						rv = sb_issue_discard(sb,
+							start, nr_blks,
+							GFP_NOFS, 0);
+						if (rv)
+							goto fail;
+						trimmed += nr_blks;
+					}
+					nr_blks = 0;
+start_new_extent:
+					start = blk;
+				}
+				nr_blks++;
+			}
+			diff >>= 2;
+			blk++;
+		}
+	}
+	if (nr_blks >= minlen) {
+		rv = sb_issue_discard(sb, start, nr_blks, GFP_NOFS, 0);
+		if (rv)
+			goto fail;
+		trimmed += nr_blks;
+	}
+	if (ptrimmed)
+		*ptrimmed = trimmed;
+	return 0;
+
+fail:
+	if (sdp->sd_args.ar_discard)
+		fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem\n", rv);
+	sdp->sd_args.ar_discard = 0;
+	return rv;
+}
+
+/**
+ * gfs2_fitrim - Generate discard requests for unused bits of the filesystem
+ * @filp: Any file on the filesystem
+ * @argp: Pointer to the arguments (also used to pass result)
+ *
+ * Returns: 0 on success, otherwise error code
+ */
+
+int gfs2_fitrim(struct file *filp, void __user *argp)
+{
+	struct inode *inode = file_inode(filp);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct block_device *bdev = sdp->sd_vfs->s_bdev;
+	struct buffer_head *bh;
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_rgrpd *rgd_end;
+	struct gfs2_holder gh;
+	struct fstrim_range r;
+	int ret = 0;
+	u64 amt;
+	u64 trimmed = 0;
+	u64 start, end, minlen;
+	unsigned int x;
+	unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+		return -EROFS;
+
+	if (!bdev_max_discard_sectors(bdev))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&r, argp, sizeof(r)))
+		return -EFAULT;
+
+	ret = gfs2_rindex_update(sdp);
+	if (ret)
+		return ret;
+
+	start = r.start >> bs_shift;
+	end = start + (r.len >> bs_shift);
+	minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
+	minlen = max_t(u64, minlen, bdev_discard_granularity(bdev)) >> bs_shift;
+
+	if (end <= start || minlen > sdp->sd_max_rg_data)
+		return -EINVAL;
+
+	rgd = gfs2_blk2rgrpd(sdp, start, 0);
+	rgd_end = gfs2_blk2rgrpd(sdp, end, 0);
+
+	if ((gfs2_rgrpd_get_first(sdp) == gfs2_rgrpd_get_next(rgd_end))
+	    && (start > rgd_end->rd_data0 + rgd_end->rd_data))
+		return -EINVAL; /* start is beyond the end of the fs */
+
+	while (1) {
+
+		ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
LM_FLAG_NODE_SCOPE, &gh); + if (ret) + goto out; + + if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) { + /* Trim each bitmap in the rgrp */ + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + rgrp_lock_local(rgd); + ret = gfs2_rgrp_send_discards(sdp, + rgd->rd_data0, NULL, bi, minlen, + &amt); + rgrp_unlock_local(rgd); + if (ret) { + gfs2_glock_dq_uninit(&gh); + goto out; + } + trimmed += amt; + } + + /* Mark rgrp as having been trimmed */ + ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); + if (ret == 0) { + bh = rgd->rd_bits[0].bi_bh; + rgrp_lock_local(rgd); + rgd->rd_flags |= GFS2_RGF_TRIMMED; + gfs2_trans_add_meta(rgd->rd_gl, bh); + gfs2_rgrp_out(rgd, bh->b_data); + rgrp_unlock_local(rgd); + gfs2_trans_end(sdp); + } + } + gfs2_glock_dq_uninit(&gh); + + if (rgd == rgd_end) + break; + + rgd = gfs2_rgrpd_get_next(rgd); + } + +out: + r.len = trimmed << bs_shift; + if (copy_to_user(argp, &r, sizeof(r))) + return -EFAULT; + + return ret; +} + +/** + * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree + * @ip: the inode structure + * + */ +static void rs_insert(struct gfs2_inode *ip) +{ + struct rb_node **newn, *parent = NULL; + int rc; + struct gfs2_blkreserv *rs = &ip->i_res; + struct gfs2_rgrpd *rgd = rs->rs_rgd; + + BUG_ON(gfs2_rs_active(rs)); + + spin_lock(&rgd->rd_rsspin); + newn = &rgd->rd_rstree.rb_node; + while (*newn) { + struct gfs2_blkreserv *cur = + rb_entry(*newn, struct gfs2_blkreserv, rs_node); + + parent = *newn; + rc = rs_cmp(rs->rs_start, rs->rs_requested, cur); + if (rc > 0) + newn = &((*newn)->rb_right); + else if (rc < 0) + newn = &((*newn)->rb_left); + else { + spin_unlock(&rgd->rd_rsspin); + WARN_ON(1); + return; + } + } + + rb_link_node(&rs->rs_node, parent, newn); + rb_insert_color(&rs->rs_node, &rgd->rd_rstree); + + /* Do our rgrp accounting for the reservation */ + rgd->rd_requested += rs->rs_requested; /* blocks requested */ + spin_unlock(&rgd->rd_rsspin); + trace_gfs2_rs(rs, TRACE_RS_INSERT); +} + +/** + * rgd_free - return the number of free blocks we can allocate + * @rgd: the resource group + * @rs: The reservation to free + * + * This function returns the number of free blocks for an rgrp. + * That's the clone-free blocks (blocks that are free, not including those + * still being used for unlinked files that haven't been deleted.) + * + * It also subtracts any blocks reserved by someone else, but does not + * include free blocks that are still part of our current reservation, + * because obviously we can (and will) allocate them. 
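+ *
+ * Worked example (illustrative numbers): with rd_free_clone == 100,
+ * rd_requested == 30 and rs->rs_requested == 10, other reservations
+ * account for 30 - 10 == 20 blocks, so 100 - 20 == 80 blocks are
+ * considered allocatable by this caller.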
+ */ +static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs) +{ + u32 tot_reserved, tot_free; + + if (WARN_ON_ONCE(rgd->rd_requested < rs->rs_requested)) + return 0; + tot_reserved = rgd->rd_requested - rs->rs_requested; + + if (rgd->rd_free_clone < tot_reserved) + tot_reserved = 0; + + tot_free = rgd->rd_free_clone - tot_reserved; + + return tot_free; +} + +/** + * rg_mblk_search - find a group of multiple free blocks to form a reservation + * @rgd: the resource group descriptor + * @ip: pointer to the inode for which we're reserving blocks + * @ap: the allocation parameters + * + */ + +static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, + const struct gfs2_alloc_parms *ap) +{ + struct gfs2_rbm rbm = { .rgd = rgd, }; + u64 goal; + struct gfs2_blkreserv *rs = &ip->i_res; + u32 extlen; + u32 free_blocks, blocks_available; + int ret; + struct inode *inode = &ip->i_inode; + + spin_lock(&rgd->rd_rsspin); + free_blocks = rgd_free(rgd, rs); + if (rgd->rd_free_clone < rgd->rd_requested) + free_blocks = 0; + blocks_available = rgd->rd_free_clone - rgd->rd_reserved; + if (rgd == rs->rs_rgd) + blocks_available += rs->rs_reserved; + spin_unlock(&rgd->rd_rsspin); + + if (S_ISDIR(inode->i_mode)) + extlen = 1; + else { + extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target); + extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks); + } + if (free_blocks < extlen || blocks_available < extlen) + return; + + /* Find bitmap block that contains bits for goal block */ + if (rgrp_contains_block(rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rgd->rd_last_alloc + rgd->rd_data0; + + if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) + return; + + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, &ip->i_res, true); + if (ret == 0) { + rs->rs_start = gfs2_rbm_to_block(&rbm); + rs->rs_requested = extlen; + rs_insert(ip); + } else { + if (goal == rgd->rd_last_alloc + rgd->rd_data0) + rgd->rd_last_alloc = 0; + } +} + +/** + * gfs2_next_unreserved_block - Return next block that is not reserved + * @rgd: The resource group + * @block: The starting block + * @length: The required length + * @ignore_rs: Reservation to ignore + * + * If the block does not appear in any reservation, then return the + * block number unchanged. If it does appear in the reservation, then + * keep looking through the tree of reservations in order to find the + * first block number which is not reserved. 
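+ *
+ * Illustrative example (hypothetical tree): if @block 50 falls inside a
+ * reservation spanning blocks 40..59 and a second reservation covers
+ * 60..69, the walk steps over both and returns 70; if @block is not
+ * reserved at all, 50 is returned unchanged.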
+ */ + +static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, + u32 length, + struct gfs2_blkreserv *ignore_rs) +{ + struct gfs2_blkreserv *rs; + struct rb_node *n; + int rc; + + spin_lock(&rgd->rd_rsspin); + n = rgd->rd_rstree.rb_node; + while (n) { + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + rc = rs_cmp(block, length, rs); + if (rc < 0) + n = n->rb_left; + else if (rc > 0) + n = n->rb_right; + else + break; + } + + if (n) { + while (rs_cmp(block, length, rs) == 0 && rs != ignore_rs) { + block = rs->rs_start + rs->rs_requested; + n = n->rb_right; + if (n == NULL) + break; + rs = rb_entry(n, struct gfs2_blkreserv, rs_node); + } + } + + spin_unlock(&rgd->rd_rsspin); + return block; +} + +/** + * gfs2_reservation_check_and_update - Check for reservations during block alloc + * @rbm: The current position in the resource group + * @rs: Our own reservation + * @minext: The minimum extent length + * @maxext: A pointer to the maximum extent structure + * + * This checks the current position in the rgrp to see whether there is + * a reservation covering this block. If not then this function is a + * no-op. If there is, then the position is moved to the end of the + * contiguous reservation(s) so that we are pointing at the first + * non-reserved block. + * + * Returns: 0 if no reservation, 1 if @rbm has changed, otherwise an error + */ + +static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, + struct gfs2_blkreserv *rs, + u32 minext, + struct gfs2_extent *maxext) +{ + u64 block = gfs2_rbm_to_block(rbm); + u32 extlen = 1; + u64 nblock; + + /* + * If we have a minimum extent length, then skip over any extent + * which is less than the min extent length in size. + */ + if (minext > 1) { + extlen = gfs2_free_extlen(rbm, minext); + if (extlen <= maxext->len) + goto fail; + } + + /* + * Check the extent which has been found against the reservations + * and skip if parts of it are already reserved + */ + nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, rs); + if (nblock == block) { + if (!minext || extlen >= minext) + return 0; + + if (extlen > maxext->len) { + maxext->len = extlen; + maxext->rbm = *rbm; + } + } else { + u64 len = nblock - block; + if (len >= (u64)1 << 32) + return -E2BIG; + extlen = len; + } +fail: + if (gfs2_rbm_add(rbm, extlen)) + return -E2BIG; + return 1; +} + +/** + * gfs2_rbm_find - Look for blocks of a particular state + * @rbm: Value/result starting position and final position + * @state: The state which we want to find + * @minext: Pointer to the requested extent length + * This is updated to be the actual reservation size. + * @rs: Our own reservation (NULL to skip checking for reservations) + * @nowrap: Stop looking at the end of the rgrp, rather than wrapping + * around until we've reached the starting point. + * + * Side effects: + * - If looking for free blocks, we set GBF_FULL on each bitmap which + * has no free blocks in it. + * - If looking for free blocks, we set rd_extfail_pt on each rgrp which + * has come up short on a free block search. 
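+ *
+ * For example (illustrative): a caller passing *minext == 16 can get a
+ * return of 0 with *minext rewritten to 9 if a 9-block extent was the
+ * largest one found; the maxext fallback hands back the best undersized
+ * extent rather than failing outright.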
+ *
+ * Returns: 0 on success, -ENOSPC if there is no block of the requested state
+ */
+
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
+			 struct gfs2_blkreserv *rs, bool nowrap)
+{
+	bool scan_from_start = rbm->bii == 0 && rbm->offset == 0;
+	struct buffer_head *bh;
+	int last_bii;
+	u32 offset;
+	u8 *buffer;
+	bool wrapped = false;
+	int ret;
+	struct gfs2_bitmap *bi;
+	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
+
+	/*
+	 * Determine the last bitmap to search. If we're not starting at the
+	 * beginning of a bitmap, we need to search that bitmap twice to scan
+	 * the entire resource group.
+	 */
+	last_bii = rbm->bii - (rbm->offset == 0);
+
+	while(1) {
+		bi = rbm_bi(rbm);
+		if (test_bit(GBF_FULL, &bi->bi_flags) &&
+		    (state == GFS2_BLKST_FREE))
+			goto next_bitmap;
+
+		bh = bi->bi_bh;
+		buffer = bh->b_data + bi->bi_offset;
+		WARN_ON(!buffer_uptodate(bh));
+		if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
+			buffer = bi->bi_clone + bi->bi_offset;
+		offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
+		if (offset == BFITNOENT) {
+			if (state == GFS2_BLKST_FREE && rbm->offset == 0)
+				set_bit(GBF_FULL, &bi->bi_flags);
+			goto next_bitmap;
+		}
+		rbm->offset = offset;
+		if (!rs || !minext)
+			return 0;
+
+		ret = gfs2_reservation_check_and_update(rbm, rs, *minext,
+							&maxext);
+		if (ret == 0)
+			return 0;
+		if (ret > 0)
+			goto next_iter;
+		if (ret == -E2BIG) {
+			rbm->bii = 0;
+			rbm->offset = 0;
+			goto res_covered_end_of_rgrp;
+		}
+		return ret;
+
+next_bitmap:	/* Find next bitmap in the rgrp */
+		rbm->offset = 0;
+		rbm->bii++;
+		if (rbm->bii == rbm->rgd->rd_length)
+			rbm->bii = 0;
+res_covered_end_of_rgrp:
+		if (rbm->bii == 0) {
+			if (wrapped)
+				break;
+			wrapped = true;
+			if (nowrap)
+				break;
+		}
+next_iter:
+		/* Have we scanned the entire resource group? */
+		if (wrapped && rbm->bii > last_bii)
+			break;
+	}
+
+	if (state != GFS2_BLKST_FREE)
+		return -ENOSPC;
+
+	/* If the extent was too small, and it's smaller than the smallest
+	   to have failed before, remember for future reference that it's
+	   useless to search this rgrp again for this amount or more. */
+	if (wrapped && (scan_from_start || rbm->bii > last_bii) &&
+	    *minext < rbm->rgd->rd_extfail_pt)
+		rbm->rgd->rd_extfail_pt = *minext - 1;
+
+	/* If the maximum extent we found is big enough to fulfill the
+	   minimum requirements, use it anyway. */
+	if (maxext.len) {
+		*rbm = maxext.rbm;
+		*minext = maxext.len;
+		return 0;
+	}
+
+	return -ENOSPC;
+}
+
+/**
+ * try_rgrp_unlink - Look for any unlinked, allocated, but unused inodes
+ * @rgd: The rgrp
+ * @last_unlinked: block address of the last dinode we unlinked
+ * @skip: block address we should explicitly not unlink
+ *
+ * Any unlinked inodes found are not returned directly; instead, their
+ * iopen glocks are queued for eviction.
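+ *
+ * (The scan is deliberately bounded: once more than NR_CPUS inodes have
+ * been queued for eviction, the function returns early, leaving
+ * GFS2_RDF_CHECK set so that a later pass can resume the sweep.)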
+ */
+
+static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip)
+{
+	u64 block;
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	struct gfs2_glock *gl;
+	struct gfs2_inode *ip;
+	int error;
+	int found = 0;
+	struct gfs2_rbm rbm = { .rgd = rgd, .bii = 0, .offset = 0 };
+
+	while (1) {
+		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
+				      true);
+		if (error == -ENOSPC)
+			break;
+		if (WARN_ON_ONCE(error))
+			break;
+
+		block = gfs2_rbm_to_block(&rbm);
+		if (gfs2_rbm_from_block(&rbm, block + 1))
+			break;
+		if (*last_unlinked != NO_BLOCK && block <= *last_unlinked)
+			continue;
+		if (block == skip)
+			continue;
+		*last_unlinked = block;
+
+		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
+		if (error)
+			continue;
+
+		/* If the inode is already in cache, we can ignore it here
+		 * because the existing inode disposal code will deal with
+		 * it when all refs have gone away. Accessing gl_object like
+		 * this is not safe in general. Here it is ok because we do
+		 * not dereference the pointer, and we only need an approx
+		 * answer to whether it is NULL or not.
+		 */
+		ip = gl->gl_object;
+
+		if (ip || !gfs2_queue_try_to_evict(gl))
+			gfs2_glock_put(gl);
+		else
+			found++;
+
+		/* Limit reclaim to sensible number of tasks */
+		if (found > NR_CPUS)
+			return;
+	}
+
+	rgd->rd_flags &= ~GFS2_RDF_CHECK;
+	return;
+}
+
+/**
+ * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested
+ * @rgd: The rgrp in question
+ * @loops: An indication of how picky we can be (0=very, 1=less so)
+ *
+ * This function uses the recently added glock statistics in order to
+ * figure out whether a particular resource group is suffering from
+ * contention from multiple nodes. This is done purely on the basis
+ * of timings, since this is the only data we have to work with and
+ * our aim here is to reject a resource group which is highly contended
+ * but (very important) not to do this too often in order to ensure that
+ * we do not end up introducing fragmentation by changing resource
+ * groups when not actually required.
+ *
+ * The calculation is fairly simple: we want to know whether the SRTTB
+ * (i.e. smoothed round trip time for blocking operations) to acquire
+ * the lock for this rgrp's glock is significantly greater than the
+ * time taken for resource groups on average. We introduce a margin in
+ * the form of the variable @var which is computed as the sum of the two
+ * respective variances, and multiplied by a factor depending on @loops
+ * and whether we have a lot of data to base the decision on. This is
+ * then tested against the square difference of the means in order to
+ * decide whether the result is statistically significant or not.
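+ *
+ * Worked example (illustrative numbers): with an all-rgrp average SRTTB
+ * of 500ns and a local SRTTB of 2000ns, srttb_diff is -1500 and
+ * sqr_diff is 2250000; the rgrp is treated as congested only if that
+ * exceeds the scaled variance sum @var.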
+ * + * Returns: A boolean verdict on the congestion status + */ + +static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) +{ + const struct gfs2_glock *gl = rgd->rd_gl; + const struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_lkstats *st; + u64 r_dcount, l_dcount; + u64 l_srttb, a_srttb = 0; + s64 srttb_diff; + u64 sqr_diff; + u64 var; + int cpu, nonzero = 0; + + preempt_disable(); + for_each_present_cpu(cpu) { + st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP]; + if (st->stats[GFS2_LKS_SRTTB]) { + a_srttb += st->stats[GFS2_LKS_SRTTB]; + nonzero++; + } + } + st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP]; + if (nonzero) + do_div(a_srttb, nonzero); + r_dcount = st->stats[GFS2_LKS_DCOUNT]; + var = st->stats[GFS2_LKS_SRTTVARB] + + gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; + preempt_enable(); + + l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; + l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; + + if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0)) + return false; + + srttb_diff = a_srttb - l_srttb; + sqr_diff = srttb_diff * srttb_diff; + + var *= 2; + if (l_dcount < 8 || r_dcount < 8) + var *= 2; + if (loops == 1) + var *= 2; + + return ((srttb_diff < 0) && (sqr_diff > var)); +} + +/** + * gfs2_rgrp_used_recently + * @rs: The block reservation with the rgrp to test + * @msecs: The time limit in milliseconds + * + * Returns: True if the rgrp glock has been used within the time limit + */ +static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, + u64 msecs) +{ + u64 tdiff; + + tdiff = ktime_to_ns(ktime_sub(ktime_get_real(), + rs->rs_rgd->rd_gl->gl_dstamp)); + + return tdiff > (msecs * 1000 * 1000); +} + +static u32 gfs2_orlov_skip(const struct gfs2_inode *ip) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + u32 skip; + + get_random_bytes(&skip, sizeof(skip)); + return skip % sdp->sd_rgrps; +} + +static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) +{ + struct gfs2_rgrpd *rgd = *pos; + struct gfs2_sbd *sdp = rgd->rd_sbd; + + rgd = gfs2_rgrpd_get_next(rgd); + if (rgd == NULL) + rgd = gfs2_rgrpd_get_first(sdp); + *pos = rgd; + if (rgd != begin) /* If we didn't wrap */ + return true; + return false; +} + +/** + * fast_to_acquire - determine if a resource group will be fast to acquire + * @rgd: The rgrp + * + * If this is one of our preferred rgrps, it should be quicker to acquire, + * because we tried to set ourselves up as dlm lock master. + */ +static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) +{ + struct gfs2_glock *gl = rgd->rd_gl; + + if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) && + !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + return 1; + if (rgd->rd_flags & GFS2_RDF_PREFERRED) + return 1; + return 0; +} + +/** + * gfs2_inplace_reserve - Reserve space in the filesystem + * @ip: the inode to reserve space for + * @ap: the allocation parameters + * + * We try our best to find an rgrp that has at least ap->target blocks + * available. After a couple of passes (loops == 2), the prospects of finding + * such an rgrp diminish. At this stage, we return the first rgrp that has + * at least ap->min_target blocks available. 
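+ *
+ * (Sketch of the pass structure: on loops == 0 we skip rgrps that look
+ * slow to acquire or congested; from loops == 2 the log is flushed to
+ * free up space and the target falls back to ap->min_target for the
+ * final scan.)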
+ *
+ * Returns: 0 on success,
+ *          -ENOSPC if a suitable rgrp can't be found
+ *          errno otherwise
+ */
+
+int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *begin = NULL;
+	struct gfs2_blkreserv *rs = &ip->i_res;
+	int error = 0, flags = LM_FLAG_NODE_SCOPE;
+	bool rg_locked;
+	u64 last_unlinked = NO_BLOCK;
+	u32 target = ap->target;
+	int loops = 0;
+	u32 free_blocks, blocks_available, skip = 0;
+
+	BUG_ON(rs->rs_reserved);
+
+	if (sdp->sd_args.ar_rgrplvb)
+		flags |= GL_SKIP;
+	if (gfs2_assert_warn(sdp, target))
+		return -EINVAL;
+	if (gfs2_rs_active(rs)) {
+		begin = rs->rs_rgd;
+	} else if (rs->rs_rgd &&
+		   rgrp_contains_block(rs->rs_rgd, ip->i_goal)) {
+		begin = rs->rs_rgd;
+	} else {
+		check_and_update_goal(ip);
+		rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
+	}
+	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
+		skip = gfs2_orlov_skip(ip);
+	if (rs->rs_rgd == NULL)
+		return -EBADSLT;
+
+	while (loops < 3) {
+		struct gfs2_rgrpd *rgd;
+
+		rg_locked = gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl);
+		if (rg_locked) {
+			rgrp_lock_local(rs->rs_rgd);
+		} else {
+			if (skip && skip--)
+				goto next_rgrp;
+			if (!gfs2_rs_active(rs)) {
+				if (loops == 0 &&
+				    !fast_to_acquire(rs->rs_rgd))
+					goto next_rgrp;
+				if ((loops < 2) &&
+				    gfs2_rgrp_used_recently(rs, 1000) &&
+				    gfs2_rgrp_congested(rs->rs_rgd, loops))
+					goto next_rgrp;
+			}
+			error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
+						   LM_ST_EXCLUSIVE, flags,
+						   &ip->i_rgd_gh);
+			if (unlikely(error))
+				return error;
+			rgrp_lock_local(rs->rs_rgd);
+			if (!gfs2_rs_active(rs) && (loops < 2) &&
+			    gfs2_rgrp_congested(rs->rs_rgd, loops))
+				goto skip_rgrp;
+			if (sdp->sd_args.ar_rgrplvb) {
+				error = update_rgrp_lvb(rs->rs_rgd,
+							&ip->i_rgd_gh);
+				if (unlikely(error)) {
+					rgrp_unlock_local(rs->rs_rgd);
+					gfs2_glock_dq_uninit(&ip->i_rgd_gh);
+					return error;
+				}
+			}
+		}
+
+		/* Skip unusable resource groups */
+		if ((rs->rs_rgd->rd_flags & (GFS2_RGF_NOALLOC |
+						 GFS2_RDF_ERROR)) ||
+		    (loops == 0 && target > rs->rs_rgd->rd_extfail_pt))
+			goto skip_rgrp;
+
+		if (sdp->sd_args.ar_rgrplvb) {
+			error = gfs2_instantiate(&ip->i_rgd_gh);
+			if (error)
+				goto skip_rgrp;
+		}
+
+		/* Get a reservation if we don't already have one */
+		if (!gfs2_rs_active(rs))
+			rg_mblk_search(rs->rs_rgd, ip, ap);
+
+		/* Skip rgrps when we can't get a reservation on first pass */
+		if (!gfs2_rs_active(rs) && (loops < 1))
+			goto check_rgrp;
+
+		/* If rgrp has enough free space, use it */
+		rgd = rs->rs_rgd;
+		spin_lock(&rgd->rd_rsspin);
+		free_blocks = rgd_free(rgd, rs);
+		blocks_available = rgd->rd_free_clone - rgd->rd_reserved;
+		if (free_blocks < target || blocks_available < target) {
+			spin_unlock(&rgd->rd_rsspin);
+			goto check_rgrp;
+		}
+		rs->rs_reserved = ap->target;
+		if (rs->rs_reserved > blocks_available)
+			rs->rs_reserved = blocks_available;
+		rgd->rd_reserved += rs->rs_reserved;
+		spin_unlock(&rgd->rd_rsspin);
+		rgrp_unlock_local(rs->rs_rgd);
+		return 0;
+check_rgrp:
+		/* Check for unlinked inodes which can be reclaimed */
+		if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK)
+			try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
+					ip->i_no_addr);
+skip_rgrp:
+		rgrp_unlock_local(rs->rs_rgd);
+
+		/* Drop reservation, if we couldn't use reserved rgrp */
+		if (gfs2_rs_active(rs))
+			gfs2_rs_deltree(rs);
+
+		/* Unlock rgrp if required */
+		if (!rg_locked)
+			gfs2_glock_dq_uninit(&ip->i_rgd_gh);
+next_rgrp:
+		/* Find the next rgrp, and continue looking */
+		if (gfs2_select_rgrp(&rs->rs_rgd, begin))
+ continue;
+ if (skip)
+ continue;
+
+ /* If we've scanned all the rgrps, but found no free blocks
+ * then this checks for some less likely conditions before
+ * trying again.
+ */
+ loops++;
+ /* Check that fs hasn't grown if writing to rindex */
+ if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) {
+ error = gfs2_ri_update(ip);
+ if (error)
+ return error;
+ }
+ /* Flushing the log may release space */
+ if (loops == 2) {
+ if (ap->min_target)
+ target = ap->min_target;
+ gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
+ GFS2_LFC_INPLACE_RESERVE);
+ }
+ }
+
+ return -ENOSPC;
+}
+
+/**
+ * gfs2_inplace_release - release an inplace reservation
+ * @ip: the inode the reservation was taken out on
+ *
+ * Release a reservation made by gfs2_inplace_reserve().
+ */
+
+void gfs2_inplace_release(struct gfs2_inode *ip)
+{
+ struct gfs2_blkreserv *rs = &ip->i_res;
+
+ if (rs->rs_reserved) {
+ struct gfs2_rgrpd *rgd = rs->rs_rgd;
+
+ spin_lock(&rgd->rd_rsspin);
+ GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved < rs->rs_reserved);
+ rgd->rd_reserved -= rs->rs_reserved;
+ spin_unlock(&rgd->rd_rsspin);
+ rs->rs_reserved = 0;
+ }
+ if (gfs2_holder_initialized(&ip->i_rgd_gh))
+ gfs2_glock_dq_uninit(&ip->i_rgd_gh);
+}
+
+/**
+ * gfs2_alloc_extent - allocate an extent from a given bitmap
+ * @rbm: the position of the first block of the extent
+ * @dinode: TRUE if the first block we allocate is for a dinode
+ * @n: The extent length (value/result)
+ *
+ * Add the bitmap buffer to the transaction.
+ * Mark the first block GFS2_BLKST_DINODE or GFS2_BLKST_USED (per @dinode)
+ * and any following blocks of the extent GFS2_BLKST_USED.
+ */
+static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
+ unsigned int *n)
+{
+ struct gfs2_rbm pos = { .rgd = rbm->rgd, };
+ const unsigned int elen = *n;
+ u64 block;
+ int ret;
+
+ *n = 1;
+ block = gfs2_rbm_to_block(rbm);
+ gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
+ gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
+ block++;
+ while (*n < elen) {
+ ret = gfs2_rbm_from_block(&pos, block);
+ if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
+ break;
+ gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
+ gfs2_setbit(&pos, true, GFS2_BLKST_USED);
+ (*n)++;
+ block++;
+ }
+}
+
+/**
+ * rgblk_free - Change alloc state of given block(s)
+ * @sdp: the filesystem
+ * @rgd: the resource group the blocks are in
+ * @bstart: the start of a run of blocks to free
+ * @blen: the length of the block run (all must lie within ONE RG!)
+ * @new_state: GFS2_BLKST_XXX the after-allocation block state + */ + +static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen, unsigned char new_state) +{ + struct gfs2_rbm rbm; + struct gfs2_bitmap *bi, *bi_prev = NULL; + + rbm.rgd = rgd; + if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart))) + return; + while (blen--) { + bi = rbm_bi(&rbm); + if (bi != bi_prev) { + if (!bi->bi_clone) { + bi->bi_clone = kmalloc(bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, + bi->bi_bytes); + } + gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh); + bi_prev = bi; + } + gfs2_setbit(&rbm, false, new_state); + gfs2_rbm_add(&rbm, 1); + } +} + +/** + * gfs2_rgrp_dump - print out an rgrp + * @seq: The iterator + * @rgd: The rgrp in question + * @fs_id_buf: pointer to file system id (if requested) + * + */ + +void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, + const char *fs_id_buf) +{ + struct gfs2_blkreserv *trs; + const struct rb_node *n; + + spin_lock(&rgd->rd_rsspin); + gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u q:%u r:%u e:%u\n", + fs_id_buf, + (unsigned long long)rgd->rd_addr, rgd->rd_flags, + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, + rgd->rd_requested, rgd->rd_reserved, rgd->rd_extfail_pt); + if (rgd->rd_sbd->sd_args.ar_rgrplvb && rgd->rd_rgl) { + struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; + + gfs2_print_dbg(seq, "%s L: f:%02x b:%u i:%u\n", fs_id_buf, + be32_to_cpu(rgl->rl_flags), + be32_to_cpu(rgl->rl_free), + be32_to_cpu(rgl->rl_dinodes)); + } + for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { + trs = rb_entry(n, struct gfs2_blkreserv, rs_node); + dump_rs(seq, trs, fs_id_buf); + } + spin_unlock(&rgd->rd_rsspin); +} + +static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + char fs_id_buf[sizeof(sdp->sd_fsname) + 7]; + + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname); + gfs2_rgrp_dump(NULL, rgd, fs_id_buf); + rgd->rd_flags |= GFS2_RDF_ERROR; +} + +/** + * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation + * @ip: The inode we have just allocated blocks for + * @rbm: The start of the allocated blocks + * @len: The extent length + * + * Adjusts a reservation after an allocation has taken place. If the + * reservation does not match the allocation, or if it is now empty + * then it is removed. + */ + +static void gfs2_adjust_reservation(struct gfs2_inode *ip, + const struct gfs2_rbm *rbm, unsigned len) +{ + struct gfs2_blkreserv *rs = &ip->i_res; + struct gfs2_rgrpd *rgd = rbm->rgd; + + BUG_ON(rs->rs_reserved < len); + rs->rs_reserved -= len; + if (gfs2_rs_active(rs)) { + u64 start = gfs2_rbm_to_block(rbm); + + if (rs->rs_start == start) { + unsigned int rlen; + + rs->rs_start += len; + rlen = min(rs->rs_requested, len); + rs->rs_requested -= rlen; + rgd->rd_requested -= rlen; + trace_gfs2_rs(rs, TRACE_RS_CLAIM); + if (rs->rs_start < rgd->rd_data0 + rgd->rd_data && + rs->rs_requested) + return; + /* We used up our block reservation, so we should + reserve more blocks next time. 
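+ (Illustrative: with RGRP_RSRV_ADDBLKS at 64, a steadily growing
+ file that keeps draining its reservation has i_sizehint bumped
+ by 64 blocks on every claim, so the next reservation that
+ rg_mblk_search() carves out can be correspondingly larger.)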
*/ + atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint); + } + __rs_deltree(rs); + } +} + +/** + * gfs2_set_alloc_start - Set starting point for block allocation + * @rbm: The rbm which will be set to the required location + * @ip: The gfs2 inode + * @dinode: Flag to say if allocation includes a new inode + * + * This sets the starting point from the reservation if one is active + * otherwise it falls back to guessing a start point based on the + * inode's goal block or the last allocation point in the rgrp. + */ + +static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, + const struct gfs2_inode *ip, bool dinode) +{ + u64 goal; + + if (gfs2_rs_active(&ip->i_res)) { + goal = ip->i_res.rs_start; + } else { + if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0; + } + if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) { + rbm->bii = 0; + rbm->offset = 0; + } +} + +/** + * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode + * @ip: the inode to allocate the block for + * @bn: Used to return the starting block number + * @nblocks: requested number of blocks/extent length (value/result) + * @dinode: 1 if we're allocating a dinode block, else 0 + * @generation: the generation number of the inode + * + * Returns: 0 or error + */ + +int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, + bool dinode, u64 *generation) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct buffer_head *dibh; + struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rgd, }; + u64 block; /* block, within the file system scope */ + u32 minext = 1; + int error = -ENOSPC; + + BUG_ON(ip->i_res.rs_reserved < *nblocks); + + rgrp_lock_local(rbm.rgd); + if (gfs2_rs_active(&ip->i_res)) { + gfs2_set_alloc_start(&rbm, ip, dinode); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, &ip->i_res, false); + } + if (error == -ENOSPC) { + gfs2_set_alloc_start(&rbm, ip, dinode); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, NULL, false); + } + + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (error) { + fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n", + (unsigned long long)ip->i_no_addr, error, *nblocks, + test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags), + rbm.rgd->rd_extfail_pt); + goto rgrp_error; + } + + gfs2_alloc_extent(&rbm, dinode, nblocks); + block = gfs2_rbm_to_block(&rbm); + rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0; + if (!dinode) { + ip->i_goal = block + *nblocks - 1; + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error == 0) { + struct gfs2_dinode *di = + (struct gfs2_dinode *)dibh->b_data; + gfs2_trans_add_meta(ip->i_gl, dibh); + di->di_goal_meta = di->di_goal_data = + cpu_to_be64(ip->i_goal); + brelse(dibh); + } + } + spin_lock(&rbm.rgd->rd_rsspin); + gfs2_adjust_reservation(ip, &rbm, *nblocks); + if (rbm.rgd->rd_free < *nblocks || rbm.rgd->rd_reserved < *nblocks) { + fs_warn(sdp, "nblocks=%u\n", *nblocks); + spin_unlock(&rbm.rgd->rd_rsspin); + goto rgrp_error; + } + GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_reserved < *nblocks); + GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free_clone < *nblocks); + GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free < *nblocks); + rbm.rgd->rd_reserved -= *nblocks; + rbm.rgd->rd_free_clone -= *nblocks; + rbm.rgd->rd_free -= *nblocks; + spin_unlock(&rbm.rgd->rd_rsspin); + if (dinode) { + rbm.rgd->rd_dinodes++; + *generation = rbm.rgd->rd_igeneration++; + if (*generation == 0) + *generation = 
rbm.rgd->rd_igeneration++;
+ }
+
+ gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh);
+ gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data);
+ rgrp_unlock_local(rbm.rgd);
+
+ gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0);
+ if (dinode)
+ gfs2_trans_remove_revoke(sdp, block, *nblocks);
+
+ gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid);
+
+ trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks,
+ dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
+ *bn = block;
+ return 0;
+
+rgrp_error:
+ rgrp_unlock_local(rbm.rgd);
+ gfs2_rgrp_error(rbm.rgd);
+ return -EIO;
+}
+
+/**
+ * __gfs2_free_blocks - free a contiguous run of block(s)
+ * @ip: the inode these blocks are being freed from
+ * @rgd: the resource group the blocks are in
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ * @meta: 1 if the blocks represent metadata
+ *
+ */
+
+void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen, int meta)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ rgrp_lock_local(rgd);
+ rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
+ trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
+ rgd->rd_free += blen;
+ rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
+ gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
+ gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
+ rgrp_unlock_local(rgd);
+
+ /* Directories keep their data in the metadata address space */
+ if (meta || ip->i_depth || gfs2_is_jdata(ip))
+ gfs2_journal_wipe(ip, bstart, blen);
+}
+
+/**
+ * gfs2_free_meta - free a contiguous run of metadata block(s)
+ * @ip: the inode these blocks are being freed from
+ * @rgd: the resource group the blocks are in
+ * @bstart: first block of a run of contiguous blocks
+ * @blen: the length of the block run
+ *
+ */
+
+void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen)
+{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+ __gfs2_free_blocks(ip, rgd, bstart, blen, 1);
+ gfs2_statfs_change(sdp, 0, +blen, 0);
+ gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
+}
+
+void gfs2_unlink_di(struct inode *inode)
+{
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+ struct gfs2_rgrpd *rgd;
+ u64 blkno = ip->i_no_addr;
+
+ rgd = gfs2_blk2rgrpd(sdp, blkno, true);
+ if (!rgd)
+ return;
+ rgrp_lock_local(rgd);
+ rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
+ trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
+ gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
+ gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
+ be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1);
+ rgrp_unlock_local(rgd);
+}
+
+void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
+{
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+
+ rgrp_lock_local(rgd);
+ rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
+ if (!rgd->rd_dinodes)
+ gfs2_consist_rgrpd(rgd);
+ rgd->rd_dinodes--;
+ rgd->rd_free++;
+
+ gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
+ gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
+ be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1);
+ rgrp_unlock_local(rgd);
+
+ gfs2_statfs_change(sdp, 0, +1, -1);
+ trace_gfs2_block_alloc(ip, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
+ gfs2_quota_change(ip, -1, ip->i_inode.i_uid, ip->i_inode.i_gid);
+ gfs2_journal_wipe(ip, ip->i_no_addr, 1);
+}
+
+/**
+ * gfs2_check_blk_type - Check the type of a block
+ * @sdp: The superblock
+ * @no_addr: The
block number to check + * @type: The block type we are looking for + * + * The inode glock of @no_addr must be held. The @type to check for is either + * GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; checking for type GFS2_BLKST_FREE + * or GFS2_BLKST_USED would make no sense. + * + * Returns: 0 if the block type matches the expected type + * -ESTALE if it doesn't match + * or -ve errno if something went wrong while checking + */ + +int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_holder rgd_gh; + struct gfs2_rbm rbm; + int error = -EINVAL; + + rgd = gfs2_blk2rgrpd(sdp, no_addr, 1); + if (!rgd) + goto fail; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail; + + rbm.rgd = rgd; + error = gfs2_rbm_from_block(&rbm, no_addr); + if (!WARN_ON_ONCE(error)) { + /* + * No need to take the local resource group lock here; the + * inode glock of @no_addr provides the necessary + * synchronization in case the block is an inode. (In case + * the block is not an inode, the block type will not match + * the @type we are looking for.) + */ + if (gfs2_testbit(&rbm, false) != type) + error = -ESTALE; + } + + gfs2_glock_dq_uninit(&rgd_gh); + +fail: + return error; +} + +/** + * gfs2_rlist_add - add a RG to a list of RGs + * @ip: the inode + * @rlist: the list of resource groups + * @block: the block + * + * Figure out what RG a block belongs to and add that RG to the list + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, + u64 block) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_rgrpd **tmp; + unsigned int new_space; + unsigned int x; + + if (gfs2_assert_warn(sdp, !rlist->rl_ghs)) + return; + + /* + * The resource group last accessed is kept in the last position. 
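+ * This makes the common case of adding runs of blocks from the
+ * same rgrp O(1): the hit is found at rl_rgd[rl_rgrps - 1] without
+ * scanning, and the swap() in the loop below moves any rgrp found
+ * by scanning into that last slot for the next call.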
+ */ + + if (rlist->rl_rgrps) { + rgd = rlist->rl_rgd[rlist->rl_rgrps - 1]; + if (rgrp_contains_block(rgd, block)) + return; + rgd = gfs2_blk2rgrpd(sdp, block, 1); + } else { + rgd = ip->i_res.rs_rgd; + if (!rgd || !rgrp_contains_block(rgd, block)) + rgd = gfs2_blk2rgrpd(sdp, block, 1); + } + + if (!rgd) { + fs_err(sdp, "rlist_add: no rgrp for block %llu\n", + (unsigned long long)block); + return; + } + + for (x = 0; x < rlist->rl_rgrps; x++) { + if (rlist->rl_rgd[x] == rgd) { + swap(rlist->rl_rgd[x], + rlist->rl_rgd[rlist->rl_rgrps - 1]); + return; + } + } + + if (rlist->rl_rgrps == rlist->rl_space) { + new_space = rlist->rl_space + 10; + + tmp = kcalloc(new_space, sizeof(struct gfs2_rgrpd *), + GFP_NOFS | __GFP_NOFAIL); + + if (rlist->rl_rgd) { + memcpy(tmp, rlist->rl_rgd, + rlist->rl_space * sizeof(struct gfs2_rgrpd *)); + kfree(rlist->rl_rgd); + } + + rlist->rl_space = new_space; + rlist->rl_rgd = tmp; + } + + rlist->rl_rgd[rlist->rl_rgrps++] = rgd; +} + +/** + * gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate + * and initialize an array of glock holders for them + * @rlist: the list of resource groups + * @state: the state we're requesting + * @flags: the modifier flags + * + * FIXME: Don't use NOFAIL + * + */ + +void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, + unsigned int state, u16 flags) +{ + unsigned int x; + + rlist->rl_ghs = kmalloc_array(rlist->rl_rgrps, + sizeof(struct gfs2_holder), + GFP_NOFS | __GFP_NOFAIL); + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, state, flags, + &rlist->rl_ghs[x]); +} + +/** + * gfs2_rlist_free - free a resource group list + * @rlist: the list of resource groups + * + */ + +void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) +{ + unsigned int x; + + kfree(rlist->rl_rgd); + + if (rlist->rl_ghs) { + for (x = 0; x < rlist->rl_rgrps; x++) + gfs2_holder_uninit(&rlist->rl_ghs[x]); + kfree(rlist->rl_ghs); + rlist->rl_ghs = NULL; + } +} + +void rgrp_lock_local(struct gfs2_rgrpd *rgd) +{ + mutex_lock(&rgd->rd_mutex); +} + +void rgrp_unlock_local(struct gfs2_rgrpd *rgd) +{ + mutex_unlock(&rgd->rd_mutex); +} diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h new file mode 100644 index 0000000000..00b30cf893 --- /dev/null +++ b/fs/gfs2/rgrp.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. + */ + +#ifndef __RGRP_DOT_H__ +#define __RGRP_DOT_H__ + +#include <linux/slab.h> +#include <linux/uaccess.h> + +/* Since each block in the file system is represented by two bits in the + * bitmap, one 64-bit word in the bitmap will represent 32 blocks. + * By reserving 32 blocks at a time, we can optimize / shortcut how we search + * through the bitmaps by looking a word at a time. 
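+ * (Worked out: 2 bits/block * 32 blocks = 64 bits = one u64 word,
+ * which is why RGRP_RSRV_MINBLKS below is 32.)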
+ */ +#define RGRP_RSRV_MINBLKS 32 +#define RGRP_RSRV_ADDBLKS 64 + +struct gfs2_rgrpd; +struct gfs2_sbd; +struct gfs2_holder; + +extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); + +extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); +extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); + +extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +extern int gfs2_rindex_update(struct gfs2_sbd *sdp); +extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); +extern int gfs2_rgrp_go_instantiate(struct gfs2_glock *gl); +extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); + +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); + +#define GFS2_AF_ORLOV 1 +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, + struct gfs2_alloc_parms *ap); +extern void gfs2_inplace_release(struct gfs2_inode *ip); + +extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, + bool dinode, u64 *generation); + +extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); +extern void gfs2_rs_delete(struct gfs2_inode *ip); +extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen, int meta); +extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 bstart, u32 blen); +extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +extern void gfs2_unlink_di(struct inode *inode); +extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, + unsigned int type); + +struct gfs2_rgrp_list { + unsigned int rl_rgrps; + unsigned int rl_space; + struct gfs2_rgrpd **rl_rgd; + struct gfs2_holder *rl_ghs; +}; + +extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, + u64 block); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, + unsigned int state, u16 flags); +extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, + const char *fs_id_buf); +extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); +extern int gfs2_fitrim(struct file *filp, void __user *argp); + +/* This is how to tell if a reservation is in the rgrp tree: */ +static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs) +{ + return !RB_EMPTY_NODE(&rs->rs_node); +} + +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) +{ + u64 first = rgd->rd_data0; + u64 last = first + rgd->rd_data; + return first <= block && block < last; +} + +extern void check_and_update_goal(struct gfs2_inode *ip); + +extern void rgrp_lock_local(struct gfs2_rgrpd *rgd); +extern void rgrp_unlock_local(struct gfs2_rgrpd *rgd); + +#endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c new file mode 100644 index 0000000000..5f4ebe279a --- /dev/null +++ b/fs/gfs2/super.c @@ -0,0 +1,1624 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/bio.h> +#include <linux/sched/signal.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/statfs.h> +#include <linux/seq_file.h> +#include <linux/mount.h> +#include <linux/kthread.h> +#include <linux/delay.h> +#include <linux/gfs2_ondisk.h> +#include <linux/crc32.h> +#include <linux/time.h> +#include <linux/wait.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/kernel.h> + +#include "gfs2.h" +#include "incore.h" +#include "bmap.h" +#include "dir.h" +#include "glock.h" +#include "glops.h" +#include "inode.h" +#include "log.h" +#include "meta_io.h" +#include "quota.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "trans.h" +#include "util.h" +#include "sys.h" +#include "xattr.h" +#include "lops.h" + +enum dinode_demise { + SHOULD_DELETE_DINODE, + SHOULD_NOT_DELETE_DINODE, + SHOULD_DEFER_EVICTION, +}; + +/** + * gfs2_jindex_free - Clear all the journal index information + * @sdp: The GFS2 superblock + * + */ + +void gfs2_jindex_free(struct gfs2_sbd *sdp) +{ + struct list_head list; + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + list_add(&list, &sdp->sd_jindex_list); + list_del_init(&sdp->sd_jindex_list); + sdp->sd_journals = 0; + spin_unlock(&sdp->sd_jindex_spin); + + sdp->sd_jdesc = NULL; + while (!list_empty(&list)) { + jd = list_first_entry(&list, struct gfs2_jdesc, jd_list); + gfs2_free_journal_extents(jd); + list_del(&jd->jd_list); + iput(jd->jd_inode); + jd->jd_inode = NULL; + kfree(jd); + } +} + +static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + list_for_each_entry(jd, head, jd_list) { + if (jd->jd_jid == jid) + return jd; + } + return NULL; +} + +struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + jd = jdesc_find_i(&sdp->sd_jindex_list, jid); + spin_unlock(&sdp->sd_jindex_spin); + + return jd; +} + +int gfs2_jdesc_check(struct gfs2_jdesc *jd) +{ + struct gfs2_inode *ip = GFS2_I(jd->jd_inode); + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + u64 size = i_size_read(jd->jd_inode); + + if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, BIT(30))) + return -EIO; + + jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift; + + if (gfs2_write_alloc_required(ip, 0, size)) { + gfs2_consist_inode(ip); + return -EIO; + } + + return 0; +} + +/** + * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one + * @sdp: the filesystem + * + * Returns: errno + */ + +int gfs2_make_fs_rw(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); + struct gfs2_glock *j_gl = ip->i_gl; + struct gfs2_log_header_host head; + int error; + + j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); + if (gfs2_withdrawn(sdp)) + return -EIO; + + error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); + if (error) { + gfs2_consist(sdp); + return error; + } + + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + gfs2_consist(sdp); + return -EIO; + } + + /* Initialize some head of the log stuff */ + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); + + error = gfs2_quota_init(sdp); + if (!error && gfs2_withdrawn(sdp)) + error = -EIO; + if (!error) + set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + return error; +} + +void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void 
*buf) +{ + const struct gfs2_statfs_change *str = buf; + + sc->sc_total = be64_to_cpu(str->sc_total); + sc->sc_free = be64_to_cpu(str->sc_free); + sc->sc_dinodes = be64_to_cpu(str->sc_dinodes); +} + +void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf) +{ + struct gfs2_statfs_change *str = buf; + + str->sc_total = cpu_to_be64(sc->sc_total); + str->sc_free = cpu_to_be64(sc->sc_free); + str->sc_dinodes = cpu_to_be64(sc->sc_dinodes); +} + +int gfs2_statfs_init(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh; + struct gfs2_holder gh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + return error; + + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out; + + if (sdp->sd_args.ar_spectator) { + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + } else { + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + gfs2_statfs_change_in(l_sc, sdp->sd_sc_bh->b_data + + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); + + } + + brelse(m_bh); +out: + gfs2_glock_dq_uninit(&gh); + return 0; +} + +void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes) +{ + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + s64 x, y; + int need_sync = 0; + + gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh); + + spin_lock(&sdp->sd_statfs_spin); + l_sc->sc_total += total; + l_sc->sc_free += free; + l_sc->sc_dinodes += dinodes; + gfs2_statfs_change_out(l_sc, sdp->sd_sc_bh->b_data + + sizeof(struct gfs2_dinode)); + if (sdp->sd_args.ar_statfs_percent) { + x = 100 * l_sc->sc_free; + y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent; + if (x >= y || x <= -y) + need_sync = 1; + } + spin_unlock(&sdp->sd_statfs_spin); + + if (need_sync) + gfs2_wake_up_statfs(sdp); +} + +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh); + gfs2_trans_add_meta(m_ip->i_gl, m_bh); + + spin_lock(&sdp->sd_statfs_spin); + m_sc->sc_total += l_sc->sc_total; + m_sc->sc_free += l_sc->sc_free; + m_sc->sc_dinodes += l_sc->sc_dinodes; + memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); + memset(sdp->sd_sc_bh->b_data + sizeof(struct gfs2_dinode), + 0, sizeof(struct gfs2_statfs_change)); + gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); +} + +int gfs2_statfs_sync(struct super_block *sb, int type) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct gfs2_holder gh; + struct buffer_head *m_bh; + int error; + + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, + &gh); + if (error) + goto out; 
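+
+ /*
+ * With the master statfs glock held, fold the local statfs delta
+ * into the master copy, but skip the transaction entirely when
+ * the local delta is zero and there is nothing to sync.
+ */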
+ + error = gfs2_meta_inode_buffer(m_ip, &m_bh); + if (error) + goto out_unlock; + + spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); + if (!l_sc->sc_total && !l_sc->sc_free && !l_sc->sc_dinodes) { + spin_unlock(&sdp->sd_statfs_spin); + goto out_bh; + } + spin_unlock(&sdp->sd_statfs_spin); + + error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); + if (error) + goto out_bh; + + update_statfs(sdp, m_bh); + sdp->sd_statfs_force_sync = 0; + + gfs2_trans_end(sdp); + +out_bh: + brelse(m_bh); +out_unlock: + gfs2_glock_dq_uninit(&gh); +out: + return error; +} + +struct lfcc { + struct list_head list; + struct gfs2_holder gh; +}; + +/** + * gfs2_lock_fs_check_clean - Stop all writes to the FS and check that all + * journals are clean + * @sdp: the file system + * + * Returns: errno + */ + +static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + struct gfs2_jdesc *jd; + struct lfcc *lfcc; + LIST_HEAD(list); + struct gfs2_log_header_host lh; + int error, error2; + + /* + * Grab all the journal glocks in SH mode. We are *probably* doing + * that to prevent recovery. + */ + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); + if (!lfcc) { + error = -ENOMEM; + goto out; + } + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &lfcc->gh); + if (error) { + kfree(lfcc); + goto out; + } + list_add(&lfcc->list, &list); + } + + gfs2_freeze_unlock(&sdp->sd_freeze_gh); + + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NOEXP | GL_NOPID, + &sdp->sd_freeze_gh); + if (error) + goto relock_shared; + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + error = gfs2_jdesc_check(jd); + if (error) + break; + error = gfs2_find_jhead(jd, &lh, false); + if (error) + break; + if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EBUSY; + break; + } + } + + if (!error) + goto out; /* success */ + + gfs2_freeze_unlock(&sdp->sd_freeze_gh); + +relock_shared: + error2 = gfs2_freeze_lock_shared(sdp); + gfs2_assert_withdraw(sdp, !error2); + +out: + while (!list_empty(&list)) { + lfcc = list_first_entry(&list, struct lfcc, list); + list_del(&lfcc->list); + gfs2_glock_dq_uninit(&lfcc->gh); + kfree(lfcc); + } + return error; +} + +void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) +{ + const struct inode *inode = &ip->i_inode; + struct gfs2_dinode *str = buf; + + str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + str->di_header.mh_type = cpu_to_be32(GFS2_METATYPE_DI); + str->di_header.mh_format = cpu_to_be32(GFS2_FORMAT_DI); + str->di_num.no_addr = cpu_to_be64(ip->i_no_addr); + str->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); + str->di_mode = cpu_to_be32(inode->i_mode); + str->di_uid = cpu_to_be32(i_uid_read(inode)); + str->di_gid = cpu_to_be32(i_gid_read(inode)); + str->di_nlink = cpu_to_be32(inode->i_nlink); + str->di_size = cpu_to_be64(i_size_read(inode)); + str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode)); + str->di_atime = cpu_to_be64(inode->i_atime.tv_sec); + str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec); + str->di_ctime = cpu_to_be64(inode_get_ctime(inode).tv_sec); + + str->di_goal_meta = cpu_to_be64(ip->i_goal); + str->di_goal_data = cpu_to_be64(ip->i_goal); + str->di_generation = cpu_to_be64(ip->i_generation); + + str->di_flags = cpu_to_be32(ip->i_diskflags); + str->di_height = cpu_to_be16(ip->i_height); + str->di_payload_format = cpu_to_be32(S_ISDIR(inode->i_mode) && + 
!(ip->i_diskflags & GFS2_DIF_EXHASH) ? + GFS2_FORMAT_DE : 0); + str->di_depth = cpu_to_be16(ip->i_depth); + str->di_entries = cpu_to_be32(ip->i_entries); + + str->di_eattr = cpu_to_be64(ip->i_eattr); + str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec); + str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(inode_get_ctime(inode).tv_nsec); +} + +/** + * gfs2_write_inode - Make sure the inode is stable on the disk + * @inode: The inode + * @wbc: The writeback control structure + * + * Returns: errno + */ + +static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); + struct backing_dev_info *bdi = inode_to_bdi(metamapping->host); + int ret = 0; + bool flush_all = (wbc->sync_mode == WB_SYNC_ALL || gfs2_is_jdata(ip)); + + if (flush_all) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl, + GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_WRITE_INODE); + if (bdi->wb.dirty_exceeded) + gfs2_ail1_flush(sdp, wbc); + else + filemap_fdatawrite(metamapping); + if (flush_all) + ret = filemap_fdatawait(metamapping); + if (ret) + mark_inode_dirty_sync(inode); + else { + spin_lock(&inode->i_lock); + if (!(inode->i_flags & I_DIRTY)) + gfs2_ordered_del_inode(ip); + spin_unlock(&inode->i_lock); + } + return ret; +} + +/** + * gfs2_dirty_inode - check for atime updates + * @inode: The inode in question + * @flags: The type of dirty + * + * Unfortunately it can be called under any combination of inode + * glock and freeze glock, so we have to check carefully. + * + * At the moment this deals only with atime - it should be possible + * to expand that role in future, once a review of the locking has + * been carried out. + */ + +static void gfs2_dirty_inode(struct inode *inode, int flags) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct buffer_head *bh; + struct gfs2_holder gh; + int need_unlock = 0; + int need_endtrans = 0; + int ret; + + if (unlikely(!ip->i_gl)) { + /* This can only happen during incomplete inode creation. 
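+ The inode glock is attached late in inode creation (see the
+ GIF_ALLOC_FAILED test below), and without a glock there is
+ nothing we can journal, so simply bail out.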
*/ + BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + return; + } + + if (unlikely(gfs2_withdrawn(sdp))) + return; + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) { + fs_err(sdp, "dirty_inode: glock %d\n", ret); + gfs2_dump_glock(NULL, ip->i_gl, true); + return; + } + need_unlock = 1; + } else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) + return; + + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) { + fs_err(sdp, "dirty_inode: gfs2_trans_begin %d\n", ret); + goto out; + } + need_endtrans = 1; + } + + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + gfs2_trans_add_meta(ip->i_gl, bh); + gfs2_dinode_out(ip, bh->b_data); + brelse(bh); + } + + if (need_endtrans) + gfs2_trans_end(sdp); +out: + if (need_unlock) + gfs2_glock_dq_uninit(&gh); +} + +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +void gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (!test_bit(SDF_KILL, &sdp->sd_flags)) + gfs2_flush_delete_work(sdp); + + gfs2_destroy_threads(sdp); + + if (log_write_allowed) { + gfs2_quota_sync(sdp->sd_vfs, 0); + gfs2_statfs_sync(sdp->sd_vfs, 0); + + /* We do two log flushes here. The first one commits dirty inodes + * and rgrps to the journal, but queues up revokes to the ail list. + * The second flush writes out and removes the revokes. + * + * The first must be done before the FLUSH_SHUTDOWN code + * clears the LIVE flag, otherwise it will not be able to start + * a transaction to write its revokes, and the error will cause + * a withdraw of the file system. */ + gfs2_log_flush(sdp, NULL, GFS2_LFC_MAKE_FS_RO); + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN | + GFS2_LFC_MAKE_FS_RO); + wait_event_timeout(sdp->sd_log_waitq, + gfs2_log_is_empty(sdp), + HZ * 5); + gfs2_assert_warn(sdp, gfs2_log_is_empty(sdp)); + } + gfs2_quota_cleanup(sdp); +} + +/** + * gfs2_put_super - Unmount the filesystem + * @sb: The VFS superblock + * + */ + +static void gfs2_put_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_jdesc *jd; + + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + + if (!sb_rdonly(sb)) + gfs2_make_fs_ro(sdp); + else { + if (gfs2_withdrawn(sdp)) + gfs2_destroy_threads(sdp); + + gfs2_quota_cleanup(sdp); + } + + WARN_ON(gfs2_withdrawing(sdp)); + + /* At this point, we're through modifying the disk */ + + /* Release stuff */ + + gfs2_freeze_unlock(&sdp->sd_freeze_gh); + + iput(sdp->sd_jindex); + iput(sdp->sd_statfs_inode); + iput(sdp->sd_rindex); + iput(sdp->sd_quota_inode); + + gfs2_glock_put(sdp->sd_rename_gl); + gfs2_glock_put(sdp->sd_freeze_gl); + + if (!sdp->sd_args.ar_spectator) { + if (gfs2_holder_initialized(&sdp->sd_journal_gh)) + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + if (gfs2_holder_initialized(&sdp->sd_jinode_gh)) + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + brelse(sdp->sd_sc_bh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + 
free_local_statfs_inodes(sdp); + iput(sdp->sd_qc_inode); + } + + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + gfs2_clear_rgrpd(sdp); + gfs2_jindex_free(sdp); + /* Take apart glock structures and buffer lists */ + gfs2_gl_hash_clear(sdp); + truncate_inode_pages_final(&sdp->sd_aspace); + gfs2_delete_debugfs_file(sdp); + /* Unmount the locking protocol */ + gfs2_lm_unmount(sdp); + + /* At this point, we're through participating in the lockspace */ + gfs2_sys_fs_del(sdp); + free_sbd(sdp); +} + +/** + * gfs2_sync_fs - sync the filesystem + * @sb: the superblock + * @wait: true to wait for completion + * + * Flushes the log to disk. + */ + +static int gfs2_sync_fs(struct super_block *sb, int wait) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + + gfs2_quota_sync(sb, -1); + if (wait) + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_SYNC_FS); + return sdp->sd_log_error; +} + +static int gfs2_freeze_locally(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + int error; + + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); + if (error) + return error; + + if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | + GFS2_LFC_FREEZE_GO_SYNC); + if (gfs2_withdrawn(sdp)) { + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); + if (error) + return error; + return -EIO; + } + } + return 0; +} + +static int gfs2_do_thaw(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + int error; + + error = gfs2_freeze_lock_shared(sdp); + if (error) + goto fail; + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); + if (!error) + return 0; + +fail: + fs_info(sdp, "GFS2: couldn't thaw filesystem: %d\n", error); + gfs2_assert_withdraw(sdp, 0); + return error; +} + +void gfs2_freeze_func(struct work_struct *work) +{ + struct gfs2_sbd *sdp = container_of(work, struct gfs2_sbd, sd_freeze_work); + struct super_block *sb = sdp->sd_vfs; + int error; + + mutex_lock(&sdp->sd_freeze_mutex); + error = -EBUSY; + if (test_bit(SDF_FROZEN, &sdp->sd_flags)) + goto freeze_failed; + + error = gfs2_freeze_locally(sdp); + if (error) + goto freeze_failed; + + gfs2_freeze_unlock(&sdp->sd_freeze_gh); + set_bit(SDF_FROZEN, &sdp->sd_flags); + + error = gfs2_do_thaw(sdp); + if (error) + goto out; + + clear_bit(SDF_FROZEN, &sdp->sd_flags); + goto out; + +freeze_failed: + fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", error); + +out: + mutex_unlock(&sdp->sd_freeze_mutex); + deactivate_super(sb); +} + +/** + * gfs2_freeze_super - prevent further writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + if (!mutex_trylock(&sdp->sd_freeze_mutex)) + return -EBUSY; + error = -EBUSY; + if (test_bit(SDF_FROZEN, &sdp->sd_flags)) + goto out; + + for (;;) { + error = gfs2_freeze_locally(sdp); + if (error) { + fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", + error); + goto out; + } + + error = gfs2_lock_fs_check_clean(sdp); + if (!error) + break; /* success */ + + error = gfs2_do_thaw(sdp); + if (error) + goto out; + + if (error == -EBUSY) + fs_err(sdp, "waiting for recovery before freeze\n"); + else if (error == -EIO) { + fs_err(sdp, "Fatal IO error: cannot freeze gfs2 due " + "to recovery error.\n"); + goto out; + } else { + fs_err(sdp, "error freezing FS: %d\n", error); + } + fs_err(sdp, "retrying...\n"); + msleep(1000); + } + +out: + if (!error) { + set_bit(SDF_FREEZE_INITIATOR, 
&sdp->sd_flags);
+ set_bit(SDF_FROZEN, &sdp->sd_flags);
+ }
+ mutex_unlock(&sdp->sd_freeze_mutex);
+ return error;
+}
+
+/**
+ * gfs2_thaw_super - reallow writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ int error;
+
+ if (!mutex_trylock(&sdp->sd_freeze_mutex))
+ return -EBUSY;
+ error = -EINVAL;
+ if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags))
+ goto out;
+
+ gfs2_freeze_unlock(&sdp->sd_freeze_gh);
+
+ error = gfs2_do_thaw(sdp);
+
+ if (!error) {
+ clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
+ clear_bit(SDF_FROZEN, &sdp->sd_flags);
+ }
+out:
+ mutex_unlock(&sdp->sd_freeze_mutex);
+ return error;
+}
+
+void gfs2_thaw_freeze_initiator(struct super_block *sb)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+
+ mutex_lock(&sdp->sd_freeze_mutex);
+ if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags))
+ goto out;
+
+ gfs2_freeze_unlock(&sdp->sd_freeze_gh);
+
+out:
+ mutex_unlock(&sdp->sd_freeze_mutex);
+}
+
+/**
+ * statfs_slow_fill - fill in the sc structure for a given RG
+ * @rgd: the RG
+ * @sc: the sc structure
+ *
+ * Returns: 0 (this function currently always succeeds)
+ */
+
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+ struct gfs2_statfs_change_host *sc)
+{
+ gfs2_rgrp_verify(rgd);
+ sc->sc_total += rgd->rd_data;
+ sc->sc_free += rgd->rd_free;
+ sc->sc_dinodes += rgd->rd_dinodes;
+ return 0;
+}
+
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ *
+ * FIXME: This really shouldn't busy wait like this.
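+ * (The scheme: keep up to 64 GL_ASYNC shared glock requests in
+ * flight across the rgrps, poll them with gfs2_glock_poll(), fold
+ * each granted rgrp into @sc via statfs_slow_fill(), and yield()
+ * between sweeps; that yield() loop is the busy wait.)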
+ * + * Returns: errno + */ + +static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); + gha = kmalloc_array(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + for (x = 0; x < slots; x++) + gfs2_holder_mark_uninitialized(gha + x); + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gfs2_holder_initialized(gh) && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) { + struct gfs2_rgrpd *rgd = + gfs2_glock2rgrp(gh->gh_gl); + + error = statfs_slow_fill(rgd, sc); + } + gfs2_glock_dq_uninit(gh); + } + } + + if (gfs2_holder_initialized(gh)) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + kfree(gha); + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sc: the sc structure + * + * Returns: errno + */ + +static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if (sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} + +/** + * gfs2_statfs - Gather and return stats about the filesystem + * @dentry: The name of the link + * @buf: The buffer + * + * Returns: 0 on success or error code + */ + +static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_statfs_change_host sc; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + if (gfs2_tune_get(sdp, gt_statfs_slow)) + error = gfs2_statfs_slow(sdp, &sc); + else + error = gfs2_statfs_i(sdp, &sc); + + if (error) + return error; + + buf->f_type = GFS2_MAGIC; + buf->f_bsize = sdp->sd_sb.sb_bsize; + buf->f_blocks = sc.sc_total; + buf->f_bfree = sc.sc_free; + buf->f_bavail = sc.sc_free; + buf->f_files = sc.sc_dinodes + sc.sc_free; + buf->f_ffree = sc.sc_free; + buf->f_namelen = GFS2_FNAMESIZE; + + return 0; +} + +/** + * gfs2_drop_inode - Drop an inode (test for remote unlink) + * @inode: The inode to drop + * + * If we've received a callback on an iopen lock then it's because a + * remote node tried to deallocate the inode but failed due to this node + * still having the inode open. Here we mark the link count zero + * since we know that it must have reached zero if the GLF_DEMOTE flag + * is set on the iopen glock. If we didn't do a disk read since the + * remote node removed the final link then we might otherwise miss + * this event. 
This check ensures that this node will deallocate the + * inode's blocks, or alternatively pass the baton on to another + * node for later deallocation. + */ + +static int gfs2_drop_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + + if (inode->i_nlink && + gfs2_holder_initialized(&ip->i_iopen_gh)) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + if (test_bit(GLF_DEMOTE, &gl->gl_flags)) + clear_nlink(inode); + } + + /* + * When under memory pressure when an inode's link count has dropped to + * zero, defer deleting the inode to the delete workqueue. This avoids + * calling into DLM under memory pressure, which can deadlock. + */ + if (!inode->i_nlink && + unlikely(current->flags & PF_MEMALLOC) && + gfs2_holder_initialized(&ip->i_iopen_gh)) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + gfs2_glock_hold(gl); + if (!gfs2_queue_try_to_evict(gl)) + gfs2_glock_queue_put(gl); + return 0; + } + + /* + * No longer cache inodes when trying to evict them all. + */ + if (test_bit(SDF_EVICTING, &sdp->sd_flags)) + return 1; + + return generic_drop_inode(inode); +} + +static int is_ancestor(const struct dentry *d1, const struct dentry *d2) +{ + do { + if (d1 == d2) + return 1; + d1 = d1->d_parent; + } while (!IS_ROOT(d1)); + return 0; +} + +/** + * gfs2_show_options - Show mount options for /proc/mounts + * @s: seq_file structure + * @root: root of this (sub)tree + * + * Returns: 0 on success or error code + */ + +static int gfs2_show_options(struct seq_file *s, struct dentry *root) +{ + struct gfs2_sbd *sdp = root->d_sb->s_fs_info; + struct gfs2_args *args = &sdp->sd_args; + unsigned int logd_secs, statfs_slow, statfs_quantum, quota_quantum; + + spin_lock(&sdp->sd_tune.gt_spin); + logd_secs = sdp->sd_tune.gt_logd_secs; + quota_quantum = sdp->sd_tune.gt_quota_quantum; + statfs_quantum = sdp->sd_tune.gt_statfs_quantum; + statfs_slow = sdp->sd_tune.gt_statfs_slow; + spin_unlock(&sdp->sd_tune.gt_spin); + + if (is_ancestor(root, sdp->sd_master_dir)) + seq_puts(s, ",meta"); + if (args->ar_lockproto[0]) + seq_show_option(s, "lockproto", args->ar_lockproto); + if (args->ar_locktable[0]) + seq_show_option(s, "locktable", args->ar_locktable); + if (args->ar_hostdata[0]) + seq_show_option(s, "hostdata", args->ar_hostdata); + if (args->ar_spectator) + seq_puts(s, ",spectator"); + if (args->ar_localflocks) + seq_puts(s, ",localflocks"); + if (args->ar_debug) + seq_puts(s, ",debug"); + if (args->ar_posix_acl) + seq_puts(s, ",acl"); + if (args->ar_quota != GFS2_QUOTA_DEFAULT) { + char *state; + switch (args->ar_quota) { + case GFS2_QUOTA_OFF: + state = "off"; + break; + case GFS2_QUOTA_ACCOUNT: + state = "account"; + break; + case GFS2_QUOTA_ON: + state = "on"; + break; + case GFS2_QUOTA_QUIET: + state = "quiet"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",quota=%s", state); + } + if (args->ar_suiddir) + seq_puts(s, ",suiddir"); + if (args->ar_data != GFS2_DATA_DEFAULT) { + char *state; + switch (args->ar_data) { + case GFS2_DATA_WRITEBACK: + state = "writeback"; + break; + case GFS2_DATA_ORDERED: + state = "ordered"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",data=%s", state); + } + if (args->ar_discard) + seq_puts(s, ",discard"); + if (logd_secs != 30) + seq_printf(s, ",commit=%d", logd_secs); + if (statfs_quantum != 30) + seq_printf(s, ",statfs_quantum=%d", statfs_quantum); + else if (statfs_slow) + seq_puts(s, ",statfs_quantum=0"); + if (quota_quantum != 60) + seq_printf(s, 
",quota_quantum=%d", quota_quantum); + if (args->ar_statfs_percent) + seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent); + if (args->ar_errors != GFS2_ERRORS_DEFAULT) { + const char *state; + + switch (args->ar_errors) { + case GFS2_ERRORS_WITHDRAW: + state = "withdraw"; + break; + case GFS2_ERRORS_PANIC: + state = "panic"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",errors=%s", state); + } + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) + seq_puts(s, ",nobarrier"); + if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) + seq_puts(s, ",demote_interface_used"); + if (args->ar_rgrplvb) + seq_puts(s, ",rgrplvb"); + if (args->ar_loccookie) + seq_puts(s, ",loccookie"); + return 0; +} + +static void gfs2_final_release_pages(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_glock *gl = ip->i_gl; + + if (unlikely(!gl)) { + /* This can only happen during incomplete inode creation. */ + BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + return; + } + + truncate_inode_pages(gfs2_glock2aspace(gl), 0); + truncate_inode_pages(&inode->i_data, 0); + + if (atomic_read(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + clear_bit(GLF_DIRTY, &gl->gl_flags); + } +} + +static int gfs2_dinode_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_holder gh; + int error; + + if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { + gfs2_consist_inode(ip); + return -EIO; + } + + gfs2_rindex_update(sdp); + + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (error) + return error; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); + if (!rgd) { + gfs2_consist_inode(ip); + error = -EIO; + goto out_qs; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &gh); + if (error) + goto out_qs; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, + sdp->sd_jdesc->jd_blocks); + if (error) + goto out_rg_gunlock; + + gfs2_free_di(rgd, ip); + + gfs2_final_release_pages(ip); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_uninit(&gh); +out_qs: + gfs2_quota_unhold(ip); + return error; +} + +/** + * gfs2_glock_put_eventually + * @gl: The glock to put + * + * When under memory pressure, trigger a deferred glock put to make sure we + * won't call into DLM and deadlock. Otherwise, put the glock directly. + */ + +static void gfs2_glock_put_eventually(struct gfs2_glock *gl) +{ + if (current->flags & PF_MEMALLOC) + gfs2_glock_queue_put(gl); + else + gfs2_glock_put(gl); +} + +static bool gfs2_upgrade_iopen_glock(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder *gh = &ip->i_iopen_gh; + long timeout = 5 * HZ; + int error; + + gh->gh_flags |= GL_NOCACHE; + gfs2_glock_dq_wait(gh); + + /* + * If there are no other lock holders, we will immediately get + * exclusive access to the iopen glock here. + * + * Otherwise, the other nodes holding the lock will be notified about + * our locking request. If they do not have the inode open, they are + * expected to evict the cached inode and release the lock, allowing us + * to proceed. + * + * Otherwise, if they cannot evict the inode, they are expected to poke + * the inode glock (note: not the iopen glock). We will notice that + * and stop waiting for the iopen glock immediately. The other node(s) + * are then expected to take care of deleting the inode when they no + * longer use it. 
+ * + * As a last resort, if another node keeps holding the iopen glock + * without showing any activity on the inode glock, we will eventually + * time out and fail the iopen glock upgrade. + * + * Note that we're passing the LM_FLAG_TRY_1CB flag to the first + * locking request as an optimization to notify lock holders as soon as + * possible. Without that flag, they'd be notified implicitly by the + * second locking request. + */ + + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, gh); + error = gfs2_glock_nq(gh); + if (error != GLR_TRYFAILED) + return !error; + + gfs2_holder_reinit(LM_ST_EXCLUSIVE, GL_ASYNC | GL_NOCACHE, gh); + error = gfs2_glock_nq(gh); + if (error) + return false; + + timeout = wait_event_interruptible_timeout(sdp->sd_async_glock_wait, + !test_bit(HIF_WAIT, &gh->gh_iflags) || + test_bit(GLF_DEMOTE, &ip->i_gl->gl_flags), + timeout); + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) { + gfs2_glock_dq(gh); + return false; + } + return gfs2_glock_holder_ready(gh) == 0; +} + +/** + * evict_should_delete - determine whether the inode is eligible for deletion + * @inode: The inode to evict + * @gh: The glock holder structure + * + * This function determines whether the evicted inode is eligible to be deleted + * and locks the inode glock. + * + * Returns: the fate of the dinode + */ +static enum dinode_demise evict_should_delete(struct inode *inode, + struct gfs2_holder *gh) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + int ret; + + if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) + goto should_delete; + + if (test_bit(GIF_DEFERRED_DELETE, &ip->i_flags)) + return SHOULD_DEFER_EVICTION; + + /* Deletes should never happen under memory pressure anymore. */ + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + return SHOULD_DEFER_EVICTION; + + /* Must not read inode block until block type has been verified */ + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh); + if (unlikely(ret)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + return SHOULD_DEFER_EVICTION; + } + + if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) + return SHOULD_NOT_DELETE_DINODE; + ret = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (ret) + return SHOULD_NOT_DELETE_DINODE; + + ret = gfs2_instantiate(gh); + if (ret) + return SHOULD_NOT_DELETE_DINODE; + + /* + * The inode may have been recreated in the meantime. 
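+ * If so, the dinode re-read under the exclusive glock above will
+ * show a positive link count, and we must leave it alone.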
+ */ + if (inode->i_nlink) + return SHOULD_NOT_DELETE_DINODE; + +should_delete: + if (gfs2_holder_initialized(&ip->i_iopen_gh) && + test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + if (!gfs2_upgrade_iopen_glock(inode)) { + gfs2_holder_uninit(&ip->i_iopen_gh); + return SHOULD_NOT_DELETE_DINODE; + } + } + return SHOULD_DELETE_DINODE; +} + +/** + * evict_unlinked_inode - delete the pieces of an unlinked evicted inode + * @inode: The inode to evict + */ +static int evict_unlinked_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + int ret; + + if (S_ISDIR(inode->i_mode) && + (ip->i_diskflags & GFS2_DIF_EXHASH)) { + ret = gfs2_dir_exhash_dealloc(ip); + if (ret) + goto out; + } + + if (ip->i_eattr) { + ret = gfs2_ea_dealloc(ip); + if (ret) + goto out; + } + + if (!gfs2_is_stuffed(ip)) { + ret = gfs2_file_dealloc(ip); + if (ret) + goto out; + } + + /* + * As soon as we clear the bitmap for the dinode, gfs2_create_inode() + * can get called to recreate it, or even gfs2_inode_lookup() if the + * inode was recreated on another node in the meantime. + * + * However, inserting the new inode into the inode hash table will not + * succeed until the old inode is removed, and that only happens after + * ->evict_inode() returns. The new inode is attached to its inode and + * iopen glocks after inserting it into the inode hash table, so at + * that point we can be sure that both glocks are unused. + */ + + ret = gfs2_dinode_dealloc(ip); + if (!ret && ip->i_gl) + gfs2_inode_remember_delete(ip->i_gl, ip->i_no_formal_ino); + +out: + return ret; +} + +/* + * evict_linked_inode - evict an inode whose dinode has not been unlinked + * @inode: The inode to evict + */ +static int evict_linked_inode(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct address_space *metamapping; + int ret; + + gfs2_log_flush(sdp, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_EVICT_INODE); + metamapping = gfs2_glock2aspace(ip->i_gl); + if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { + filemap_fdatawrite(metamapping); + filemap_fdatawait(metamapping); + } + write_inode_now(inode, 1); + gfs2_ail_flush(ip->i_gl, 0); + + ret = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (ret) + return ret; + + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(metamapping, 0); + gfs2_trans_end(sdp); + return 0; +} + +/** + * gfs2_evict_inode - Remove an inode from cache + * @inode: The inode to evict + * + * There are three cases to consider: + * 1. i_nlink == 0, we are final opener (and must deallocate) + * 2. i_nlink == 0, we are not the final opener (and cannot deallocate) + * 3. i_nlink > 0 + * + * If the fs is read only, then we have to treat all cases as per #3 + * since we are unable to do any deallocation. The inode will be + * deallocated by the next read/write node to attempt an allocation + * in the same resource group + * + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. 
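+ *
+ * evict_should_delete() below distinguishes the cases above: case 1
+ * proceeds via evict_unlinked_inode(), cases 2 and 3 via
+ * evict_linked_inode().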
+ */
+
+static void gfs2_evict_inode(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+ struct gfs2_inode *ip = GFS2_I(inode);
+ struct gfs2_holder gh;
+ int ret;
+
+ if (inode->i_nlink || sb_rdonly(sb) || !ip->i_no_addr)
+ goto out;
+
+ /*
+ * In case of an incomplete mount, gfs2_evict_inode() may be called for
+ * system files without having an active journal to write to. In that
+ * case, skip the filesystem evict.
+ */
+ if (!sdp->sd_jdesc)
+ goto out;
+
+ gfs2_holder_mark_uninitialized(&gh);
+ ret = evict_should_delete(inode, &gh);
+ if (ret == SHOULD_DEFER_EVICTION)
+ goto out;
+ if (ret == SHOULD_DELETE_DINODE)
+ ret = evict_unlinked_inode(inode);
+ else
+ ret = evict_linked_inode(inode);
+
+ if (gfs2_rs_active(&ip->i_res))
+ gfs2_rs_deltree(&ip->i_res);
+
+ if (gfs2_holder_initialized(&gh))
+ gfs2_glock_dq_uninit(&gh);
+ if (ret && ret != GLR_TRYFAILED && ret != -EROFS)
+ fs_warn(sdp, "gfs2_evict_inode: %d\n", ret);
+out:
+ truncate_inode_pages_final(&inode->i_data);
+ if (ip->i_qadata)
+ gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0);
+ gfs2_rs_deltree(&ip->i_res);
+ gfs2_ordered_del_inode(ip);
+ clear_inode(inode);
+ gfs2_dir_hash_inval(ip);
+ if (gfs2_holder_initialized(&ip->i_iopen_gh)) {
+ struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+
+ glock_clear_object(gl, ip);
+ gfs2_glock_hold(gl);
+ ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
+ gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+ gfs2_glock_put_eventually(gl);
+ }
+ if (ip->i_gl) {
+ glock_clear_object(ip->i_gl, ip);
+ wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE);
+ gfs2_glock_add_to_lru(ip->i_gl);
+ gfs2_glock_put_eventually(ip->i_gl);
+ rcu_assign_pointer(ip->i_gl, NULL);
+ }
+}
+
+static struct inode *gfs2_alloc_inode(struct super_block *sb)
+{
+ struct gfs2_inode *ip;
+
+ ip = alloc_inode_sb(sb, gfs2_inode_cachep, GFP_KERNEL);
+ if (!ip)
+ return NULL;
+ ip->i_no_addr = 0;
+ ip->i_flags = 0;
+ ip->i_gl = NULL;
+ gfs2_holder_mark_uninitialized(&ip->i_iopen_gh);
+ memset(&ip->i_res, 0, sizeof(ip->i_res));
+ RB_CLEAR_NODE(&ip->i_res.rs_node);
+ ip->i_rahead = 0;
+ return &ip->i_inode;
+}
+
+static void gfs2_free_inode(struct inode *inode)
+{
+ kmem_cache_free(gfs2_inode_cachep, GFS2_I(inode));
+}
+
+void free_local_statfs_inodes(struct gfs2_sbd *sdp)
+{
+ struct local_statfs_inode *lsi, *safe;
+
+ /* Run through the statfs inodes list to iput and free memory */
+ list_for_each_entry_safe(lsi, safe, &sdp->sd_sc_inodes_list, si_list) {
+ if (lsi->si_jid == sdp->sd_jdesc->jd_jid)
+ sdp->sd_sc_inode = NULL; /* belongs to this node */
+ if (lsi->si_sc_inode)
+ iput(lsi->si_sc_inode);
+ list_del(&lsi->si_list);
+ kfree(lsi);
+ }
+}
+
+struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp,
+ unsigned int index)
+{
+ struct local_statfs_inode *lsi;
+
+ /* Return the local (per node) statfs inode in the
+ * sdp->sd_sc_inodes_list corresponding to the 'index'. 
*/ + list_for_each_entry(lsi, &sdp->sd_sc_inodes_list, si_list) { + if (lsi->si_jid == index) + return lsi->si_sc_inode; + } + return NULL; +} + +const struct super_operations gfs2_super_ops = { + .alloc_inode = gfs2_alloc_inode, + .free_inode = gfs2_free_inode, + .write_inode = gfs2_write_inode, + .dirty_inode = gfs2_dirty_inode, + .evict_inode = gfs2_evict_inode, + .put_super = gfs2_put_super, + .sync_fs = gfs2_sync_fs, + .freeze_super = gfs2_freeze_super, + .thaw_super = gfs2_thaw_super, + .statfs = gfs2_statfs, + .drop_inode = gfs2_drop_inode, + .show_options = gfs2_show_options, +}; + diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h new file mode 100644 index 0000000000..ab9c831069 --- /dev/null +++ b/fs/gfs2/super.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +#ifndef __SUPER_DOT_H__ +#define __SUPER_DOT_H__ + +#include <linux/fs.h> +#include <linux/dcache.h> +#include "incore.h" + +/* Supported fs format version range */ +#define GFS2_FS_FORMAT_MIN (1801) +#define GFS2_FS_FORMAT_MAX (1802) + +extern void gfs2_lm_unmount(struct gfs2_sbd *sdp); + +static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) +{ + unsigned int x; + spin_lock(&sdp->sd_jindex_spin); + x = sdp->sd_journals; + spin_unlock(&sdp->sd_jindex_spin); + return x; +} + +extern void gfs2_jindex_free(struct gfs2_sbd *sdp); + +extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); +extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); +extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, + struct gfs2_inode **ipp); + +extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +extern void gfs2_make_fs_ro(struct gfs2_sbd *sdp); +extern void gfs2_online_uevent(struct gfs2_sbd *sdp); +extern void gfs2_destroy_threads(struct gfs2_sbd *sdp); +extern int gfs2_statfs_init(struct gfs2_sbd *sdp); +extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, + s64 dinodes); +extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, + const void *buf); +extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, + void *buf); +extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh); +extern int gfs2_statfs_sync(struct super_block *sb, int type); +extern void gfs2_freeze_func(struct work_struct *work); +extern void gfs2_thaw_freeze_initiator(struct super_block *sb); + +extern void free_local_statfs_inodes(struct gfs2_sbd *sdp); +extern struct inode *find_local_statfs_inode(struct gfs2_sbd *sdp, + unsigned int index); +extern void free_sbd(struct gfs2_sbd *sdp); + +extern struct file_system_type gfs2_fs_type; +extern struct file_system_type gfs2meta_fs_type; +extern const struct export_operations gfs2_export_ops; +extern const struct super_operations gfs2_super_ops; +extern const struct dentry_operations gfs2_dops; + +extern const struct xattr_handler *gfs2_xattr_handlers_max[]; +extern const struct xattr_handler **gfs2_xattr_handlers_min; + +#endif /* __SUPER_DOT_H__ */ + diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c new file mode 100644 index 0000000000..60a0206890 --- /dev/null +++ b/fs/gfs2/sys.c @@ -0,0 +1,810 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/sched.h> +#include <linux/cred.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/kobject.h> +#include <linux/uaccess.h> +#include <linux/gfs2_ondisk.h> +#include <linux/blkdev.h> + +#include "gfs2.h" +#include "incore.h" +#include "sys.h" +#include "super.h" +#include "glock.h" +#include "quota.h" +#include "util.h" +#include "glops.h" +#include "recovery.h" + +struct gfs2_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->show ? a->show(sdp, buf) : 0; +} + +static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->store ? a->store(sdp, buf, len) : len; +} + +static const struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, +}; + + +static struct kset *gfs2_kset; + +static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u:%u\n", + MAJOR(sdp->sd_vfs->s_dev), MINOR(sdp->sd_vfs->s_dev)); +} + +static ssize_t status_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned long f = sdp->sd_flags; + ssize_t s; + + s = snprintf(buf, PAGE_SIZE, + "Journal Checked: %d\n" + "Journal Live: %d\n" + "Journal ID: %d\n" + "Spectator: %d\n" + "Withdrawn: %d\n" + "No barriers: %d\n" + "No recovery: %d\n" + "Demote: %d\n" + "No Journal ID: %d\n" + "Mounted RO: %d\n" + "RO Recovery: %d\n" + "Skip DLM Unlock: %d\n" + "Force AIL Flush: %d\n" + "FS Freeze Initiator: %d\n" + "FS Frozen: %d\n" + "Withdrawing: %d\n" + "Withdraw In Prog: %d\n" + "Remote Withdraw: %d\n" + "Withdraw Recovery: %d\n" + "Deactivating: %d\n" + "sd_log_error: %d\n" + "sd_log_flush_lock: %d\n" + "sd_log_num_revoke: %u\n" + "sd_log_in_flight: %d\n" + "sd_log_blks_needed: %d\n" + "sd_log_blks_free: %d\n" + "sd_log_flush_head: %d\n" + "sd_log_flush_tail: %d\n" + "sd_log_blks_reserved: %d\n" + "sd_log_revokes_available: %d\n" + "sd_log_pinned: %d\n" + "sd_log_thresh1: %d\n" + "sd_log_thresh2: %d\n", + test_bit(SDF_JOURNAL_CHECKED, &f), + test_bit(SDF_JOURNAL_LIVE, &f), + (sdp->sd_jdesc ? sdp->sd_jdesc->jd_jid : 0), + (sdp->sd_args.ar_spectator ? 1 : 0), + test_bit(SDF_WITHDRAWN, &f), + test_bit(SDF_NOBARRIERS, &f), + test_bit(SDF_NORECOVERY, &f), + test_bit(SDF_DEMOTE, &f), + test_bit(SDF_NOJOURNALID, &f), + (sb_rdonly(sdp->sd_vfs) ? 
1 : 0), + test_bit(SDF_RORECOVERY, &f), + test_bit(SDF_SKIP_DLM_UNLOCK, &f), + test_bit(SDF_FORCE_AIL_FLUSH, &f), + test_bit(SDF_FREEZE_INITIATOR, &f), + test_bit(SDF_FROZEN, &f), + test_bit(SDF_WITHDRAWING, &f), + test_bit(SDF_WITHDRAW_IN_PROG, &f), + test_bit(SDF_REMOTE_WITHDRAW, &f), + test_bit(SDF_WITHDRAW_RECOVERY, &f), + test_bit(SDF_KILL, &f), + sdp->sd_log_error, + rwsem_is_locked(&sdp->sd_log_flush_lock), + sdp->sd_log_num_revoke, + atomic_read(&sdp->sd_log_in_flight), + atomic_read(&sdp->sd_log_blks_needed), + atomic_read(&sdp->sd_log_blks_free), + sdp->sd_log_flush_head, + sdp->sd_log_flush_tail, + sdp->sd_log_blks_reserved, + atomic_read(&sdp->sd_log_revokes_available), + atomic_read(&sdp->sd_log_pinned), + atomic_read(&sdp->sd_log_thresh1), + atomic_read(&sdp->sd_log_thresh2)); + return s; +} + +static ssize_t fsname_show(struct gfs2_sbd *sdp, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%s\n", sdp->sd_fsname); +} + +static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) +{ + struct super_block *s = sdp->sd_vfs; + + buf[0] = '\0'; + if (uuid_is_null(&s->s_uuid)) + return 0; + return snprintf(buf, PAGE_SIZE, "%pUB\n", &s->s_uuid); +} + +static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) +{ + struct super_block *sb = sdp->sd_vfs; + int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; + + return snprintf(buf, PAGE_SIZE, "%d\n", frozen); +} + +static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + int error, n; + + error = kstrtoint(buf, 0, &n); + if (error) + return error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (n) { + case 0: + error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE); + break; + case 1: + error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE); + break; + default: + return -EINVAL; + } + + if (error) { + fs_warn(sdp, "freeze %d error %d\n", n, error); + return error; + } + + return len; +} + +static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) +{ + unsigned int b = gfs2_withdrawn(sdp); + return snprintf(buf, PAGE_SIZE, "%u\n", b); +} + +static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + int error, val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = kstrtoint(buf, 0, &val); + if (error) + return error; + + if (val != 1) + return -EINVAL; + + gfs2_lm(sdp, "withdrawing from cluster at user's request\n"); + gfs2_withdraw(sdp); + + return len; +} + +static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + int error, val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = kstrtoint(buf, 0, &val); + if (error) + return error; + + if (val != 1) + return -EINVAL; + + gfs2_statfs_sync(sdp->sd_vfs, 0); + return len; +} + +static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + int error, val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = kstrtoint(buf, 0, &val); + if (error) + return error; + + if (val != 1) + return -EINVAL; + + gfs2_quota_sync(sdp->sd_vfs, 0); + return len; +} + +static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + struct kqid qid; + int error; + u32 id; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = kstrtou32(buf, 0, &id); + if (error) + return error; + + qid = make_kqid(current_user_ns(), USRQUOTA, id); + if (!qid_valid(qid)) + return -EINVAL; + + error = gfs2_quota_refresh(sdp, qid); + return error ? 
error : len; +} + +static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, + size_t len) +{ + struct kqid qid; + int error; + u32 id; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + error = kstrtou32(buf, 0, &id); + if (error) + return error; + + qid = make_kqid(current_user_ns(), GRPQUOTA, id); + if (!qid_valid(qid)) + return -EINVAL; + + error = gfs2_quota_refresh(sdp, qid); + return error ? error : len; +} + +static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct gfs2_glock *gl; + const struct gfs2_glock_operations *glops; + unsigned int glmode; + unsigned int gltype; + unsigned long long glnum; + char mode[16]; + int rv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + rv = sscanf(buf, "%u:%llu %15s", &gltype, &glnum, + mode); + if (rv != 3) + return -EINVAL; + + if (strcmp(mode, "EX") == 0) + glmode = LM_ST_UNLOCKED; + else if ((strcmp(mode, "CW") == 0) || (strcmp(mode, "DF") == 0)) + glmode = LM_ST_DEFERRED; + else if ((strcmp(mode, "PR") == 0) || (strcmp(mode, "SH") == 0)) + glmode = LM_ST_SHARED; + else + return -EINVAL; + + if (gltype > LM_TYPE_JOURNAL) + return -EINVAL; + if (gltype == LM_TYPE_NONDISK && glnum == GFS2_FREEZE_LOCK) + glops = &gfs2_freeze_glops; + else + glops = gfs2_glops_list[gltype]; + if (glops == NULL) + return -EINVAL; + if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) + fs_info(sdp, "demote interface used\n"); + rv = gfs2_glock_get(sdp, glnum, glops, 0, &gl); + if (rv) + return rv; + gfs2_glock_cb(gl, glmode); + gfs2_glock_put(gl); + return len; +} + + +#define GFS2_ATTR(name, mode, show, store) \ +static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) + +GFS2_ATTR(id, 0444, id_show, NULL); +GFS2_ATTR(fsname, 0444, fsname_show, NULL); +GFS2_ATTR(uuid, 0444, uuid_show, NULL); +GFS2_ATTR(freeze, 0644, freeze_show, freeze_store); +GFS2_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GFS2_ATTR(statfs_sync, 0200, NULL, statfs_sync_store); +GFS2_ATTR(quota_sync, 0200, NULL, quota_sync_store); +GFS2_ATTR(quota_refresh_user, 0200, NULL, quota_refresh_user_store); +GFS2_ATTR(quota_refresh_group, 0200, NULL, quota_refresh_group_store); +GFS2_ATTR(demote_rq, 0200, NULL, demote_rq_store); +GFS2_ATTR(status, 0400, status_show, NULL); + +static struct attribute *gfs2_attrs[] = { + &gfs2_attr_id.attr, + &gfs2_attr_fsname.attr, + &gfs2_attr_uuid.attr, + &gfs2_attr_freeze.attr, + &gfs2_attr_withdraw.attr, + &gfs2_attr_statfs_sync.attr, + &gfs2_attr_quota_sync.attr, + &gfs2_attr_quota_refresh_user.attr, + &gfs2_attr_quota_refresh_group.attr, + &gfs2_attr_demote_rq.attr, + &gfs2_attr_status.attr, + NULL, +}; +ATTRIBUTE_GROUPS(gfs2); + +static void gfs2_sbd_release(struct kobject *kobj) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + + complete(&sdp->sd_kobj_unregister); +} + +static struct kobj_type gfs2_ktype = { + .release = gfs2_sbd_release, + .default_groups = gfs2_groups, + .sysfs_ops = &gfs2_attr_ops, +}; + + +/* + * lock_module. 
Originally from lock_dlm + */ + +static ssize_t proto_name_show(struct gfs2_sbd *sdp, char *buf) +{ + const struct lm_lockops *ops = sdp->sd_lockstruct.ls_ops; + return sprintf(buf, "%s\n", ops->lm_proto_name); +} + +static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + ssize_t ret; + int val = 0; + + if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags)) + val = 1; + ret = sprintf(buf, "%d\n", val); + return ret; +} + +static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int ret, val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val == 1) + set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + else if (val == 0) { + clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); + smp_mb__after_atomic(); + gfs2_glock_thaw(sdp); + } else { + return -EINVAL; + } + return len; +} + +static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) +{ + int val = completion_done(&sdp->sd_wdack) ? 1 : 0; + + return sprintf(buf, "%d\n", val); +} + +static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + int ret, val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if ((val == 1) && + !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) + complete(&sdp->sd_wdack); + else + return -EINVAL; + return len; +} + +static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_first); +} + +static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned first; + int rv; + + rv = sscanf(buf, "%u", &first); + if (rv != 1 || first > 1) + return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = -EINVAL; + if (sdp->sd_args.ar_spectator) + goto out; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + sdp->sd_lockstruct.ls_first = first; + rv = 0; +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? rv : len; +} + +static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags)); +} + +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid) +{ + struct gfs2_jdesc *jd; + int rv; + + /* Wait for our primary journal to be initialized */ + wait_for_completion(&sdp->sd_journal_ready); + + spin_lock(&sdp->sd_jindex_spin); + rv = -EBUSY; + /** + * If we're a spectator, we use journal0, but it's not really ours. + * So we need to wait for its recovery too. If we skip it we'd never + * queue work to the recovery workqueue, and so its completion would + * never clear the DFL_BLOCK_LOCKS flag, so all our locks would + * permanently stop working. 
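+ *
+ * (Editor's illustration: user space normally drives this by writing
+ * a jid to the lock_module/recover sysfs file, which lands in
+ * recover_store() below and ends up here.  For example, from C, with
+ * a made-up table name "mycluster:myfs":
+ *
+ *	FILE *f = fopen("/sys/fs/gfs2/mycluster:myfs/lock_module/recover", "w");
+ *	if (f) {
+ *		fprintf(f, "2\n");	(ask this node to replay jid 2)
+ *		fclose(f);
+ *	}
+ *
+ * Such a write fails with -ESHUTDOWN once SDF_NORECOVERY is set.)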
+ */ + if (!sdp->sd_jdesc) + goto out; + if (sdp->sd_jdesc->jd_jid == jid && !sdp->sd_args.ar_spectator) + goto out; + rv = -ENOENT; + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_jid != jid && !sdp->sd_args.ar_spectator) + continue; + rv = gfs2_recover_journal(jd, false); + break; + } +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv; +} + +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + unsigned jid; + int rv; + + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) { + rv = -ESHUTDOWN; + goto out; + } + + rv = gfs2_recover_set(sdp, jid); +out: + return rv ? rv : len; +} + +static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_done); +} + +static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) +{ + struct lm_lockstruct *ls = &sdp->sd_lockstruct; + return sprintf(buf, "%d\n", ls->ls_recover_jid_status); +} + +static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) +{ + return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid); +} + +static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) +{ + int jid; + int rv; + + rv = sscanf(buf, "%d", &jid); + if (rv != 1) + return -EINVAL; + rv = wait_for_completion_killable(&sdp->sd_locking_init); + if (rv) + return rv; + spin_lock(&sdp->sd_jindex_spin); + rv = -EINVAL; + if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) + goto out; + rv = -EBUSY; + if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) + goto out; + rv = 0; + if (sdp->sd_args.ar_spectator && jid > 0) + rv = jid = -EINVAL; + sdp->sd_lockstruct.ls_jid = jid; + clear_bit(SDF_NOJOURNALID, &sdp->sd_flags); + smp_mb__after_atomic(); + wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); +out: + spin_unlock(&sdp->sd_jindex_spin); + return rv ? 
rv : len;
+}
+
+#define GDLM_ATTR(_name,_mode,_show,_store) \
+static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+
+GDLM_ATTR(proto_name, 0444, proto_name_show, NULL);
+GDLM_ATTR(block, 0644, block_show, block_store);
+GDLM_ATTR(withdraw, 0644, wdack_show, wdack_store);
+GDLM_ATTR(jid, 0644, jid_show, jid_store);
+GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store);
+GDLM_ATTR(first_done, 0444, first_done_show, NULL);
+GDLM_ATTR(recover, 0600, NULL, recover_store);
+GDLM_ATTR(recover_done, 0444, recover_done_show, NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+
+static struct attribute *lock_module_attrs[] = {
+ &gdlm_attr_proto_name.attr,
+ &gdlm_attr_block.attr,
+ &gdlm_attr_withdraw.attr,
+ &gdlm_attr_jid.attr,
+ &gdlm_attr_first.attr,
+ &gdlm_attr_first_done.attr,
+ &gdlm_attr_recover.attr,
+ &gdlm_attr_recover_done.attr,
+ &gdlm_attr_recover_status.attr,
+ NULL,
+};
+
+/*
+ * get and set struct gfs2_tune fields
+ */
+
+static ssize_t quota_scale_show(struct gfs2_sbd *sdp, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%u %u\n",
+ sdp->sd_tune.gt_quota_scale_num,
+ sdp->sd_tune.gt_quota_scale_den);
+}
+
+static ssize_t quota_scale_store(struct gfs2_sbd *sdp, const char *buf,
+ size_t len)
+{
+ struct gfs2_tune *gt = &sdp->sd_tune;
+ unsigned int x, y;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (sscanf(buf, "%u %u", &x, &y) != 2 || !y)
+ return -EINVAL;
+
+ spin_lock(&gt->gt_spin);
+ gt->gt_quota_scale_num = x;
+ gt->gt_quota_scale_den = y;
+ spin_unlock(&gt->gt_spin);
+ return len;
+}
+
+static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
+ int check_zero, const char *buf, size_t len)
+{
+ struct gfs2_tune *gt = &sdp->sd_tune;
+ unsigned int x;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ error = kstrtouint(buf, 0, &x);
+ if (error)
+ return error;
+
+ if (check_zero && !x)
+ return -EINVAL;
+
+ spin_lock(&gt->gt_spin);
+ *field = x;
+ spin_unlock(&gt->gt_spin);
+ return len;
+}
+
+#define TUNE_ATTR_3(name, show, store) \
+static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store)
+
+#define TUNE_ATTR_2(name, store) \
+static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
+{ \
+ return snprintf(buf, PAGE_SIZE, "%u\n", sdp->sd_tune.gt_##name); \
+} \
+TUNE_ATTR_3(name, name##_show, store)
+
+#define TUNE_ATTR(name, check_zero) \
+static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
+{ \
+ return tune_set(sdp, &sdp->sd_tune.gt_##name, check_zero, buf, len); \
+} \
+TUNE_ATTR_2(name, name##_store)
+
+TUNE_ATTR(quota_warn_period, 0);
+TUNE_ATTR(quota_quantum, 0);
+TUNE_ATTR(max_readahead, 0);
+TUNE_ATTR(complain_secs, 0);
+TUNE_ATTR(statfs_slow, 0);
+TUNE_ATTR(new_files_jdata, 0);
+TUNE_ATTR(statfs_quantum, 1);
+TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
+
+static struct attribute *tune_attrs[] = {
+ &tune_attr_quota_warn_period.attr,
+ &tune_attr_quota_quantum.attr,
+ &tune_attr_max_readahead.attr,
+ &tune_attr_complain_secs.attr,
+ &tune_attr_statfs_slow.attr,
+ &tune_attr_statfs_quantum.attr,
+ &tune_attr_quota_scale.attr,
+ &tune_attr_new_files_jdata.attr,
+ NULL,
+};
+
+static const struct attribute_group tune_group = {
+ .name = "tune",
+ .attrs = tune_attrs,
+};
+
+static const struct attribute_group lock_module_group = {
+ .name = "lock_module",
+ .attrs = lock_module_attrs,
+};
+
+int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
+{
+ struct super_block *sb = sdp->sd_vfs;
+ int error;
+ char ro[20];
+ char spectator[20];
+ 
char *envp[] = { ro, spectator, NULL }; + + sprintf(ro, "RDONLY=%d", sb_rdonly(sb)); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + + init_completion(&sdp->sd_kobj_unregister); + sdp->sd_kobj.kset = gfs2_kset; + error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, + "%s", sdp->sd_table_name); + if (error) + goto fail_reg; + + error = sysfs_create_group(&sdp->sd_kobj, &tune_group); + if (error) + goto fail_reg; + + error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group); + if (error) + goto fail_tune; + + error = sysfs_create_link(&sdp->sd_kobj, + &disk_to_dev(sb->s_bdev->bd_disk)->kobj, + "device"); + if (error) + goto fail_lock_module; + + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp); + return 0; + +fail_lock_module: + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); +fail_tune: + sysfs_remove_group(&sdp->sd_kobj, &tune_group); +fail_reg: + fs_err(sdp, "error %d adding sysfs files\n", error); + kobject_put(&sdp->sd_kobj); + wait_for_completion(&sdp->sd_kobj_unregister); + sb->s_fs_info = NULL; + return error; +} + +void gfs2_sys_fs_del(struct gfs2_sbd *sdp) +{ + sysfs_remove_link(&sdp->sd_kobj, "device"); + sysfs_remove_group(&sdp->sd_kobj, &tune_group); + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); + kobject_put(&sdp->sd_kobj); + wait_for_completion(&sdp->sd_kobj_unregister); +} + +static int gfs2_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + const struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + const struct super_block *s = sdp->sd_vfs; + + add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); + add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) + add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid); + if (!uuid_is_null(&s->s_uuid)) + add_uevent_var(env, "UUID=%pUB", &s->s_uuid); + return 0; +} + +static const struct kset_uevent_ops gfs2_uevent_ops = { + .uevent = gfs2_uevent, +}; + +int gfs2_sys_init(void) +{ + gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); + if (!gfs2_kset) + return -ENOMEM; + return 0; +} + +void gfs2_sys_uninit(void) +{ + kset_unregister(gfs2_kset); +} + diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h new file mode 100644 index 0000000000..f8dacf20e1 --- /dev/null +++ b/fs/gfs2/sys.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#ifndef __SYS_DOT_H__ +#define __SYS_DOT_H__ + +#include <linux/spinlock.h> +struct gfs2_sbd; + +int gfs2_sys_fs_add(struct gfs2_sbd *sdp); +void gfs2_sys_fs_del(struct gfs2_sbd *sdp); + +int gfs2_sys_init(void); +void gfs2_sys_uninit(void); + +int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid); + +#endif /* __SYS_DOT_H__ */ + diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h new file mode 100644 index 0000000000..a5deb9f868 --- /dev/null +++ b/fs/gfs2/trace_gfs2.h @@ -0,0 +1,642 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gfs2 + +#if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_GFS2_H + +#include <linux/tracepoint.h> + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/dlmconstants.h> +#include <linux/gfs2_ondisk.h> +#include <linux/writeback.h> +#include <linux/ktime.h> +#include <linux/iomap.h> +#include "incore.h" +#include "glock.h" +#include "rgrp.h" + +#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn } +#define glock_trace_name(x) __print_symbolic(x, \ + dlm_state_name(IV), \ + dlm_state_name(NL), \ + dlm_state_name(CR), \ + dlm_state_name(CW), \ + dlm_state_name(PR), \ + dlm_state_name(PW), \ + dlm_state_name(EX)) + +#define block_state_name(x) __print_symbolic(x, \ + { GFS2_BLKST_FREE, "free" }, \ + { GFS2_BLKST_USED, "used" }, \ + { GFS2_BLKST_DINODE, "dinode" }, \ + { GFS2_BLKST_UNLINKED, "unlinked" }) + +#define TRACE_RS_DELETE 0 +#define TRACE_RS_TREEDEL 1 +#define TRACE_RS_INSERT 2 +#define TRACE_RS_CLAIM 3 + +#define rs_func_name(x) __print_symbolic(x, \ + { 0, "del " }, \ + { 1, "tdel" }, \ + { 2, "ins " }, \ + { 3, "clm " }) + +#define show_glock_flags(flags) __print_flags(flags, "", \ + {(1UL << GLF_LOCK), "l" }, \ + {(1UL << GLF_DEMOTE), "D" }, \ + {(1UL << GLF_PENDING_DEMOTE), "d" }, \ + {(1UL << GLF_DEMOTE_IN_PROGRESS), "p" }, \ + {(1UL << GLF_DIRTY), "y" }, \ + {(1UL << GLF_LFLUSH), "f" }, \ + {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ + {(1UL << GLF_REPLY_PENDING), "r" }, \ + {(1UL << GLF_INITIAL), "I" }, \ + {(1UL << GLF_FROZEN), "F" }, \ + {(1UL << GLF_LRU), "L" }, \ + {(1UL << GLF_OBJECT), "o" }, \ + {(1UL << GLF_BLOCKING), "b" }) + +#ifndef NUMPTY +#define NUMPTY +static inline u8 glock_trace_state(unsigned int state) +{ + switch(state) { + case LM_ST_SHARED: + return DLM_LOCK_PR; + case LM_ST_DEFERRED: + return DLM_LOCK_CW; + case LM_ST_EXCLUSIVE: + return DLM_LOCK_EX; + } + return DLM_LOCK_NL; +} +#endif + +/* Section 1 - Locking + * + * Objectives: + * Latency: Remote demote request to state change + * Latency: Local lock request to state change + * Latency: State change to lock grant + * Correctness: Ordering of local lock state vs. 
I/O requests + * Correctness: Responses to remote demote requests + */ + +/* General glock state change (DLM lock request completes) */ +TRACE_EVENT(gfs2_glock_state_change, + + TP_PROTO(const struct gfs2_glock *gl, unsigned int new_state), + + TP_ARGS(gl, new_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, new_state ) + __field( u8, dmt_state ) + __field( u8, tgt_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->glnum = gl->gl_name.ln_number; + __entry->gltype = gl->gl_name.ln_type; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->new_state = glock_trace_state(new_state); + __entry->tgt_state = glock_trace_state(gl->gl_target); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + ), + + TP_printk("%u,%u glock %d:%lld state %s to %s tgt:%s dmt:%s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->new_state), + glock_trace_name(__entry->tgt_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags)) +); + +/* State change -> unlocked, glock is being deallocated */ +TRACE_EVENT(gfs2_glock_put, + + TP_PROTO(const struct gfs2_glock *gl), + + TP_ARGS(gl), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( unsigned long, flags ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + ), + + TP_printk("%u,%u glock %d:%lld state %s => %s flags:%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->gltype, (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(DLM_LOCK_IV), + show_glock_flags(__entry->flags)) + +); + +/* Callback (local or remote) requesting lock demotion */ +TRACE_EVENT(gfs2_demote_rq, + + TP_PROTO(const struct gfs2_glock *gl, bool remote), + + TP_ARGS(gl, remote), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, cur_state ) + __field( u8, dmt_state ) + __field( unsigned long, flags ) + __field( bool, remote ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->gltype = gl->gl_name.ln_type; + __entry->glnum = gl->gl_name.ln_number; + __entry->cur_state = glock_trace_state(gl->gl_state); + __entry->dmt_state = glock_trace_state(gl->gl_demote_state); + __entry->flags = gl->gl_flags | (gl->gl_object ? (1UL<<GLF_OBJECT) : 0); + __entry->remote = remote; + ), + + TP_printk("%u,%u glock %d:%lld demote %s to %s flags:%s %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->cur_state), + glock_trace_name(__entry->dmt_state), + show_glock_flags(__entry->flags), + __entry->remote ? 
"remote" : "local") + +); + +/* Promotion/grant of a glock */ +TRACE_EVENT(gfs2_promote, + + TP_PROTO(const struct gfs2_holder *gh), + + TP_ARGS(gh), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu promote %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + glock_trace_name(__entry->state)) +); + +/* Queue/dequeue a lock request */ +TRACE_EVENT(gfs2_glock_queue, + + TP_PROTO(const struct gfs2_holder *gh, int queue), + + TP_ARGS(gh, queue), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, queue ) + __field( u8, state ) + ), + + TP_fast_assign( + __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->glnum = gh->gh_gl->gl_name.ln_number; + __entry->gltype = gh->gh_gl->gl_name.ln_type; + __entry->queue = queue; + __entry->state = glock_trace_state(gh->gh_state); + ), + + TP_printk("%u,%u glock %u:%llu %squeue %s", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->queue ? "" : "de", + glock_trace_name(__entry->state)) +); + +/* DLM sends a reply to GFS2 */ +TRACE_EVENT(gfs2_glock_lock_time, + + TP_PROTO(const struct gfs2_glock *gl, s64 tdiff), + + TP_ARGS(gl, tdiff), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, status ) + __field( char, flags ) + __field( s64, tdiff ) + __field( u64, srtt ) + __field( u64, srttvar ) + __field( u64, srttb ) + __field( u64, srttvarb ) + __field( u64, sirt ) + __field( u64, sirtvar ) + __field( u64, dcount ) + __field( u64, qcount ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->glnum = gl->gl_name.ln_number; + __entry->gltype = gl->gl_name.ln_type; + __entry->status = gl->gl_lksb.sb_status; + __entry->flags = gl->gl_lksb.sb_flags; + __entry->tdiff = tdiff; + __entry->srtt = gl->gl_stats.stats[GFS2_LKS_SRTT]; + __entry->srttvar = gl->gl_stats.stats[GFS2_LKS_SRTTVAR]; + __entry->srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; + __entry->srttvarb = gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; + __entry->sirt = gl->gl_stats.stats[GFS2_LKS_SIRT]; + __entry->sirtvar = gl->gl_stats.stats[GFS2_LKS_SIRTVAR]; + __entry->dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; + __entry->qcount = gl->gl_stats.stats[GFS2_LKS_QCOUNT]; + ), + + TP_printk("%u,%u glock %d:%lld status:%d flags:%02x tdiff:%lld srtt:%lld/%lld srttb:%lld/%lld sirt:%lld/%lld dcnt:%lld qcnt:%lld", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->status, __entry->flags, + (long long)__entry->tdiff, + (long long)__entry->srtt, + (long long)__entry->srttvar, + (long long)__entry->srttb, + (long long)__entry->srttvarb, + (long long)__entry->sirt, + (long long)__entry->sirtvar, + (long long)__entry->dcount, + (long long)__entry->qcount) +); + +/* Section 2 - Log/journal + * + * Objectives: + * Latency: Log flush time + * Correctness: pin/unpin vs. 
disk I/O ordering + * Performance: Log usage stats + */ + +/* Pin/unpin a block in the log */ +TRACE_EVENT(gfs2_pin, + + TP_PROTO(const struct gfs2_bufdata *bd, int pin), + + TP_ARGS(bd, pin), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, pin ) + __field( u32, len ) + __field( sector_t, block ) + __field( u64, ino ) + ), + + TP_fast_assign( + __entry->dev = bd->bd_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->pin = pin; + __entry->len = bd->bd_bh->b_size; + __entry->block = bd->bd_bh->b_blocknr; + __entry->ino = bd->bd_gl->gl_name.ln_number; + ), + + TP_printk("%u,%u log %s %llu/%lu inode %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->pin ? "pin" : "unpin", + (unsigned long long)__entry->block, + (unsigned long)__entry->len, + (unsigned long long)__entry->ino) +); + +/* Flushing the log */ +TRACE_EVENT(gfs2_log_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, int start, u32 flags), + + TP_ARGS(sdp, start, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( u64, log_seq ) + __field( u32, flags ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->log_seq = sdp->sd_log_sequence; + __entry->flags = flags; + ), + + TP_printk("%u,%u log flush %s %llu %llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->start ? "start" : "end", + (unsigned long long)__entry->log_seq, + (unsigned long long)__entry->flags) +); + +/* Reserving/releasing blocks in the log */ +TRACE_EVENT(gfs2_log_blocks, + + TP_PROTO(const struct gfs2_sbd *sdp, int blocks), + + TP_ARGS(sdp, blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, blocks ) + __field( int, blks_free ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->blocks = blocks; + __entry->blks_free = atomic_read(&sdp->sd_log_blks_free); + ), + + TP_printk("%u,%u log reserve %d %d", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->blocks, __entry->blks_free) +); + +/* Writing back the AIL */ +TRACE_EVENT(gfs2_ail_flush, + + TP_PROTO(const struct gfs2_sbd *sdp, const struct writeback_control *wbc, int start), + + TP_ARGS(sdp, wbc, start), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, start ) + __field( int, sync_mode ) + __field( long, nr_to_write ) + ), + + TP_fast_assign( + __entry->dev = sdp->sd_vfs->s_dev; + __entry->start = start; + __entry->sync_mode = wbc->sync_mode; + __entry->nr_to_write = wbc->nr_to_write; + ), + + TP_printk("%u,%u ail flush %s %s %ld", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->start ? "start" : "end", + __entry->sync_mode == WB_SYNC_ALL ? "all" : "none", + __entry->nr_to_write) +); + +/* Section 3 - bmap + * + * Objectives: + * Latency: Bmap request time + * Performance: Block allocator tracing + * Correctness: Test of disard generation vs. blocks allocated + */ + +/* Map an extent of blocks, possibly a new allocation */ +TRACE_EVENT(gfs2_bmap, + + TP_PROTO(const struct gfs2_inode *ip, const struct buffer_head *bh, + sector_t lblock, int create, int errno), + + TP_ARGS(ip, bh, lblock, create, errno), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, lblock ) + __field( sector_t, pblock ) + __field( u64, inum ) + __field( unsigned long, state ) + __field( u32, len ) + __field( int, create ) + __field( int, errno ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->lblock = lblock; + __entry->pblock = buffer_mapped(bh) ? 
bh->b_blocknr : 0; + __entry->inum = ip->i_no_addr; + __entry->state = bh->b_state; + __entry->len = bh->b_size; + __entry->create = create; + __entry->errno = errno; + ), + + TP_printk("%u,%u bmap %llu map %llu/%lu to %llu flags:%08lx %s %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->lblock, + (unsigned long)__entry->len, + (unsigned long long)__entry->pblock, + __entry->state, __entry->create ? "create " : "nocreate", + __entry->errno) +); + +TRACE_EVENT(gfs2_iomap_start, + + TP_PROTO(const struct gfs2_inode *ip, loff_t pos, ssize_t length, + u16 flags), + + TP_ARGS(ip, pos, length, flags), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, inum ) + __field( loff_t, pos ) + __field( ssize_t, length ) + __field( u16, flags ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->inum = ip->i_no_addr; + __entry->pos = pos; + __entry->length = length; + __entry->flags = flags; + ), + + TP_printk("%u,%u bmap %llu iomap start %llu/%lu flags:%08x", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->pos, + (unsigned long)__entry->length, (u16)__entry->flags) +); + +TRACE_EVENT(gfs2_iomap_end, + + TP_PROTO(const struct gfs2_inode *ip, struct iomap *iomap, int ret), + + TP_ARGS(ip, iomap, ret), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, inum ) + __field( loff_t, offset ) + __field( ssize_t, length ) + __field( sector_t, pblock ) + __field( u16, flags ) + __field( u16, type ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->dev = ip->i_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->inum = ip->i_no_addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + __entry->pblock = iomap->addr == IOMAP_NULL_ADDR ? 
0 : + (iomap->addr >> ip->i_inode.i_blkbits); + __entry->flags = iomap->flags; + __entry->type = iomap->type; + __entry->ret = ret; + ), + + TP_printk("%u,%u bmap %llu iomap end %llu/%lu to %llu ty:%d flags:%08x rc:%d", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->offset, + (unsigned long)__entry->length, + (long long)__entry->pblock, + (u16)__entry->type, + (u16)__entry->flags, __entry->ret) +); + +/* Keep track of blocks as they are allocated/freed */ +TRACE_EVENT(gfs2_block_alloc, + + TP_PROTO(const struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, + u64 block, unsigned len, u8 block_state), + + TP_ARGS(ip, rgd, block, len, block_state), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, start ) + __field( u64, inum ) + __field( u32, len ) + __field( u8, block_state ) + __field( u64, rd_addr ) + __field( u32, rd_free_clone ) + __field( u32, rd_requested ) + __field( u32, rd_reserved ) + ), + + TP_fast_assign( + __entry->dev = rgd->rd_gl->gl_name.ln_sbd->sd_vfs->s_dev; + __entry->start = block; + __entry->inum = ip->i_no_addr; + __entry->len = len; + __entry->block_state = block_state; + __entry->rd_addr = rgd->rd_addr; + __entry->rd_free_clone = rgd->rd_free_clone; + __entry->rd_requested = rgd->rd_requested; + __entry->rd_reserved = rgd->rd_reserved; + ), + + TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rq:%u rr:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->start, + (unsigned long)__entry->len, + block_state_name(__entry->block_state), + (unsigned long long)__entry->rd_addr, + __entry->rd_free_clone, + __entry->rd_requested, + __entry->rd_reserved) +); + +/* Keep track of multi-block reservations as they are allocated/freed */ +TRACE_EVENT(gfs2_rs, + + TP_PROTO(const struct gfs2_blkreserv *rs, u8 func), + + TP_ARGS(rs, func), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, rd_addr ) + __field( u32, rd_free_clone ) + __field( u32, rd_requested ) + __field( u32, rd_reserved ) + __field( u64, inum ) + __field( u64, start ) + __field( u32, requested ) + __field( u32, reserved ) + __field( u8, func ) + ), + + TP_fast_assign( + __entry->dev = rs->rs_rgd->rd_sbd->sd_vfs->s_dev; + __entry->rd_addr = rs->rs_rgd->rd_addr; + __entry->rd_free_clone = rs->rs_rgd->rd_free_clone; + __entry->rd_requested = rs->rs_rgd->rd_requested; + __entry->rd_reserved = rs->rs_rgd->rd_reserved; + __entry->inum = container_of(rs, struct gfs2_inode, + i_res)->i_no_addr; + __entry->start = rs->rs_start; + __entry->requested = rs->rs_requested; + __entry->reserved = rs->rs_reserved; + __entry->func = func; + ), + + TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%u rq:%u rr:%u %s q:%u r:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->inum, + (unsigned long long)__entry->start, + (unsigned long long)__entry->rd_addr, + __entry->rd_free_clone, + __entry->rd_requested, + __entry->rd_reserved, + rs_func_name(__entry->func), + __entry->requested, + __entry->reserved) +); + +#endif /* _TRACE_GFS2_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_gfs2 +#include <trace/define_trace.h> + diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c new file mode 100644 index 0000000000..7e835be703 --- /dev/null +++ b/fs/gfs2/trans.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 
1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/kallsyms.h> +#include <linux/gfs2_ondisk.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "inode.h" +#include "log.h" +#include "lops.h" +#include "meta_io.h" +#include "trans.h" +#include "util.h" +#include "trace_gfs2.h" + +static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr) +{ + fs_warn(sdp, "Transaction created at: %pSR\n", (void *)tr->tr_ip); + fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n", + tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, + test_bit(TR_TOUCHED, &tr->tr_flags)); + fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u\n", + tr->tr_num_buf_new, tr->tr_num_buf_rm, + tr->tr_num_databuf_new, tr->tr_num_databuf_rm, + tr->tr_num_revoke); +} + +int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, + unsigned int blocks, unsigned int revokes, + unsigned long ip) +{ + unsigned int extra_revokes; + + if (current->journal_info) { + gfs2_print_trans(sdp, current->journal_info); + BUG(); + } + BUG_ON(blocks == 0 && revokes == 0); + + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + return -EROFS; + + tr->tr_ip = ip; + tr->tr_blocks = blocks; + tr->tr_revokes = revokes; + tr->tr_reserved = GFS2_LOG_FLUSH_MIN_BLOCKS; + if (blocks) { + /* + * The reserved blocks are either used for data or metadata. + * We can have mixed data and metadata, each with its own log + * descriptor block; see calc_reserved(). + */ + tr->tr_reserved += blocks + 1 + DIV_ROUND_UP(blocks - 1, databuf_limit(sdp)); + } + INIT_LIST_HEAD(&tr->tr_databuf); + INIT_LIST_HEAD(&tr->tr_buf); + INIT_LIST_HEAD(&tr->tr_list); + INIT_LIST_HEAD(&tr->tr_ail1_list); + INIT_LIST_HEAD(&tr->tr_ail2_list); + + if (gfs2_assert_warn(sdp, tr->tr_reserved <= sdp->sd_jdesc->jd_blocks)) + return -EINVAL; + + sb_start_intwrite(sdp->sd_vfs); + + /* + * Try the reservations under sd_log_flush_lock to prevent log flushes + * from creating inconsistencies between the number of allocated and + * reserved revokes. If that fails, do a full-block allocation outside + * of the lock to avoid stalling log flushes. Then, allot the + * appropriate number of blocks to revokes, use as many revokes locally + * as needed, and "release" the surplus into the revokes pool. 
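+ *
+ * Stripped of the revoke accounting, that dance has this shape
+ * (editor's sketch; try_reserve() and reserve_blocking() are
+ * stand-ins for gfs2_log_try_reserve() and gfs2_log_reserve()):
+ *
+ *	down_read(&sdp->sd_log_flush_lock);
+ *	if (!try_reserve(sdp, tr)) {
+ *		up_read(&sdp->sd_log_flush_lock);
+ *		reserve_blocking(sdp, tr);	(may sleep; lock dropped)
+ *		down_read(&sdp->sd_log_flush_lock);
+ *	}
+ *	... continue at the "reserved" label below ...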
+ */ + + down_read(&sdp->sd_log_flush_lock); + if (gfs2_log_try_reserve(sdp, tr, &extra_revokes)) + goto reserved; + up_read(&sdp->sd_log_flush_lock); + gfs2_log_reserve(sdp, tr, &extra_revokes); + down_read(&sdp->sd_log_flush_lock); + +reserved: + gfs2_log_release_revokes(sdp, extra_revokes); + if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { + gfs2_log_release_revokes(sdp, tr->tr_revokes); + up_read(&sdp->sd_log_flush_lock); + gfs2_log_release(sdp, tr->tr_reserved); + sb_end_intwrite(sdp->sd_vfs); + return -EROFS; + } + + current->journal_info = tr; + + return 0; +} + +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes) +{ + struct gfs2_trans *tr; + int error; + + tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS); + if (!tr) + return -ENOMEM; + error = __gfs2_trans_begin(tr, sdp, blocks, revokes, _RET_IP_); + if (error) + kmem_cache_free(gfs2_trans_cachep, tr); + return error; +} + +void gfs2_trans_end(struct gfs2_sbd *sdp) +{ + struct gfs2_trans *tr = current->journal_info; + s64 nbuf; + + current->journal_info = NULL; + + if (!test_bit(TR_TOUCHED, &tr->tr_flags)) { + gfs2_log_release_revokes(sdp, tr->tr_revokes); + up_read(&sdp->sd_log_flush_lock); + gfs2_log_release(sdp, tr->tr_reserved); + if (!test_bit(TR_ONSTACK, &tr->tr_flags)) + gfs2_trans_free(sdp, tr); + sb_end_intwrite(sdp->sd_vfs); + return; + } + + gfs2_log_release_revokes(sdp, tr->tr_revokes - tr->tr_num_revoke); + + nbuf = tr->tr_num_buf_new + tr->tr_num_databuf_new; + nbuf -= tr->tr_num_buf_rm; + nbuf -= tr->tr_num_databuf_rm; + + if (gfs2_assert_withdraw(sdp, nbuf <= tr->tr_blocks) || + gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) + gfs2_print_trans(sdp, tr); + + gfs2_log_commit(sdp, tr); + if (!test_bit(TR_ONSTACK, &tr->tr_flags) && + !test_bit(TR_ATTACHED, &tr->tr_flags)) + gfs2_trans_free(sdp, tr); + up_read(&sdp->sd_log_flush_lock); + + if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) + gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | + GFS2_LFC_TRANS_END); + sb_end_intwrite(sdp->sd_vfs); +} + +static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, + struct buffer_head *bh) +{ + struct gfs2_bufdata *bd; + + bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); + bd->bd_bh = bh; + bd->bd_gl = gl; + INIT_LIST_HEAD(&bd->bd_list); + INIT_LIST_HEAD(&bd->bd_ail_st_list); + INIT_LIST_HEAD(&bd->bd_ail_gl_list); + bh->b_private = bd; + return bd; +} + +/** + * gfs2_trans_add_data - Add a databuf to the transaction. + * @gl: The inode glock associated with the buffer + * @bh: The buffer to add + * + * This is used in journaled data mode. + * We need to journal the data block in the same way as metadata in + * the functions above. The difference is that here we have a tag + * which is two __be64's being the block number (as per meta data) + * and a flag which says whether the data block needs escaping or + * not. This means we need a new log entry for each 251 or so data + * blocks, which isn't an enormous overhead but twice as much as + * for normal metadata blocks. 
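+ *
+ * (Editor's note: the "251 or so" follows from the on-disk sizes.
+ * Assuming the common 4096-byte block and the 72-byte
+ * struct gfs2_log_descriptor header, one descriptor block holds
+ *
+ *	(4096 - 72) / (2 * sizeof(__be64)) = 4024 / 16 = 251
+ *
+ * 16-byte data tags, versus twice as many 8-byte metadata tags,
+ * hence the doubled overhead.)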
+ */ +void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) +{ + struct gfs2_trans *tr = current->journal_info; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_bufdata *bd; + + lock_buffer(bh); + if (buffer_pinned(bh)) { + set_bit(TR_TOUCHED, &tr->tr_flags); + goto out; + } + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd == NULL) { + gfs2_log_unlock(sdp); + unlock_buffer(bh); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh); + else + bd = bh->b_private; + lock_buffer(bh); + gfs2_log_lock(sdp); + } + gfs2_assert(sdp, bd->bd_gl == gl); + set_bit(TR_TOUCHED, &tr->tr_flags); + if (list_empty(&bd->bd_list)) { + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + gfs2_pin(sdp, bd->bd_bh); + tr->tr_num_databuf_new++; + list_add_tail(&bd->bd_list, &tr->tr_databuf); + } + gfs2_log_unlock(sdp); +out: + unlock_buffer(bh); +} + +void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) +{ + + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct super_block *sb = sdp->sd_vfs; + struct gfs2_bufdata *bd; + struct gfs2_meta_header *mh; + struct gfs2_trans *tr = current->journal_info; + bool withdraw = false; + + lock_buffer(bh); + if (buffer_pinned(bh)) { + set_bit(TR_TOUCHED, &tr->tr_flags); + goto out; + } + gfs2_log_lock(sdp); + bd = bh->b_private; + if (bd == NULL) { + gfs2_log_unlock(sdp); + unlock_buffer(bh); + lock_page(bh->b_page); + if (bh->b_private == NULL) + bd = gfs2_alloc_bufdata(gl, bh); + else + bd = bh->b_private; + unlock_page(bh->b_page); + lock_buffer(bh); + gfs2_log_lock(sdp); + } + gfs2_assert(sdp, bd->bd_gl == gl); + set_bit(TR_TOUCHED, &tr->tr_flags); + if (!list_empty(&bd->bd_list)) + goto out_unlock; + set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); + set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); + mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; + if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { + fs_err(sdp, "Attempting to add uninitialised block to " + "journal (inplace block=%lld)\n", + (unsigned long long)bd->bd_bh->b_blocknr); + BUG(); + } + if (unlikely(gfs2_withdrawn(sdp))) { + fs_info(sdp, "GFS2:adding buf while withdrawn! 
0x%llx\n", + (unsigned long long)bd->bd_bh->b_blocknr); + goto out_unlock; + } + if (unlikely(sb->s_writers.frozen == SB_FREEZE_COMPLETE)) { + fs_info(sdp, "GFS2:adding buf while frozen\n"); + withdraw = true; + goto out_unlock; + } + gfs2_pin(sdp, bd->bd_bh); + mh->__pad0 = cpu_to_be64(0); + mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + list_add(&bd->bd_list, &tr->tr_buf); + tr->tr_num_buf_new++; +out_unlock: + gfs2_log_unlock(sdp); + if (withdraw) + gfs2_assert_withdraw(sdp, 0); +out: + unlock_buffer(bh); +} + +void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) +{ + struct gfs2_trans *tr = current->journal_info; + + BUG_ON(!list_empty(&bd->bd_list)); + gfs2_add_revoke(sdp, bd); + set_bit(TR_TOUCHED, &tr->tr_flags); + tr->tr_num_revoke++; +} + +void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) +{ + struct gfs2_bufdata *bd, *tmp; + unsigned int n = len; + + gfs2_log_lock(sdp); + list_for_each_entry_safe(bd, tmp, &sdp->sd_log_revokes, bd_list) { + if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) { + list_del_init(&bd->bd_list); + gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); + sdp->sd_log_num_revoke--; + if (bd->bd_gl) + gfs2_glock_remove_revoke(bd->bd_gl); + kmem_cache_free(gfs2_bufdata_cachep, bd); + gfs2_log_release_revokes(sdp, 1); + if (--n == 0) + break; + } + } + gfs2_log_unlock(sdp); +} + +void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + if (tr == NULL) + return; + + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_databuf)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_buf)); + kmem_cache_free(gfs2_trans_cachep, tr); +} diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h new file mode 100644 index 0000000000..c76ad9a4c7 --- /dev/null +++ b/fs/gfs2/trans.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#ifndef __TRANS_DOT_H__ +#define __TRANS_DOT_H__ + +#include <linux/buffer_head.h> +struct gfs2_sbd; +struct gfs2_rgrpd; +struct gfs2_glock; + +#define RES_DINODE 1 +#define RES_INDIRECT 1 +#define RES_JDATA 1 +#define RES_DATA 1 +#define RES_LEAF 1 +#define RES_RG_HDR 1 +#define RES_RG_BIT 2 +#define RES_EATTR 1 +#define RES_STATFS 1 +#define RES_QUOTA 2 + +/* reserve either the number of blocks to be allocated plus the rg header + * block, or all of the blocks in the rg, whichever is smaller */ +static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested) +{ + struct gfs2_rgrpd *rgd = ip->i_res.rs_rgd; + + if (requested < rgd->rd_length) + return requested + 1; + return rgd->rd_length; +} + +extern int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, + unsigned int blocks, unsigned int revokes, + unsigned long ip); +extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes); + +extern void gfs2_trans_end(struct gfs2_sbd *sdp); +extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); +extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); +extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); +extern void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); +extern void gfs2_trans_free(struct gfs2_sbd *sdp, struct gfs2_trans *tr); + +#endif /* __TRANS_DOT_H__ */ diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c new file mode 100644 index 0000000000..da29fafb62 --- /dev/null +++ b/fs/gfs2/util.c @@ -0,0 +1,561 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. + * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/kthread.h> +#include <linux/crc32.h> +#include <linux/gfs2_ondisk.h> +#include <linux/delay.h> +#include <linux/uaccess.h> + +#include "gfs2.h" +#include "incore.h" +#include "glock.h" +#include "glops.h" +#include "log.h" +#include "lops.h" +#include "recovery.h" +#include "rgrp.h" +#include "super.h" +#include "util.h" + +struct kmem_cache *gfs2_glock_cachep __read_mostly; +struct kmem_cache *gfs2_glock_aspace_cachep __read_mostly; +struct kmem_cache *gfs2_inode_cachep __read_mostly; +struct kmem_cache *gfs2_bufdata_cachep __read_mostly; +struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; +struct kmem_cache *gfs2_quotad_cachep __read_mostly; +struct kmem_cache *gfs2_qadata_cachep __read_mostly; +struct kmem_cache *gfs2_trans_cachep __read_mostly; +mempool_t *gfs2_page_pool __read_mostly; + +void gfs2_assert_i(struct gfs2_sbd *sdp) +{ + fs_emerg(sdp, "fatal assertion failed\n"); +} + +/** + * check_journal_clean - Make sure a journal is clean for a spectator mount + * @sdp: The GFS2 superblock + * @jd: The journal descriptor + * @verbose: Show more prints in the log + * + * Returns: 0 if the journal is clean or locked, else an error + */ +int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + bool verbose) +{ + int error; + struct gfs2_holder j_gh; + struct gfs2_log_header_host head; + struct gfs2_inode *ip; + + ip = GFS2_I(jd->jd_inode); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_NOEXP | + GL_EXACT | GL_NOCACHE, &j_gh); + if (error) { + if (verbose) + fs_err(sdp, "Error %d locking journal for spectator " + "mount.\n", error); + return -EPERM; + } + error = gfs2_jdesc_check(jd); + if (error) { + if (verbose) + fs_err(sdp, "Error checking journal for spectator " + "mount.\n"); + goto out_unlock; + } + error = gfs2_find_jhead(jd, &head, false); + if (error) { + if (verbose) + fs_err(sdp, "Error parsing journal for spectator " + "mount.\n"); + goto out_unlock; + } + if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { + error = -EPERM; + if (verbose) + fs_err(sdp, "jid=%u: Journal is dirty, so the first " + "mounter must not be a spectator.\n", + jd->jd_jid); + } + +out_unlock: + gfs2_glock_dq_uninit(&j_gh); + return error; +} + +/** + * gfs2_freeze_lock_shared - hold the freeze glock + * @sdp: the superblock + */ +int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp) +{ + int error; + + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, + LM_FLAG_NOEXP | GL_EXACT, + &sdp->sd_freeze_gh); + if (error) + fs_err(sdp, "can't lock the freeze glock: %d\n", error); + return error; +} + +void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh) +{ + if (gfs2_holder_initialized(freeze_gh)) + gfs2_glock_dq_uninit(freeze_gh); +} + +static void signal_our_withdraw(struct gfs2_sbd *sdp) +{ + struct gfs2_glock *live_gl = sdp->sd_live_gh.gh_gl; + struct inode *inode; + struct gfs2_inode *ip; + struct gfs2_glock *i_gl; + u64 no_formal_ino; + int ret = 0; + int tries; + + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || !sdp->sd_jdesc) + return; + + gfs2_ail_drain(sdp); /* frees all transactions */ + inode = sdp->sd_jdesc->jd_inode; + ip = GFS2_I(inode); + i_gl = ip->i_gl; + no_formal_ino = ip->i_no_formal_ino; + + /* Prevent any glock dq until withdraw recovery is complete */ + set_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); + /* + * Don't tell dlm we're bailing until we have no more buffers in the + * wind. 
+	 * wind. If journal had an IO error, the log code should just purge
+	 * the outstanding buffers rather than submitting new IO. Making the
+	 * file system read-only will flush the journal, etc.
+	 *
+	 * During a normal unmount, gfs2_make_fs_ro calls gfs2_log_shutdown
+	 * which clears SDF_JOURNAL_LIVE. In a withdraw, we must not write
+	 * any UNMOUNT log header, so we can't call gfs2_log_shutdown, and
+	 * therefore we need to clear SDF_JOURNAL_LIVE manually.
+	 */
+	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+	if (!sb_rdonly(sdp->sd_vfs)) {
+		bool locked = mutex_trylock(&sdp->sd_freeze_mutex);
+
+		wake_up(&sdp->sd_logd_waitq);
+		wake_up(&sdp->sd_quota_wait);
+
+		wait_event_timeout(sdp->sd_log_waitq,
+				   gfs2_log_is_empty(sdp),
+				   HZ * 5);
+
+		sdp->sd_vfs->s_flags |= SB_RDONLY;
+
+		if (locked)
+			mutex_unlock(&sdp->sd_freeze_mutex);
+
+		/*
+		 * Dequeue any pending non-system glock holders that can no
+		 * longer be granted because the file system is withdrawn.
+		 */
+		gfs2_gl_dq_holders(sdp);
+	}
+
+	if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */
+		if (!ret)
+			ret = -EIO;
+		clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+		goto skip_recovery;
+	}
+	/*
+	 * Drop the glock for our journal so another node can recover it.
+	 */
+	if (gfs2_holder_initialized(&sdp->sd_journal_gh)) {
+		gfs2_glock_dq_wait(&sdp->sd_journal_gh);
+		gfs2_holder_uninit(&sdp->sd_journal_gh);
+	}
+	sdp->sd_jinode_gh.gh_flags |= GL_NOCACHE;
+	gfs2_glock_dq(&sdp->sd_jinode_gh);
+	gfs2_thaw_freeze_initiator(sdp->sd_vfs);
+	wait_on_bit(&i_gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE);
+
+	/*
+	 * holder_uninit to force glock_put, to force dlm to let go
+	 */
+	gfs2_holder_uninit(&sdp->sd_jinode_gh);
+
+	/*
+	 * Note: We need to be careful here:
+	 * Our iput of jd_inode will evict it. The evict will dequeue its
+	 * glock, but the glock dq will wait for the withdraw unless we have
+	 * exception code in glock_dq.
+	 */
+	iput(inode);
+	sdp->sd_jdesc->jd_inode = NULL;
+	/*
+	 * Wait until the journal inode's glock is freed. This allows try locks
+	 * on other nodes to be successful, otherwise we remain the owner of
+	 * the glock as far as dlm is concerned.
+	 */
+	if (i_gl->gl_ops->go_free) {
+		set_bit(GLF_FREEING, &i_gl->gl_flags);
+		wait_on_bit(&i_gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE);
+	}
+
+	/*
+	 * Dequeue the "live" glock, but keep a reference so it's never freed.
+	 */
+	gfs2_glock_hold(live_gl);
+	gfs2_glock_dq_wait(&sdp->sd_live_gh);
+	/*
+	 * We enqueue the "live" glock in EX so that all other nodes
+	 * get a demote request and act on it. We don't really want the
+	 * lock in EX, so we send a "try" lock with 1CB to produce a callback.
+	 */
+	fs_warn(sdp, "Requesting recovery of jid %d.\n",
+		sdp->sd_lockstruct.ls_jid);
+	gfs2_holder_reinit(LM_ST_EXCLUSIVE,
+			   LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | GL_NOPID,
+			   &sdp->sd_live_gh);
+	msleep(GL_GLOCK_MAX_HOLD);
+	/*
+	 * This will likely fail in a cluster, but succeed standalone:
+	 */
+	ret = gfs2_glock_nq(&sdp->sd_live_gh);
+
+	/*
+	 * If we actually got the "live" lock in EX mode, there are no other
+	 * nodes available to replay our journal. So we try to replay it
+	 * ourselves. We hold the "live" glock to prevent other mounters
+	 * during recovery, then just dequeue it and reacquire it in our
+	 * normal SH mode. Just in case the problem that caused us to
+	 * withdraw prevents us from recovering our journal (e.g. io errors
+	 * and such) we still check if the journal is clean before proceeding
+	 * but we may wait forever until another mounter does the recovery.
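+	 *
+	 * In short: if the "try" lock below succeeds, no other mounter
+	 * exists and we replay our own journal; either way, we then poll
+	 * check_journal_clean() below to wait for the journal to be
+	 * recovered.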
+	 */
+	if (ret == 0) {
+		fs_warn(sdp, "No other mounters found. Trying to recover our "
+			"own journal jid %d.\n", sdp->sd_lockstruct.ls_jid);
+		if (gfs2_recover_journal(sdp->sd_jdesc, 1))
+			fs_warn(sdp, "Unable to recover our journal jid %d.\n",
+				sdp->sd_lockstruct.ls_jid);
+		gfs2_glock_dq_wait(&sdp->sd_live_gh);
+		gfs2_holder_reinit(LM_ST_SHARED,
+				   LM_FLAG_NOEXP | GL_EXACT | GL_NOPID,
+				   &sdp->sd_live_gh);
+		gfs2_glock_nq(&sdp->sd_live_gh);
+	}
+
+	gfs2_glock_queue_put(live_gl); /* drop extra reference we acquired */
+	clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags);
+
+	/*
+	 * At this point our journal is evicted, so we need to get a new inode
+	 * for it. Once done, we need to call gfs2_find_jhead which
+	 * calls gfs2_map_journal_extents to map it for us again.
+	 *
+	 * Note that we don't really want it to look up a FREE block. The
+	 * GFS2_BLKST_FREE simply overrides a block check in gfs2_inode_lookup
+	 * which would otherwise fail because it requires grabbing an rgrp
+	 * glock, which would fail with -EIO because we're withdrawing.
+	 */
+	inode = gfs2_inode_lookup(sdp->sd_vfs, DT_UNKNOWN,
+				  sdp->sd_jdesc->jd_no_addr, no_formal_ino,
+				  GFS2_BLKST_FREE);
+	if (IS_ERR(inode)) {
+		fs_warn(sdp, "Reprocessing of jid %d failed with %ld.\n",
+			sdp->sd_lockstruct.ls_jid, PTR_ERR(inode));
+		goto skip_recovery;
+	}
+	sdp->sd_jdesc->jd_inode = inode;
+	d_mark_dontcache(inode);
+
+	/*
+	 * Now wait until recovery is complete.
+	 */
+	for (tries = 0; tries < 10; tries++) {
+		ret = check_journal_clean(sdp, sdp->sd_jdesc, false);
+		if (!ret)
+			break;
+		msleep(HZ);
+		fs_warn(sdp, "Waiting for journal recovery jid %d.\n",
+			sdp->sd_lockstruct.ls_jid);
+	}
+skip_recovery:
+	if (!ret)
+		fs_warn(sdp, "Journal recovery complete for jid %d.\n",
+			sdp->sd_lockstruct.ls_jid);
+	else
+		fs_warn(sdp, "Journal recovery skipped for jid %d until next "
+			"mount.\n", sdp->sd_lockstruct.ls_jid);
+	fs_warn(sdp, "Glock dequeues delayed: %lu\n", sdp->sd_glock_dqs_held);
+	sdp->sd_glock_dqs_held = 0;
+	wake_up_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY);
+}
+
+void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW &&
+	    test_bit(SDF_WITHDRAWN, &sdp->sd_flags))
+		return;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+	fs_err(sdp, "%pV", &vaf);
+	va_end(args);
+}
+
+int gfs2_withdraw(struct gfs2_sbd *sdp)
+{
+	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
+	const struct lm_lockops *lm = ls->ls_ops;
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) {
+		unsigned long old = READ_ONCE(sdp->sd_flags), new;
+
+		do {
+			if (old & BIT(SDF_WITHDRAWN)) {
+				wait_on_bit(&sdp->sd_flags,
+					    SDF_WITHDRAW_IN_PROG,
+					    TASK_UNINTERRUPTIBLE);
+				return -1;
+			}
+			new = old | BIT(SDF_WITHDRAWN) | BIT(SDF_WITHDRAW_IN_PROG);
+		} while (unlikely(!try_cmpxchg(&sdp->sd_flags, &old, new)));
+
+		fs_err(sdp, "about to withdraw this file system\n");
+		BUG_ON(sdp->sd_args.ar_debug);
+
+		signal_our_withdraw(sdp);
+
+		kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
+
+		if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
+			wait_for_completion(&sdp->sd_wdack);
+
+		if (lm->lm_unmount) {
+			fs_err(sdp, "telling LM to unmount\n");
+			lm->lm_unmount(sdp);
+		}
+		set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
+		fs_err(sdp, "File system withdrawn\n");
+		dump_stack();
+		clear_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags);
+		smp_mb__after_atomic();
+		wake_up_bit(&sdp->sd_flags, SDF_WITHDRAW_IN_PROG);
+	}
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		panic("GFS2: fsid=%s: panic requested\n", sdp->sd_fsname);
+
+	return -1;
+}
+
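The flag manipulation at the top of gfs2_withdraw() is a set-once pattern: exactly one caller wins the right to run the withdraw sequence, and every later caller just waits for SDF_WITHDRAW_IN_PROG to clear. A standalone sketch of the same idea with C11 atomics (the bit names and values are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdio.h>

#define WITHDRAWN	(1UL << 0)
#define IN_PROG		(1UL << 1)

static _Atomic unsigned long flags;

/* Returns 1 only for the single caller that must perform the withdraw;
 * everyone else sees WITHDRAWN already set and backs off. */
static int claim_withdraw(void)
{
	unsigned long old = atomic_load(&flags), new;

	do {
		if (old & WITHDRAWN)
			return 0;	/* lost the race; wait for IN_PROG */
		new = old | WITHDRAWN | IN_PROG;
	} while (!atomic_compare_exchange_weak(&flags, &old, new));
	return 1;
}

int main(void)
{
	printf("%d %d\n", claim_withdraw(), claim_withdraw());	/* 1 0 */
	return 0;
}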
+/*
+ * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
+ */
+
+void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+			    const char *function, char *file, unsigned int line,
+			    bool delayed)
+{
+	if (gfs2_withdrawn(sdp))
+		return;
+
+	fs_err(sdp,
+	       "fatal: assertion \"%s\" failed\n"
+	       "   function = %s, file = %s, line = %u\n",
+	       assertion, function, file, line);
+
+	/*
+	 * If errors=panic was specified on mount, it won't help to delay the
+	 * withdraw.
+	 */
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		delayed = false;
+
+	if (delayed)
+		gfs2_withdraw_delayed(sdp);
+	else
+		gfs2_withdraw(sdp);
+	dump_stack();
+}
+
+/*
+ * gfs2_assert_warn_i - Print a message to the console if @assertion is false
+ */
+
+void gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+			const char *function, char *file, unsigned int line)
+{
+	if (time_before(jiffies,
+			sdp->sd_last_warning +
+			gfs2_tune_get(sdp, gt_complain_secs) * HZ))
+		return;
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
+		fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n",
+			assertion, function, file, line);
+
+	if (sdp->sd_args.ar_debug)
+		BUG();
+	else
+		dump_stack();
+
+	if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
+		panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
+		      "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
+		      sdp->sd_fsname, assertion,
+		      sdp->sd_fsname, function, file, line);
+
+	sdp->sd_last_warning = jiffies;
+}
+
+/*
+ * gfs2_consist_i - Flag a filesystem consistency error and withdraw
+ */
+
+void gfs2_consist_i(struct gfs2_sbd *sdp, const char *function,
+		    char *file, unsigned int line)
+{
+	gfs2_lm(sdp,
+		"fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
+		function, file, line);
+	gfs2_withdraw(sdp);
+}
+
+/*
+ * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
+ */
+
+void gfs2_consist_inode_i(struct gfs2_inode *ip,
+			  const char *function, char *file, unsigned int line)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	gfs2_lm(sdp,
+		"fatal: filesystem consistency error\n"
+		"  inode = %llu %llu\n"
+		"  function = %s, file = %s, line = %u\n",
+		(unsigned long long)ip->i_no_formal_ino,
+		(unsigned long long)ip->i_no_addr,
+		function, file, line);
+	gfs2_dump_glock(NULL, ip->i_gl, 1);
+	gfs2_withdraw(sdp);
+}
+
+/*
+ * gfs2_consist_rgrpd_i - Flag a RG consistency error and withdraw
+ */
+
+void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
+			  const char *function, char *file, unsigned int line)
+{
+	struct gfs2_sbd *sdp = rgd->rd_sbd;
+	char fs_id_buf[sizeof(sdp->sd_fsname) + 7];
+
+	sprintf(fs_id_buf, "fsid=%s: ", sdp->sd_fsname);
+	gfs2_rgrp_dump(NULL, rgd, fs_id_buf);
+	gfs2_lm(sdp,
+		"fatal: filesystem consistency error\n"
+		"  RG = %llu\n"
+		"  function = %s, file = %s, line = %u\n",
+		(unsigned long long)rgd->rd_addr,
+		function, file, line);
+	gfs2_dump_glock(NULL, rgd->rd_gl, 1);
+	gfs2_withdraw(sdp);
+}
+
+/*
+ * gfs2_meta_check_ii - Flag a magic number consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+
+int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       const char *type, const char *function, char *file,
+		       unsigned int line)
+{
+	int me;
+
+	gfs2_lm(sdp,
+		"fatal: invalid metadata block\n"
+		"  bh = %llu (%s)\n"
+		"  function = %s, file = %s, line = %u\n",
+		(unsigned long long)bh->b_blocknr, type,
+		function, file, line);
+	me = gfs2_withdraw(sdp);
+	return (me) ? -1 : -2;
+}
+
+/*
+ * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          -2 if it was already withdrawn
+ */
+
+int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			   u16 type, u16 t, const char *function,
+			   char *file, unsigned int line)
+{
+	int me;
+
+	gfs2_lm(sdp,
+		"fatal: invalid metadata block\n"
+		"  bh = %llu (type: exp=%u, found=%u)\n"
+		"  function = %s, file = %s, line = %u\n",
+		(unsigned long long)bh->b_blocknr, type, t,
+		function, file, line);
+	me = gfs2_withdraw(sdp);
+	return (me) ? -1 : -2;
+}
+
+/*
+ * gfs2_io_error_i - Flag an I/O error and withdraw
+ * Returns: -1 if this call withdrew the machine,
+ *          0 if it was already withdrawn
+ */
+
+int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file,
+		    unsigned int line)
+{
+	gfs2_lm(sdp,
+		"fatal: I/O error\n"
+		"  function = %s, file = %s, line = %u\n",
+		function, file, line);
+	return gfs2_withdraw(sdp);
+}
+
+/*
+ * gfs2_io_error_bh_i - Flag a buffer I/O error
+ * @withdraw: withdraw the filesystem
+ */
+
+void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			const char *function, char *file, unsigned int line,
+			bool withdraw)
+{
+	if (gfs2_withdrawn(sdp))
+		return;
+
+	fs_err(sdp, "fatal: I/O error\n"
+	       "  block = %llu\n"
+	       "  function = %s, file = %s, line = %u\n",
+	       (unsigned long long)bh->b_blocknr, function, file, line);
+	if (withdraw)
+		gfs2_withdraw(sdp);
+}
+
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
new file mode 100644
index 0000000000..cdb8395291
--- /dev/null
+++ b/fs/gfs2/util.h
@@ -0,0 +1,232 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ */
+
+#ifndef __UTIL_DOT_H__
+#define __UTIL_DOT_H__
+
+#ifdef pr_fmt
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#endif
+
+#include <linux/mempool.h>
+
+#include "incore.h"
+
+#define fs_emerg(fs, fmt, ...) \
+	pr_emerg("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+#define fs_warn(fs, fmt, ...) \
+	pr_warn("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+#define fs_err(fs, fmt, ...) \
+	pr_err("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+#define fs_info(fs, fmt, ...) \
+	pr_info("fsid=%s: " fmt, (fs)->sd_fsname, ##__VA_ARGS__)
+
+void gfs2_assert_i(struct gfs2_sbd *sdp);
+
+#define gfs2_assert(sdp, assertion)					\
+do {									\
+	if (unlikely(!(assertion))) {					\
+		gfs2_assert_i(sdp);					\
+		BUG();							\
+	}								\
+} while (0)
+
+
+void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
+			    const char *function, char *file, unsigned int line,
+			    bool delayed);
+
+#define gfs2_assert_withdraw(sdp, assertion)				\
+	({								\
+		bool _bool = (assertion);				\
+		if (unlikely(!_bool))					\
+			gfs2_assert_withdraw_i((sdp), #assertion,	\
+				__func__, __FILE__, __LINE__, false);	\
+		!_bool;							\
+	})
+
+#define gfs2_assert_withdraw_delayed(sdp, assertion)			\
+	({								\
+		bool _bool = (assertion);				\
+		if (unlikely(!_bool))					\
+			gfs2_assert_withdraw_i((sdp), #assertion,	\
+				__func__, __FILE__, __LINE__, true);	\
+		!_bool;							\
+	})
+
+void gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
+			const char *function, char *file, unsigned int line);
+
+#define gfs2_assert_warn(sdp, assertion)				\
+	({								\
+		bool _bool = (assertion);				\
+		if (unlikely(!_bool))					\
+			gfs2_assert_warn_i((sdp), #assertion,		\
+				__func__, __FILE__, __LINE__);		\
+		!_bool;							\
+	})
+
+void gfs2_consist_i(struct gfs2_sbd *sdp,
+		    const char *function, char *file, unsigned int line);
+
+#define gfs2_consist(sdp) \
+gfs2_consist_i((sdp), __func__, __FILE__, __LINE__)
+
+
+void gfs2_consist_inode_i(struct gfs2_inode *ip,
+			  const char *function, char *file, unsigned int line);
+
+#define gfs2_consist_inode(ip) \
+gfs2_consist_inode_i((ip), __func__, __FILE__, __LINE__)
+
+
+void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd,
+			  const char *function, char *file, unsigned int line);
+
+#define gfs2_consist_rgrpd(rgd) \
+gfs2_consist_rgrpd_i((rgd), __func__, __FILE__, __LINE__)
+
+
+int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+		       const char *type, const char *function,
+		       char *file, unsigned int line);
+
+static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
+				  struct buffer_head *bh)
+{
+	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+	u32 magic = be32_to_cpu(mh->mh_magic);
+	if (unlikely(magic != GFS2_MAGIC)) {
+		fs_err(sdp, "Magic number missing at %llu\n",
+		       (unsigned long long)bh->b_blocknr);
+		return -EIO;
+	}
+	return 0;
+}
+
+int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			   u16 type, u16 t,
+			   const char *function,
+			   char *file, unsigned int line);
+
+static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp,
+					struct buffer_head *bh,
+					u16 type,
+					const char *function,
+					char *file, unsigned int line)
+{
+	struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
+	u32 magic = be32_to_cpu(mh->mh_magic);
+	u16 t = be32_to_cpu(mh->mh_type);
+	if (unlikely(magic != GFS2_MAGIC))
+		return gfs2_meta_check_ii(sdp, bh, "magic number", function,
+					  file, line);
+	if (unlikely(t != type))
+		return gfs2_metatype_check_ii(sdp, bh, type, t, function,
+					      file, line);
+	return 0;
+}
+
+#define gfs2_metatype_check(sdp, bh, type) \
+gfs2_metatype_check_i((sdp), (bh), (type), __func__, __FILE__, __LINE__)
+
+static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type,
+				     u16 format)
+{
+	struct gfs2_meta_header *mh;
+	mh = (struct gfs2_meta_header *)bh->b_data;
+	mh->mh_type = cpu_to_be32(type);
+	mh->mh_format = cpu_to_be32(format);
+}
+
+
+int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function,
+		    char *file, unsigned int line);
+
+extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
+			       bool verbose);
+extern int gfs2_freeze_lock_shared(struct gfs2_sbd *sdp);
+extern void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh);
+
+#define gfs2_io_error(sdp) \
+gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__)
+
+
+void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			const char *function, char *file, unsigned int line,
+			bool withdraw);
+
+#define gfs2_io_error_bh_wd(sdp, bh) \
+gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__, true)
+
+#define gfs2_io_error_bh(sdp, bh) \
+gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__, false)
+
+
+extern struct kmem_cache *gfs2_glock_cachep;
+extern struct kmem_cache *gfs2_glock_aspace_cachep;
+extern struct kmem_cache *gfs2_inode_cachep;
+extern struct kmem_cache *gfs2_bufdata_cachep;
+extern struct kmem_cache *gfs2_rgrpd_cachep;
+extern struct kmem_cache *gfs2_quotad_cachep;
+extern struct kmem_cache *gfs2_qadata_cachep;
+extern struct kmem_cache *gfs2_trans_cachep;
+extern mempool_t *gfs2_page_pool;
+extern struct workqueue_struct *gfs2_control_wq;
+
+static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
+					   unsigned int *p)
+{
+	unsigned int x;
+	spin_lock(&gt->gt_spin);
+	x = *p;
+	spin_unlock(&gt->gt_spin);
+	return x;
+}
+
+/**
+ * gfs2_withdraw_delayed - withdraw as soon as possible without deadlocks
+ * @sdp: the superblock
+ */
+static inline void gfs2_withdraw_delayed(struct gfs2_sbd *sdp)
+{
+	set_bit(SDF_WITHDRAWING, &sdp->sd_flags);
+}
+
+/**
+ * gfs2_withdrawn - test whether the file system is withdrawing or withdrawn
+ * @sdp: the superblock
+ */
+static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp)
+{
+	return test_bit(SDF_WITHDRAWN, &sdp->sd_flags) ||
+	       test_bit(SDF_WITHDRAWING, &sdp->sd_flags);
+}
+
+/**
+ * gfs2_withdrawing - check if a withdraw is pending
+ * @sdp: the superblock
+ */
+static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp)
+{
+	return test_bit(SDF_WITHDRAWING, &sdp->sd_flags) &&
+	       !test_bit(SDF_WITHDRAWN, &sdp->sd_flags);
+}
+
+static inline bool gfs2_withdraw_in_prog(struct gfs2_sbd *sdp)
+{
+	return test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags);
+}
+
+#define gfs2_tune_get(sdp, field) \
+gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
+
+__printf(2, 3)
+void gfs2_lm(struct gfs2_sbd *sdp, const char *fmt, ...);
+int gfs2_withdraw(struct gfs2_sbd *sdp);
+
+#endif /* __UTIL_DOT_H__ */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
new file mode 100644
index 0000000000..4fea70c0fe
--- /dev/null
+++ b/fs/gfs2/xattr.c
@@ -0,0 +1,1507 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ */
+
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/buffer_head.h>
+#include <linux/xattr.h>
+#include <linux/gfs2_ondisk.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/uaccess.h>
+
+#include "gfs2.h"
+#include "incore.h"
+#include "acl.h"
+#include "xattr.h"
+#include "glock.h"
+#include "inode.h"
+#include "meta_io.h"
+#include "quota.h"
+#include "rgrp.h"
+#include "super.h"
+#include "trans.h"
+#include "util.h"
+
+/*
+ * ea_calc_size - returns the actual number of bytes the request will take up
+ *                (not counting any unstuffed data blocks)
+ *
+ * Returns: 1 if the EA should be stuffed
+ */
+
+static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize,
+			unsigned int *size)
+{
+	unsigned int jbsize = sdp->sd_jbsize;
+
+	/* Stuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8);
+
+	if (*size <= jbsize)
+		return 1;
+
+	/* Unstuffed */
+	*size = ALIGN(sizeof(struct gfs2_ea_header) + nsize +
+		      (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8);
+
+	return 0;
+}
+
+static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize)
+{
+	unsigned int size;
+
+	if (dsize > GFS2_EA_MAX_DATA_LEN)
+		return -ERANGE;
+
+	ea_calc_size(sdp, nsize, dsize, &size);
+
+	/* This can only happen with 512 byte blocks */
+	if (size > sdp->sd_jbsize)
+		return -ERANGE;
+
+	return 0;
+}
+
+static bool gfs2_eatype_valid(struct gfs2_sbd *sdp, u8 type)
+{
+	switch(sdp->sd_sb.sb_fs_format) {
+	case GFS2_FS_FORMAT_MAX:
+		return true;
+
+	case GFS2_FS_FORMAT_MIN:
+		return type <= GFS2_EATYPE_SECURITY;
+
+	default:
+		return false;
+	}
+}
+
+typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh,
+			  struct gfs2_ea_header *ea,
+			  struct gfs2_ea_header *prev, void *private);
+
+static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh,
+			ea_call_t ea_call, void *data)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_ea_header *ea, *prev = NULL;
+	int error = 0;
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_EA))
+		return -EIO;
+
+	for (ea = GFS2_EA_BH2FIRST(bh);; prev = ea, ea = GFS2_EA2NEXT(ea)) {
+		if (!GFS2_EA_REC_LEN(ea))
+			goto fail;
+		if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <=
+						  bh->b_data + bh->b_size))
+			goto fail;
+		if (!gfs2_eatype_valid(sdp, ea->ea_type))
+			goto fail;
+		error = ea_call(ip, bh, ea, prev, data);
+		if (error)
+			return error;
+
+		if (GFS2_EA_IS_LAST(ea)) {
+			if ((char *)GFS2_EA2NEXT(ea) !=
+			    bh->b_data + bh->b_size)
+				goto fail;
+			break;
+		}
+	}
+
+	return error;
+
+fail:
+	gfs2_consist_inode(ip);
+	return -EIO;
+}
+
+static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
+{
+	struct buffer_head *bh, *eabh;
+	__be64 *eablk, *end;
+	int error;
+
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh);
+	if (error)
+		return error;
+
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) {
+		error = ea_foreach_i(ip, bh, ea_call, data);
+		goto out;
+	}
+
+	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_IN)) {
+		error = -EIO;
+		goto out;
+	}
+
+	eablk = (__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header));
+	end = eablk + GFS2_SB(&ip->i_inode)->sd_inptrs;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh);
+		if (error)
+			break;
+		error = ea_foreach_i(ip, eabh, ea_call, data);
+		brelse(eabh);
+		if (error)
+			break;
+	}
+out:
+	brelse(bh);
+	return error;
+}
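ea_foreach_i() above walks one EA block purely by hopping ea_rec_len offsets, validating each step against the buffer bounds and stopping at the record flagged GFS2_EAFLAG_LAST, whose next offset must land exactly on the end of the block. A standalone sketch of that walk over a fake block (the record layout is reduced to a length plus a last flag; this is not the on-disk format):

#include <stdio.h>
#include <stdint.h>

struct rec {			/* reduced stand-in for gfs2_ea_header */
	uint32_t rec_len;	/* distance to the next record */
	uint8_t last;		/* stand-in for GFS2_EAFLAG_LAST */
};

static int walk(char *buf, size_t size)
{
	struct rec *r = (struct rec *)buf;

	for (;;) {
		char *next = (char *)r + r->rec_len;

		if (!r->rec_len || next > buf + size)
			return -1;	/* corrupt chain */
		printf("record at offset %td\n", (char *)r - buf);
		if (r->last)		/* last record must end the block */
			return next == buf + size ? 0 : -1;
		r = (struct rec *)next;
	}
}

int main(void)
{
	static _Alignas(8) char buf[64];
	struct rec *r = (struct rec *)buf;

	r->rec_len = 32;		/* first record: 32 bytes */
	r = (struct rec *)(buf + 32);
	r->rec_len = 32;		/* second record fills the block */
	r->last = 1;
	return walk(buf, sizeof(buf));
}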
+
+struct ea_find {
+	int type;
+	const char *name;
+	size_t namel;
+	struct gfs2_ea_location *ef_el;
+};
+
+static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh,
+		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+		     void *private)
+{
+	struct ea_find *ef = private;
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED)
+		return 0;
+
+	if (ea->ea_type == ef->type) {
+		if (ea->ea_name_len == ef->namel &&
+		    !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) {
+			struct gfs2_ea_location *el = ef->ef_el;
+			get_bh(bh);
+			el->el_bh = bh;
+			el->el_ea = ea;
+			el->el_prev = prev;
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+static int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name,
+			struct gfs2_ea_location *el)
+{
+	struct ea_find ef;
+	int error;
+
+	ef.type = type;
+	ef.name = name;
+	ef.namel = strlen(name);
+	ef.ef_el = el;
+
+	memset(el, 0, sizeof(struct gfs2_ea_location));
+
+	error = ea_foreach(ip, ea_find_i, &ef);
+	if (error > 0)
+		return 0;
+
+	return error;
+}
+
+/*
+ * ea_dealloc_unstuffed
+ *
+ * Take advantage of the fact that all unstuffed blocks are
+ * allocated from the same RG. But watch, this may not always
+ * be true.
+ *
+ * Returns: errno
+ */
+
+static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+				struct gfs2_ea_header *ea,
+				struct gfs2_ea_header *prev, void *private)
+{
+	int *leave = private;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder rg_gh;
+	__be64 *dataptrs;
+	u64 bn = 0;
+	u64 bstart = 0;
+	unsigned int blen = 0;
+	unsigned int blks = 0;
+	unsigned int x;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	if (GFS2_EA_IS_STUFFED(ea))
+		return 0;
+
+	dataptrs = GFS2_EA2DATAPTRS(ea);
+	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+		if (*dataptrs) {
+			blks++;
+			bn = be64_to_cpu(*dataptrs);
+		}
+	}
+	if (!blks)
+		return 0;
+
+	rgd = gfs2_blk2rgrpd(sdp, bn, 1);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+				   LM_FLAG_NODE_SCOPE, &rg_gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, rgd->rd_length + RES_DINODE +
+				 RES_EATTR + RES_STATFS + RES_QUOTA, blks);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_trans_add_meta(ip->i_gl, bh);
+
+	dataptrs = GFS2_EA2DATAPTRS(ea);
+	for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) {
+		if (!*dataptrs)
+			break;
+		bn = be64_to_cpu(*dataptrs);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_free_meta(ip, rgd, bstart, blen);
+			bstart = bn;
+			blen = 1;
+		}
+
+		*dataptrs = 0;
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
+	}
+	if (bstart)
+		gfs2_free_meta(ip, rgd, bstart, blen);
+
+	if (prev && !leave) {
+		u32 len;
+
+		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+		prev->ea_rec_len = cpu_to_be32(len);
+
+		if (GFS2_EA_IS_LAST(ea))
+			prev->ea_flags |= GFS2_EAFLAG_LAST;
+	} else {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+		ea->ea_num_ptrs = 0;
+	}
+
+	inode_set_ctime_current(&ip->i_inode);
+	__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_uninit(&rg_gh);
+	return error;
+}
+
+static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
+			       struct gfs2_ea_header *ea,
+			       struct gfs2_ea_header *prev, int leave)
+{
+	int error;
+
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
+	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (error)
+		goto out_alloc;
+
+	error = ea_dealloc_unstuffed(ip, bh, ea, prev, (leave) ? &error : NULL);
+
+	gfs2_quota_unhold(ip);
+out_alloc:
+	return error;
+}
+
+struct ea_list {
+	struct gfs2_ea_request *ei_er;
+	unsigned int ei_size;
+};
+
+static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh,
+		     struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+		     void *private)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct ea_list *ei = private;
+	struct gfs2_ea_request *er = ei->ei_er;
+	unsigned int ea_size;
+	char *prefix;
+	unsigned int l;
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED)
+		return 0;
+
+	BUG_ON(ea->ea_type > GFS2_EATYPE_SECURITY &&
+	       sdp->sd_sb.sb_fs_format == GFS2_FS_FORMAT_MIN);
+	switch (ea->ea_type) {
+	case GFS2_EATYPE_USR:
+		prefix = "user.";
+		l = 5;
+		break;
+	case GFS2_EATYPE_SYS:
+		prefix = "system.";
+		l = 7;
+		break;
+	case GFS2_EATYPE_SECURITY:
+		prefix = "security.";
+		l = 9;
+		break;
+	case GFS2_EATYPE_TRUSTED:
+		prefix = "trusted.";
+		l = 8;
+		break;
+	default:
+		return 0;
+	}
+
+	ea_size = l + ea->ea_name_len + 1;
+	if (er->er_data_len) {
+		if (ei->ei_size + ea_size > er->er_data_len)
+			return -ERANGE;
+
+		memcpy(er->er_data + ei->ei_size, prefix, l);
+		memcpy(er->er_data + ei->ei_size + l, GFS2_EA2NAME(ea),
+		       ea->ea_name_len);
+		er->er_data[ei->ei_size + ea_size - 1] = 0;
+	}
+
+	ei->ei_size += ea_size;
+
+	return 0;
+}
+
+/**
+ * gfs2_listxattr - List gfs2 extended attributes
+ * @dentry: The dentry whose inode we are interested in
+ * @buffer: The buffer to write the results
+ * @size: The size of the buffer
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+
+ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+	struct gfs2_ea_request er;
+	struct gfs2_holder i_gh;
+	int error;
+
+	memset(&er, 0, sizeof(struct gfs2_ea_request));
+	if (size) {
+		er.er_data = buffer;
+		er.er_data_len = size;
+	}
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+	if (error)
+		return error;
+
+	if (ip->i_eattr) {
+		struct ea_list ei = { .ei_er = &er, .ei_size = 0 };
+
+		error = ea_foreach(ip, ea_list_i, &ei);
+		if (!error)
+			error = ei.ei_size;
+	}
+
+	gfs2_glock_dq_uninit(&i_gh);
+
+	return error;
+}
+
+/**
+ * gfs2_iter_unstuffed - copies the unstuffed xattr data to/from the
+ *                       request buffer
+ * @ip: The GFS2 inode
+ * @ea: The extended attribute header structure
+ * @din: The data to be copied in
+ * @dout: The data to be copied out (one of din,dout will be NULL)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+			       const char *din, char *dout)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head **bh;
+	unsigned int amount = GFS2_EA_DATA_LEN(ea);
+	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
+	__be64 *dataptrs = GFS2_EA2DATAPTRS(ea);
+	unsigned int x;
+	int error = 0;
+	unsigned char *pos;
+	unsigned cp_size;
+
+	bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
+	if (!bh)
+		return -ENOMEM;
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0,
+				       bh + x);
+		if (error) {
+			while (x--)
+				brelse(bh[x]);
+			goto out;
+		}
+		dataptrs++;
+	}
+
+	for (x = 0; x < nptrs; x++) {
+		error = gfs2_meta_wait(sdp, bh[x]);
+		if (error) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			goto out;
+		}
+		if (gfs2_metatype_check(sdp, bh[x], GFS2_METATYPE_ED)) {
+			for (; x < nptrs; x++)
+				brelse(bh[x]);
+			error = -EIO;
+			goto out;
+		}
+
+		pos = bh[x]->b_data + sizeof(struct gfs2_meta_header);
+		cp_size = (sdp->sd_jbsize > amount) ? amount : sdp->sd_jbsize;
+
+		if (dout) {
+			memcpy(dout, pos, cp_size);
+			dout += sdp->sd_jbsize;
+		}
+
+		if (din) {
+			gfs2_trans_add_meta(ip->i_gl, bh[x]);
+			memcpy(pos, din, cp_size);
+			din += sdp->sd_jbsize;
+		}
+
+		amount -= sdp->sd_jbsize;
+		brelse(bh[x]);
+	}
+
+out:
+	kfree(bh);
+	return error;
+}
+
+static int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el,
+			    char *data, size_t size)
+{
+	int ret;
+	size_t len = GFS2_EA_DATA_LEN(el->el_ea);
+	if (len > size)
+		return -ERANGE;
+
+	if (GFS2_EA_IS_STUFFED(el->el_ea)) {
+		memcpy(data, GFS2_EA2DATA(el->el_ea), len);
+		return len;
+	}
+	ret = gfs2_iter_unstuffed(ip, el->el_ea, NULL, data);
+	if (ret < 0)
+		return ret;
+	return len;
+}
+
+int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **ppdata)
+{
+	struct gfs2_ea_location el;
+	int error;
+	int len;
+	char *data;
+
+	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		goto out;
+	if (!GFS2_EA_DATA_LEN(el.el_ea))
+		goto out;
+
+	len = GFS2_EA_DATA_LEN(el.el_ea);
+	data = kmalloc(len, GFP_NOFS);
+	error = -ENOMEM;
+	if (data == NULL)
+		goto out;
+
+	error = gfs2_ea_get_copy(ip, &el, data, len);
+	if (error < 0)
+		kfree(data);
+	else
+		*ppdata = data;
+out:
+	brelse(el.el_bh);
+	return error;
+}
+
+/**
+ * __gfs2_xattr_get - Get a GFS2 extended attribute
+ * @inode: The inode
+ * @name: The name of the extended attribute
+ * @buffer: The buffer to write the result into
+ * @size: The size of the buffer
+ * @type: The type of extended attribute
+ *
+ * Returns: actual size of data on success, -errno on error
+ */
+static int __gfs2_xattr_get(struct inode *inode, const char *name,
+			    void *buffer, size_t size, int type)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_eattr)
+		return -ENODATA;
+	if (strlen(name) > GFS2_EA_MAX_NAME_LEN)
+		return -EINVAL;
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		return -ENODATA;
+	if (size)
+		error = gfs2_ea_get_copy(ip, &el, buffer, size);
+	else
+		error = GFS2_EA_DATA_LEN(el.el_ea);
+	brelse(el.el_bh);
+
+	return error;
+}
+
+static int gfs2_xattr_get(const struct xattr_handler *handler,
+			  struct dentry *unused, struct inode *inode,
+			  const char *name, void *buffer, size_t size)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	/* During lookup, SELinux calls this function with the glock locked. */
+
+	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+		if (ret)
+			return ret;
+	} else {
+		gfs2_holder_mark_uninitialized(&gh);
+	}
+	ret = __gfs2_xattr_get(inode, name, buffer, size, handler->flags);
+	if (gfs2_holder_initialized(&gh))
+		gfs2_glock_dq_uninit(&gh);
+	return ret;
+}
+
+/**
+ * ea_alloc_blk - allocates a new block for extended attributes.
+ * @ip: A pointer to the inode that's getting extended attributes
+ * @bhp: Pointer to pointer to a struct buffer_head
+ *
+ * Returns: errno
+ */
+
+static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_ea_header *ea;
+	unsigned int n = 1;
+	u64 block;
+	int error;
+
+	error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+	if (error)
+		return error;
+	gfs2_trans_remove_revoke(sdp, block, 1);
+	*bhp = gfs2_meta_new(ip->i_gl, block);
+	gfs2_trans_add_meta(ip->i_gl, *bhp);
+	gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
+	gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header));
+
+	ea = GFS2_EA_BH2FIRST(*bhp);
+	ea->ea_rec_len = cpu_to_be32(sdp->sd_jbsize);
+	ea->ea_type = GFS2_EATYPE_UNUSED;
+	ea->ea_flags = GFS2_EAFLAG_LAST;
+	ea->ea_num_ptrs = 0;
+
+	gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+	return 0;
+}
+
+/**
+ * ea_write - writes the request info to an ea, creating new blocks if
+ *            necessary
+ * @ip: inode that is being modified
+ * @ea: the location of the new ea in a block
+ * @er: the write request
+ *
+ * Note: does not update ea_rec_len or the GFS2_EAFLAG_LAST bit of ea_flags
+ *
+ * Returns: errno
+ */
+
+static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
+		    struct gfs2_ea_request *er)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int error;
+
+	ea->ea_data_len = cpu_to_be32(er->er_data_len);
+	ea->ea_name_len = er->er_name_len;
+	ea->ea_type = er->er_type;
+	ea->__pad = 0;
+
+	memcpy(GFS2_EA2NAME(ea), er->er_name, er->er_name_len);
+
+	if (GFS2_EAREQ_SIZE_STUFFED(er) <= sdp->sd_jbsize) {
+		ea->ea_num_ptrs = 0;
+		memcpy(GFS2_EA2DATA(ea), er->er_data, er->er_data_len);
+	} else {
+		__be64 *dataptr = GFS2_EA2DATAPTRS(ea);
+		const char *data = er->er_data;
+		unsigned int data_len = er->er_data_len;
+		unsigned int copy;
+		unsigned int x;
+
+		ea->ea_num_ptrs = DIV_ROUND_UP(er->er_data_len, sdp->sd_jbsize);
+		for (x = 0; x < ea->ea_num_ptrs; x++) {
+			struct buffer_head *bh;
+			u64 block;
+			int mh_size = sizeof(struct gfs2_meta_header);
+			unsigned int n = 1;
+
+			error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
+			if (error)
+				return error;
+			gfs2_trans_remove_revoke(sdp, block, 1);
+			bh = gfs2_meta_new(ip->i_gl, block);
+			gfs2_trans_add_meta(ip->i_gl, bh);
+			gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
+
+			gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+			copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
+							   data_len;
+			memcpy(bh->b_data + mh_size, data, copy);
+			if (copy < sdp->sd_jbsize)
+				memset(bh->b_data + mh_size + copy, 0,
+				       sdp->sd_jbsize - copy);
+
+			*dataptr++ = cpu_to_be64(bh->b_blocknr);
+			data += copy;
+			data_len -= copy;
+
+			brelse(bh);
+		}
+
+		gfs2_assert_withdraw(sdp, !data_len);
+	}
+
+	return 0;
+}
+
+typedef int (*ea_skeleton_call_t) (struct gfs2_inode *ip,
+				   struct gfs2_ea_request *er, void *private);
+
+static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+			     unsigned int blks,
+			     ea_skeleton_call_t skeleton_call, void *private)
+{
+	struct gfs2_alloc_parms ap = { .target = blks };
+	int error;
+
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
+	error = gfs2_quota_lock_check(ip, &ap);
+	if (error)
+		return error;
+
+	error = gfs2_inplace_reserve(ip, &ap);
+	if (error)
+		goto out_gunlock_q;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
+				 blks + gfs2_rg_blocks(ip, blks) +
+				 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
+	if (error)
+		goto out_ipres;
+
+	error = skeleton_call(ip, er, private);
+	if (error)
+		goto out_end_trans;
+
+	inode_set_ctime_current(&ip->i_inode);
+	__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
+
+out_end_trans:
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+out_ipres:
+	gfs2_inplace_release(ip);
+out_gunlock_q:
+	gfs2_quota_unlock(ip);
+	return error;
+}
+
+static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+		     void *private)
+{
+	struct buffer_head *bh;
+	int error;
+
+	error = ea_alloc_blk(ip, &bh);
+	if (error)
+		return error;
+
+	ip->i_eattr = bh->b_blocknr;
+	error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er);
+
+	brelse(bh);
+
+	return error;
+}
+
+/*
+ * ea_init - initializes a new eattr block
+ *
+ * Returns: errno
+ */
+static int ea_init(struct gfs2_inode *ip, int type, const char *name,
+		   const void *data, size_t size)
+{
+	struct gfs2_ea_request er;
+	unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize;
+	unsigned int blks = 1;
+
+	er.er_type = type;
+	er.er_name = name;
+	er.er_name_len = strlen(name);
+	er.er_data = (void *)data;
+	er.er_data_len = size;
+
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, jbsize);
+
+	return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL);
+}
+
+static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea)
+{
+	u32 ea_size = GFS2_EA_SIZE(ea);
+	struct gfs2_ea_header *new = (struct gfs2_ea_header *)((char *)ea +
+							       ea_size);
+	u32 new_size = GFS2_EA_REC_LEN(ea) - ea_size;
+	int last = ea->ea_flags & GFS2_EAFLAG_LAST;
+
+	ea->ea_rec_len = cpu_to_be32(ea_size);
+	ea->ea_flags ^= last;
+
+	new->ea_rec_len = cpu_to_be32(new_size);
+	new->ea_flags = last;
+
+	return new;
+}
+
+static void ea_set_remove_stuffed(struct gfs2_inode *ip,
+				  struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_header *ea = el->el_ea;
+	struct gfs2_ea_header *prev = el->el_prev;
+	u32 len;
+
+	gfs2_trans_add_meta(ip->i_gl, el->el_bh);
+
+	if (!prev || !GFS2_EA_IS_STUFFED(ea)) {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+		return;
+	} else if (GFS2_EA2NEXT(prev) != ea) {
+		prev = GFS2_EA2NEXT(prev);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), GFS2_EA2NEXT(prev) == ea);
+	}
+
+	len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+	prev->ea_rec_len = cpu_to_be32(len);
+
+	if (GFS2_EA_IS_LAST(ea))
+		prev->ea_flags |= GFS2_EAFLAG_LAST;
+}
+
+struct ea_set {
+	int ea_split;
+
+	struct gfs2_ea_request *es_er;
+	struct gfs2_ea_location *es_el;
+
+	struct buffer_head *es_bh;
+	struct gfs2_ea_header *es_ea;
+};
+
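The ea_set_simple() path below chooses between these cases with the stuffed/unstuffed sizing rule from ea_calc_size() at the top of this file: an xattr is stuffed when header, name, and data fit in one journaled block, and otherwise the record holds one 64-bit block pointer per block of data. A standalone sketch of that rule (the 16-byte header size and the 4K block size are assumptions for illustration, and ea_would_stuff() is not a kernel function):

#include <stdio.h>
#include <stdint.h>

/* Assumed 16-byte gfs2_ea_header (two __be32 lengths, four one-byte
 * fields, 4 bytes of padding); jbsize stands in for sdp->sd_jbsize. */
#define EA_HDR    16
#define ALIGN8(x) (((x) + 7) & ~7u)

static int ea_would_stuff(unsigned jbsize, unsigned nsize, unsigned dsize,
                          unsigned *size)
{
	*size = ALIGN8(EA_HDR + nsize + dsize);		/* stuffed layout */
	if (*size <= jbsize)
		return 1;
	/* unstuffed: header + name + one pointer per data block */
	*size = ALIGN8(EA_HDR + nsize +
	               sizeof(uint64_t) * ((dsize + jbsize - 1) / jbsize));
	return 0;
}

int main(void)
{
	unsigned size;
	unsigned jbsize = 4096 - 16;	/* 4K block minus metadata header */

	for (unsigned dsize = 1024; dsize <= 16384; dsize *= 2) {
		int stuffed = ea_would_stuff(jbsize, 12, dsize, &size);
		printf("%5u bytes of data -> %s, record size %u\n",
		       dsize, stuffed ? "stuffed" : "unstuffed", size);
	}
	return 0;
}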
+static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh,
+				 struct gfs2_ea_header *ea, struct ea_set *es)
+{
+	struct gfs2_ea_request *er = es->es_er;
+	int error;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + 2 * RES_EATTR, 0);
+	if (error)
+		return error;
+
+	gfs2_trans_add_meta(ip->i_gl, bh);
+
+	if (es->ea_split)
+		ea = ea_split_ea(ea);
+
+	ea_write(ip, ea, er);
+
+	if (es->es_el)
+		ea_set_remove_stuffed(ip, es->es_el);
+
+	inode_set_ctime_current(&ip->i_inode);
+	__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
+
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+	return error;
+}
+
+static int ea_set_simple_alloc(struct gfs2_inode *ip,
+			       struct gfs2_ea_request *er, void *private)
+{
+	struct ea_set *es = private;
+	struct gfs2_ea_header *ea = es->es_ea;
+	int error;
+
+	gfs2_trans_add_meta(ip->i_gl, es->es_bh);
+
+	if (es->ea_split)
+		ea = ea_split_ea(ea);
+
+	error = ea_write(ip, ea, er);
+	if (error)
+		return error;
+
+	if (es->es_el)
+		ea_set_remove_stuffed(ip, es->es_el);
+
+	return 0;
+}
+
+static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh,
+			 struct gfs2_ea_header *ea, struct gfs2_ea_header *prev,
+			 void *private)
+{
+	struct ea_set *es = private;
+	unsigned int size;
+	int stuffed;
+	int error;
+
+	stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len,
+			       es->es_er->er_data_len, &size);
+
+	if (ea->ea_type == GFS2_EATYPE_UNUSED) {
+		if (GFS2_EA_REC_LEN(ea) < size)
+			return 0;
+		if (!GFS2_EA_IS_STUFFED(ea)) {
+			error = ea_remove_unstuffed(ip, bh, ea, prev, 1);
+			if (error)
+				return error;
+		}
+		es->ea_split = 0;
+	} else if (GFS2_EA_REC_LEN(ea) - GFS2_EA_SIZE(ea) >= size)
+		es->ea_split = 1;
+	else
+		return 0;
+
+	if (stuffed) {
+		error = ea_set_simple_noalloc(ip, bh, ea, es);
+		if (error)
+			return error;
+	} else {
+		unsigned int blks;
+
+		es->es_bh = bh;
+		es->es_ea = ea;
+		blks = 2 + DIV_ROUND_UP(es->es_er->er_data_len,
+					GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+		error = ea_alloc_skeleton(ip, es->es_er, blks,
+					  ea_set_simple_alloc, es);
+		if (error)
+			return error;
+	}
+
+	return 1;
+}
+
+static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
+			void *private)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct buffer_head *indbh, *newbh;
+	__be64 *eablk;
+	int error;
+	int mh_size = sizeof(struct gfs2_meta_header);
+
+	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+		__be64 *end;
+
+		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0,
+				       &indbh);
+		if (error)
+			return error;
+
+		if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+			error = -EIO;
+			goto out;
+		}
+
+		eablk = (__be64 *)(indbh->b_data + mh_size);
+		end = eablk + sdp->sd_inptrs;
+
+		for (; eablk < end; eablk++)
+			if (!*eablk)
+				break;
+
+		if (eablk == end) {
+			error = -ENOSPC;
+			goto out;
+		}
+
+		gfs2_trans_add_meta(ip->i_gl, indbh);
+	} else {
+		u64 blk;
+		unsigned int n = 1;
+		error = gfs2_alloc_blocks(ip, &blk, &n, 0, NULL);
+		if (error)
+			return error;
+		gfs2_trans_remove_revoke(sdp, blk, 1);
+		indbh = gfs2_meta_new(ip->i_gl, blk);
+		gfs2_trans_add_meta(ip->i_gl, indbh);
+		gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
+		gfs2_buffer_clear_tail(indbh, mh_size);
+
+		eablk = (__be64 *)(indbh->b_data + mh_size);
+		*eablk = cpu_to_be64(ip->i_eattr);
+		ip->i_eattr = blk;
+		ip->i_diskflags |= GFS2_DIF_EA_INDIRECT;
+		gfs2_add_inode_blocks(&ip->i_inode, 1);
+
+		eablk++;
+	}
+
+	error = ea_alloc_blk(ip, &newbh);
+	if (error)
+		goto out;
+
+	*eablk = cpu_to_be64((u64)newbh->b_blocknr);
+	error = ea_write(ip, GFS2_EA_BH2FIRST(newbh), er);
+	brelse(newbh);
+	if (error)
+		goto out;
+
+	if (private)
+		ea_set_remove_stuffed(ip, private);
+
+out:
+	brelse(indbh);
+	return error;
+}
+
+static int ea_set_i(struct gfs2_inode *ip, int type, const char *name,
+		    const void *value, size_t size, struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_request er;
+	struct ea_set es;
+	unsigned int blks = 2;
+	int error;
+
+	er.er_type = type;
+	er.er_name = name;
+	er.er_data = (void *)value;
+	er.er_name_len = strlen(name);
+	er.er_data_len = size;
+
+	memset(&es, 0, sizeof(struct ea_set));
+	es.es_er = &er;
+	es.es_el = el;
+
+	error = ea_foreach(ip, ea_set_simple, &es);
+	if (error > 0)
+		return 0;
+	if (error)
+		return error;
+
+	if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT))
+		blks++;
+	if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize)
+		blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize);
+
+	return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el);
+}
+
+static int ea_set_remove_unstuffed(struct gfs2_inode *ip,
+				   struct gfs2_ea_location *el)
+{
+	if (el->el_prev && GFS2_EA2NEXT(el->el_prev) != el->el_ea) {
+		el->el_prev = GFS2_EA2NEXT(el->el_prev);
+		gfs2_assert_withdraw(GFS2_SB(&ip->i_inode),
+				     GFS2_EA2NEXT(el->el_prev) == el->el_ea);
+	}
+
+	return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0);
+}
+
+static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el)
+{
+	struct gfs2_ea_header *ea = el->el_ea;
+	struct gfs2_ea_header *prev = el->el_prev;
+	int error;
+
+	error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0);
+	if (error)
+		return error;
+
+	gfs2_trans_add_meta(ip->i_gl, el->el_bh);
+
+	if (prev) {
+		u32 len;
+
+		len = GFS2_EA_REC_LEN(prev) + GFS2_EA_REC_LEN(ea);
+		prev->ea_rec_len = cpu_to_be32(len);
+
+		if (GFS2_EA_IS_LAST(ea))
+			prev->ea_flags |= GFS2_EAFLAG_LAST;
+	} else {
+		ea->ea_type = GFS2_EATYPE_UNUSED;
+	}
+
+	inode_set_ctime_current(&ip->i_inode);
+	__mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC);
+
+	gfs2_trans_end(GFS2_SB(&ip->i_inode));
+
+	return error;
+}
+
+/**
+ * gfs2_xattr_remove - Remove a GFS2 extended attribute
+ * @ip: The inode
+ * @type: The type of the extended attribute
+ * @name: The name of the extended attribute
+ *
+ * This is not called directly by the VFS since we use the (common)
+ * scheme of making a "set with NULL data" mean a remove request. Note
+ * that this is different from a set with zero length data.
+ *
+ * Returns: 0, or errno on failure
+ */
+
+static int gfs2_xattr_remove(struct gfs2_inode *ip, int type, const char *name)
+{
+	struct gfs2_ea_location el;
+	int error;
+
+	if (!ip->i_eattr)
+		return -ENODATA;
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+	if (!el.el_ea)
+		return -ENODATA;
+
+	if (GFS2_EA_IS_STUFFED(el.el_ea))
+		error = ea_remove_stuffed(ip, &el);
+	else
+		error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0);
+
+	brelse(el.el_bh);
+
+	return error;
+}
+
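__gfs2_xattr_set() below folds create, replace, and remove into one entry point: a NULL value means remove, XATTR_CREATE fails on an existing name, and XATTR_REPLACE fails on a missing one. A condensed standalone sketch of that contract (xattr_set_outcome() is an illustrative helper, not kernel code; the XATTR_* values match the standard uapi definitions):

#include <stdio.h>
#include <stddef.h>

#define XATTR_CREATE  0x1
#define XATTR_REPLACE 0x2

/* Mirrors the entry checks of __gfs2_xattr_set(). */
static const char *xattr_set_outcome(int exists, const void *value, int flags)
{
	if (!value)
		return exists ? "remove" :
		       (flags & XATTR_REPLACE) ? "-ENODATA" : "no-op";
	if (exists)
		return (flags & XATTR_CREATE) ? "-EEXIST" : "replace in place";
	return (flags & XATTR_REPLACE) ? "-ENODATA" : "create";
}

int main(void)
{
	printf("%s\n", xattr_set_outcome(1, NULL, 0));           /* remove */
	printf("%s\n", xattr_set_outcome(0, "v", XATTR_CREATE)); /* create */
	printf("%s\n", xattr_set_outcome(1, "v", XATTR_CREATE)); /* -EEXIST */
	return 0;
}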
+/**
+ * __gfs2_xattr_set - Set (or remove) a GFS2 extended attribute
+ * @inode: The inode
+ * @name: The name of the extended attribute
+ * @value: The value of the extended attribute (NULL for remove)
+ * @size: The size of the @value argument
+ * @flags: Create or Replace
+ * @type: The type of the extended attribute
+ *
+ * See gfs2_xattr_remove() for details of the removal of xattrs.
+ *
+ * Returns: 0 or errno on failure
+ */
+
+int __gfs2_xattr_set(struct inode *inode, const char *name,
+		     const void *value, size_t size, int flags, int type)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_ea_location el;
+	unsigned int namel = strlen(name);
+	int error;
+
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		return -EPERM;
+	if (namel > GFS2_EA_MAX_NAME_LEN)
+		return -ERANGE;
+
+	if (value == NULL) {
+		error = gfs2_xattr_remove(ip, type, name);
+		if (error == -ENODATA && !(flags & XATTR_REPLACE))
+			error = 0;
+		return error;
+	}
+
+	if (ea_check_size(sdp, namel, size))
+		return -ERANGE;
+
+	if (!ip->i_eattr) {
+		if (flags & XATTR_REPLACE)
+			return -ENODATA;
+		return ea_init(ip, type, name, value, size);
+	}
+
+	error = gfs2_ea_find(ip, type, name, &el);
+	if (error)
+		return error;
+
+	if (el.el_ea) {
+		if (ip->i_diskflags & GFS2_DIF_APPENDONLY) {
+			brelse(el.el_bh);
+			return -EPERM;
+		}
+
+		error = -EEXIST;
+		if (!(flags & XATTR_CREATE)) {
+			int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea);
+			error = ea_set_i(ip, type, name, value, size, &el);
+			if (!error && unstuffed)
+				ea_set_remove_unstuffed(ip, &el);
+		}
+
+		brelse(el.el_bh);
+		return error;
+	}
+
+	error = -ENODATA;
+	if (!(flags & XATTR_REPLACE))
+		error = ea_set_i(ip, type, name, value, size, NULL);
+
+	return error;
+}
+
+static int gfs2_xattr_set(const struct xattr_handler *handler,
+			  struct mnt_idmap *idmap,
+			  struct dentry *unused, struct inode *inode,
+			  const char *name, const void *value,
+			  size_t size, int flags)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int ret;
+
+	ret = gfs2_qa_get(ip);
+	if (ret)
+		return ret;
+
+	/* May be called from gfs_setattr with the glock locked. */
+
+	if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
+		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+		if (ret)
+			goto out;
+	} else {
+		if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE)) {
+			ret = -EIO;
+			goto out;
+		}
+		gfs2_holder_mark_uninitialized(&gh);
+	}
+	ret = __gfs2_xattr_set(inode, name, value, size, flags, handler->flags);
+	if (gfs2_holder_initialized(&gh))
+		gfs2_glock_dq_uninit(&gh);
+out:
+	gfs2_qa_put(ip);
+	return ret;
+}
+
+static int ea_dealloc_indirect(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrp_list rlist;
+	struct gfs2_rgrpd *rgd;
+	struct buffer_head *indbh, *dibh;
+	__be64 *eablk, *end;
+	unsigned int rg_blocks = 0;
+	u64 bstart = 0;
+	unsigned int blen = 0;
+	unsigned int blks = 0;
+	unsigned int x;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
+
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh);
+	if (error)
+		return error;
+
+	if (gfs2_metatype_check(sdp, indbh, GFS2_METATYPE_IN)) {
+		error = -EIO;
+		goto out;
+	}
+
+	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	end = eablk + sdp->sd_inptrs;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_rlist_add(ip, &rlist, bstart);
+			bstart = bn;
+			blen = 1;
+		}
+		blks++;
+	}
+	if (bstart)
+		gfs2_rlist_add(ip, &rlist, bstart);
+	else
+		goto out;
+
+	gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE);
+
+	for (x = 0; x < rlist.rl_rgrps; x++) {
+		rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
+		rg_blocks += rgd->rd_length;
+	}
+
+	error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
+	if (error)
+		goto out_rlist_free;
+
+	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT +
+				 RES_STATFS + RES_QUOTA, blks);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_trans_add_meta(ip->i_gl, indbh);
+
+	eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
+	bstart = 0;
+	rgd = NULL;
+	blen = 0;
+
+	for (; eablk < end; eablk++) {
+		u64 bn;
+
+		if (!*eablk)
+			break;
+		bn = be64_to_cpu(*eablk);
+
+		if (bstart + blen == bn)
+			blen++;
+		else {
+			if (bstart)
+				gfs2_free_meta(ip, rgd, bstart, blen);
+			bstart = bn;
+			rgd = gfs2_blk2rgrpd(sdp, bstart, true);
+			blen = 1;
+		}
+
+		*eablk = 0;
+		gfs2_add_inode_blocks(&ip->i_inode, -1);
+	}
+	if (bstart)
+		gfs2_free_meta(ip, rgd, bstart, blen);
+
+	ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (!error) {
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		gfs2_dinode_out(ip, dibh->b_data);
+		brelse(dibh);
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
+out_rlist_free:
+	gfs2_rlist_free(&rlist);
+out:
+	brelse(indbh);
+	return error;
+}
+
+static int ea_dealloc_block(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	struct gfs2_rgrpd *rgd;
+	struct buffer_head *dibh;
+	struct gfs2_holder gh;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1);
+	if (!rgd) {
+		gfs2_consist_inode(ip);
+		return -EIO;
+	}
+
+	error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
+				   LM_FLAG_NODE_SCOPE, &gh);
+	if (error)
+		return error;
+
+	error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_DINODE + RES_STATFS +
+				 RES_QUOTA, 1);
+	if (error)
+		goto out_gunlock;
+
+	gfs2_free_meta(ip, rgd, ip->i_eattr, 1);
+
+	ip->i_eattr = 0;
+	gfs2_add_inode_blocks(&ip->i_inode, -1);
+
+	if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
+		error = gfs2_meta_inode_buffer(ip, &dibh);
+		if (!error) {
+			gfs2_trans_add_meta(ip->i_gl, dibh);
+			gfs2_dinode_out(ip, dibh->b_data);
+			brelse(dibh);
+		}
+	}
+
+	gfs2_trans_end(sdp);
+
+out_gunlock:
+	gfs2_glock_dq_uninit(&gh);
+	return error;
+}
+
+/**
+ * gfs2_ea_dealloc - deallocate the extended attribute fork
+ * @ip: the inode
+ *
+ * Returns: errno
+ */
+
+int gfs2_ea_dealloc(struct gfs2_inode *ip)
+{
+	int error;
+
+	error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
+	if (error)
+		return error;
+
+	error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
+	if (error)
+		return error;
+
+	if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) {
+		error = ea_foreach(ip, ea_dealloc_unstuffed, NULL);
+		if (error)
+			goto out_quota;
+
+		if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
+			error = ea_dealloc_indirect(ip);
+			if (error)
+				goto out_quota;
+		}
+	}
+
+	error = ea_dealloc_block(ip);
+
+out_quota:
+	gfs2_quota_unhold(ip);
+	return error;
+}
+
+static const struct xattr_handler gfs2_xattr_user_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.flags  = GFS2_EATYPE_USR,
+	.get    = gfs2_xattr_get,
+	.set    = gfs2_xattr_set,
+};
+
+static const struct xattr_handler gfs2_xattr_security_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.flags  = GFS2_EATYPE_SECURITY,
+	.get    = gfs2_xattr_get,
+	.set    = gfs2_xattr_set,
+};
+
+static bool
+gfs2_xattr_trusted_list(struct dentry *dentry)
+{
+	return capable(CAP_SYS_ADMIN);
+}
+
+static const struct xattr_handler gfs2_xattr_trusted_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.flags  = GFS2_EATYPE_TRUSTED,
+	.list   = gfs2_xattr_trusted_list,
+	.get    = gfs2_xattr_get,
+	.set    = gfs2_xattr_set,
+};
+
+const struct xattr_handler *gfs2_xattr_handlers_max[] = {
+	/* GFS2_FS_FORMAT_MAX */
+	&gfs2_xattr_trusted_handler,
+
+	/* GFS2_FS_FORMAT_MIN */
+	&gfs2_xattr_user_handler,
+	&gfs2_xattr_security_handler,
+	NULL,
+};
+
+const struct xattr_handler **gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1;
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
new file mode 100644
index 0000000000..2aed9d7d48
--- /dev/null
+++ b/fs/gfs2/xattr.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
+ * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
+ */
+
+#ifndef __EATTR_DOT_H__
+#define __EATTR_DOT_H__
+
+struct gfs2_inode;
+struct iattr;
+
+#define GFS2_EA_REC_LEN(ea) be32_to_cpu((ea)->ea_rec_len)
+#define GFS2_EA_DATA_LEN(ea) be32_to_cpu((ea)->ea_data_len)
+
+#define GFS2_EA_SIZE(ea) \
+ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \
+      ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \
+       (sizeof(__be64) * (ea)->ea_num_ptrs)), 8)
+
+#define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs)
+#define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST)
+
+#define GFS2_EAREQ_SIZE_STUFFED(er) \
+ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8)
+
+#define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1))
+#define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len)
+
+#define GFS2_EA2DATAPTRS(ea) \
+((__be64 *)(GFS2_EA2NAME(ea) + ALIGN((ea)->ea_name_len, 8)))
+
+#define GFS2_EA2NEXT(ea) \
+((struct gfs2_ea_header *)((char *)(ea) + GFS2_EA_REC_LEN(ea)))
+
+#define GFS2_EA_BH2FIRST(bh) \
+((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header)))
+
+struct gfs2_ea_request {
+	const char *er_name;
+	char *er_data;
+	unsigned int er_name_len;
+	unsigned int er_data_len;
+	unsigned int er_type; /* GFS2_EATYPE_... */
+};
+
+struct gfs2_ea_location {
+	struct buffer_head *el_bh;
+	struct gfs2_ea_header *el_ea;
+	struct gfs2_ea_header *el_prev;
+};
+
+extern int __gfs2_xattr_set(struct inode *inode, const char *name,
+			    const void *value, size_t size,
+			    int flags, int type);
+extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
+
+/* Exported to acl.c */
+
+extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
+
+#endif /* __EATTR_DOT_H__ */
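One detail worth noting from xattr.c above: the two handler tables share storage, with gfs2_xattr_handlers_min pointing one element into gfs2_xattr_handlers_max, so older-format (GFS2_FS_FORMAT_MIN) filesystems simply skip the trusted handler. A minimal standalone sketch of the same aliasing trick (illustrative strings in place of the real xattr_handler structures):

#include <stdio.h>

static const char *handlers_max[] = {
	"trusted",	/* only visible on the newest on-disk format */
	"user",
	"security",
	NULL,
};

/* Older formats reuse the same table, minus the first entry. */
static const char **handlers_min = handlers_max + 1;

int main(void)
{
	for (const char **h = handlers_min; *h; h++)
		printf("min-format handler: %s\n", *h);
	return 0;
}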