diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 18:49:45 +0000 |
commit | 2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch) | |
tree | 848558de17fb3008cdf4d861b01ac7781903ce39 /fs/ntfs | |
parent | Initial commit. (diff) | |
download | linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip |
Adding upstream version 6.1.76.upstream/6.1.76
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
74 files changed, 60304 insertions, 0 deletions
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig new file mode 100644 index 000000000..f93e69a61 --- /dev/null +++ b/fs/ntfs/Kconfig @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: GPL-2.0-only +config NTFS_FS + tristate "NTFS file system support" + select NLS + help + NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003. + + Saying Y or M here enables read support. There is partial, but + safe, write support available. For write support you must also + say Y to "NTFS write support" below. + + There are also a number of user-space tools available, called + ntfsprogs. These include ntfsundelete and ntfsresize, that work + without NTFS support enabled in the kernel. + + This is a rewrite from scratch of Linux NTFS support and replaced + the old NTFS code starting with Linux 2.5.11. A backport to + the Linux 2.4 kernel series is separately available as a patch + from the project web site. + + For more information see <file:Documentation/filesystems/ntfs.rst> + and <http://www.linux-ntfs.org/>. + + To compile this file system support as a module, choose M here: the + module will be called ntfs. + + If you are not using Windows NT, 2000, XP or 2003 in addition to + Linux on your computer it is safe to say N. + +config NTFS_DEBUG + bool "NTFS debugging support" + depends on NTFS_FS + help + If you are experiencing any problems with the NTFS file system, say + Y here. This will result in additional consistency checks to be + performed by the driver as well as additional debugging messages to + be written to the system log. Note that debugging messages are + disabled by default. To enable them, supply the option debug_msgs=1 + at the kernel command line when booting the kernel or as an option + to insmod when loading the ntfs module. Once the driver is active, + you can enable debugging messages by doing (as root): + echo 1 > /proc/sys/fs/ntfs-debug + Replacing the "1" with "0" would disable debug messages. + + If you leave debugging messages disabled, this results in little + overhead, but enabling debug messages results in very significant + slowdown of the system. + + When reporting bugs, please try to have available a full dump of + debugging messages while the misbehaviour was occurring. + +config NTFS_RW + bool "NTFS write support" + depends on NTFS_FS + depends on PAGE_SIZE_LESS_THAN_64KB + help + This enables the partial, but safe, write support in the NTFS driver. + + The only supported operation is overwriting existing files, without + changing the file length. No file or directory creation, deletion or + renaming is possible. Note only non-resident files can be written to + so you may find that some very small files (<500 bytes or so) cannot + be written to. + + While we cannot guarantee that it will not damage any data, we have + so far not received a single report where the driver would have + damaged someones data so we assume it is perfectly safe to use. + + Note: While write support is safe in this version (a rewrite from + scratch of the NTFS support), it should be noted that the old NTFS + write support, included in Linux 2.5.10 and before (since 1997), + is not safe. + + This is currently useful with TopologiLinux. TopologiLinux is run + on top of any DOS/Microsoft Windows system without partitioning your + hard disk. Unlike other Linux distributions TopologiLinux does not + need its own partition. For more information see + <http://topologi-linux.sourceforge.net/> + + It is perfectly safe to say N here. diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile new file mode 100644 index 000000000..3e736572e --- /dev/null +++ b/fs/ntfs/Makefile @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0 +# Rules for making the NTFS driver. + +obj-$(CONFIG_NTFS_FS) += ntfs.o + +ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ + index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ + unistr.o upcase.o + +ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o + +ccflags-y := -DNTFS_VERSION=\"2.1.32\" +ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG +ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW + diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c new file mode 100644 index 000000000..9364d35b4 --- /dev/null +++ b/fs/ntfs/aops.c @@ -0,0 +1,1761 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * aops.c - NTFS kernel address space operations and page cache handling. + * + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2002 Richard Russon + */ + +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/buffer_head.h> +#include <linux/writeback.h> +#include <linux/bit_spinlock.h> +#include <linux/bio.h> + +#include "aops.h" +#include "attrib.h" +#include "debug.h" +#include "inode.h" +#include "mft.h" +#include "runlist.h" +#include "types.h" +#include "ntfs.h" + +/** + * ntfs_end_buffer_async_read - async io completion for reading attributes + * @bh: buffer head on which io is completed + * @uptodate: whether @bh is now uptodate or not + * + * Asynchronous I/O completion handler for reading pages belonging to the + * attribute address space of an inode. The inodes can either be files or + * directories or they can be fake inodes describing some attribute. + * + * If NInoMstProtected(), perform the post read mst fixups when all IO on the + * page has been completed and mark the page uptodate or set the error bit on + * the page. To determine the size of the records that need fixing up, we + * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs + * record size, and index_block_size_bits, to the log(base 2) of the ntfs + * record size. + */ +static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) +{ + unsigned long flags; + struct buffer_head *first, *tmp; + struct page *page; + struct inode *vi; + ntfs_inode *ni; + int page_uptodate = 1; + + page = bh->b_page; + vi = page->mapping->host; + ni = NTFS_I(vi); + + if (likely(uptodate)) { + loff_t i_size; + s64 file_ofs, init_size; + + set_buffer_uptodate(bh); + + file_ofs = ((s64)page->index << PAGE_SHIFT) + + bh_offset(bh); + read_lock_irqsave(&ni->size_lock, flags); + init_size = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(init_size > i_size)) { + /* Race with shrinking truncate. */ + init_size = i_size; + } + /* Check for the current buffer head overflowing. */ + if (unlikely(file_ofs + bh->b_size > init_size)) { + int ofs; + void *kaddr; + + ofs = 0; + if (file_ofs < init_size) + ofs = init_size - file_ofs; + kaddr = kmap_atomic(page); + memset(kaddr + bh_offset(bh) + ofs, 0, + bh->b_size - ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr); + } + } else { + clear_buffer_uptodate(bh); + SetPageError(page); + ntfs_error(ni->vol->sb, "Buffer I/O error, logical block " + "0x%llx.", (unsigned long long)bh->b_blocknr); + } + first = page_buffers(page); + spin_lock_irqsave(&first->b_uptodate_lock, flags); + clear_buffer_async_read(bh); + unlock_buffer(bh); + tmp = bh; + do { + if (!buffer_uptodate(tmp)) + page_uptodate = 0; + if (buffer_async_read(tmp)) { + if (likely(buffer_locked(tmp))) + goto still_busy; + /* Async buffers must be locked. */ + BUG(); + } + tmp = tmp->b_this_page; + } while (tmp != bh); + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + /* + * If none of the buffers had errors then we can set the page uptodate, + * but we first have to perform the post read mst fixups, if the + * attribute is mst protected, i.e. if NInoMstProteced(ni) is true. + * Note we ignore fixup errors as those are detected when + * map_mft_record() is called which gives us per record granularity + * rather than per page granularity. + */ + if (!NInoMstProtected(ni)) { + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); + } else { + u8 *kaddr; + unsigned int i, recs; + u32 rec_size; + + rec_size = ni->itype.index.block_size; + recs = PAGE_SIZE / rec_size; + /* Should have been verified before we got here... */ + BUG_ON(!recs); + kaddr = kmap_atomic(page); + for (i = 0; i < recs; i++) + post_read_mst_fixup((NTFS_RECORD*)(kaddr + + i * rec_size), rec_size); + kunmap_atomic(kaddr); + flush_dcache_page(page); + if (likely(page_uptodate && !PageError(page))) + SetPageUptodate(page); + } + unlock_page(page); + return; +still_busy: + spin_unlock_irqrestore(&first->b_uptodate_lock, flags); + return; +} + +/** + * ntfs_read_block - fill a @page of an address space with data + * @page: page cache page to fill with data + * + * Fill the page @page of the address space belonging to the @page->host inode. + * We read each buffer asynchronously and when all buffers are read in, our io + * completion handler ntfs_end_buffer_read_async(), if required, automatically + * applies the mst fixups to the page before finally marking it uptodate and + * unlocking it. + * + * We only enforce allocated_size limit because i_size is checked for in + * generic_file_read(). + * + * Return 0 on success and -errno on error. + * + * Contains an adapted version of fs/buffer.c::block_read_full_folio(). + */ +static int ntfs_read_block(struct page *page) +{ + loff_t i_size; + VCN vcn; + LCN lcn; + s64 init_size; + struct inode *vi; + ntfs_inode *ni; + ntfs_volume *vol; + runlist_element *rl; + struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + sector_t iblock, lblock, zblock; + unsigned long flags; + unsigned int blocksize, vcn_ofs; + int i, nr; + unsigned char blocksize_bits; + + vi = page->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + + /* $MFT/$DATA must have its complete runlist in memory at all times. */ + BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); + + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, 0); + if (unlikely(!page_has_buffers(page))) { + unlock_page(page); + return -ENOMEM; + } + } + bh = head = page_buffers(page); + BUG_ON(!bh); + + /* + * We may be racing with truncate. To avoid some of the problems we + * now take a snapshot of the various sizes and use those for the whole + * of the function. In case of an extending truncate it just means we + * may leave some buffers unmapped which are now allocated. This is + * not a problem since these buffers will just get mapped when a write + * occurs. In case of a shrinking truncate, we will detect this later + * on due to the runlist being incomplete and if the page is being + * fully truncated, truncate will throw it away as soon as we unlock + * it so no need to worry what we do with it. + */ + iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits); + read_lock_irqsave(&ni->size_lock, flags); + lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; + init_size = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(init_size > i_size)) { + /* Race with shrinking truncate. */ + init_size = i_size; + } + zblock = (init_size + blocksize - 1) >> blocksize_bits; + + /* Loop through all the buffers in the page. */ + rl = NULL; + nr = i = 0; + do { + int err = 0; + + if (unlikely(buffer_uptodate(bh))) + continue; + if (unlikely(buffer_mapped(bh))) { + arr[nr++] = bh; + continue; + } + bh->b_bdev = vol->sb->s_bdev; + /* Is the block within the allowed limits? */ + if (iblock < lblock) { + bool is_retry = false; + + /* Convert iblock into corresponding vcn and offset. */ + vcn = (VCN)iblock << blocksize_bits >> + vol->cluster_size_bits; + vcn_ofs = ((VCN)iblock << blocksize_bits) & + vol->cluster_size_mask; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (lcn >= 0) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + /* Only read initialized data blocks. */ + if (iblock < zblock) { + arr[nr++] = bh; + continue; + } + /* Fully non-initialized data block, zero it. */ + goto handle_zblock; + } + /* It is a hole, need to zero it. */ + if (lcn == LCN_HOLE) + goto handle_hole; + /* If first try and runlist unmapped, map and retry. */ + if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping lock for + * the duration. + */ + up_read(&ni->runlist.lock); + err = ntfs_map_runlist(ni, vcn); + if (likely(!err)) + goto lock_retry_remap; + rl = NULL; + } else if (!rl) + up_read(&ni->runlist.lock); + /* + * If buffer is outside the runlist, treat it as a + * hole. This can happen due to concurrent truncate + * for example. + */ + if (err == -ENOENT || lcn == LCN_ENOENT) { + err = 0; + goto handle_hole; + } + /* Hard error, zero out region. */ + if (!err) + err = -EIO; + bh->b_blocknr = -1; + SetPageError(page); + ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, " + "offset 0x%x because its location on " + "disk could not be determined%s " + "(error code %i).", ni->mft_no, + ni->type, (unsigned long long)vcn, + vcn_ofs, is_retry ? " even after " + "retrying" : "", err); + } + /* + * Either iblock was outside lblock limits or + * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion + * of the page and set the buffer uptodate. + */ +handle_hole: + bh->b_blocknr = -1UL; + clear_buffer_mapped(bh); +handle_zblock: + zero_user(page, i * blocksize, blocksize); + if (likely(!err)) + set_buffer_uptodate(bh); + } while (i++, iblock++, (bh = bh->b_this_page) != head); + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* Check we have at least one buffer ready for i/o. */ + if (nr) { + struct buffer_head *tbh; + + /* Lock the buffers. */ + for (i = 0; i < nr; i++) { + tbh = arr[i]; + lock_buffer(tbh); + tbh->b_end_io = ntfs_end_buffer_async_read; + set_buffer_async_read(tbh); + } + /* Finally, start i/o on the buffers. */ + for (i = 0; i < nr; i++) { + tbh = arr[i]; + if (likely(!buffer_uptodate(tbh))) + submit_bh(REQ_OP_READ, tbh); + else + ntfs_end_buffer_async_read(tbh, 1); + } + return 0; + } + /* No i/o was scheduled on any of the buffers. */ + if (likely(!PageError(page))) + SetPageUptodate(page); + else /* Signal synchronous i/o error. */ + nr = -EIO; + unlock_page(page); + return nr; +} + +/** + * ntfs_read_folio - fill a @folio of a @file with data from the device + * @file: open file to which the folio @folio belongs or NULL + * @folio: page cache folio to fill with data + * + * For non-resident attributes, ntfs_read_folio() fills the @folio of the open + * file @file by calling the ntfs version of the generic block_read_full_folio() + * function, ntfs_read_block(), which in turn creates and reads in the buffers + * associated with the folio asynchronously. + * + * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the + * data from the mft record (which at this stage is most likely in memory) and + * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as + * even if the mft record is not cached at this point in time, we need to wait + * for it to be read in before we can do the copy. + * + * Return 0 on success and -errno on error. + */ +static int ntfs_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + loff_t i_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + u8 *addr; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *mrec; + unsigned long flags; + u32 attr_len; + int err = 0; + +retry_readpage: + BUG_ON(!PageLocked(page)); + vi = page->mapping->host; + i_size = i_size_read(vi); + /* Is the page fully outside i_size? (truncate in progress) */ + if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> + PAGE_SHIFT)) { + zero_user(page, 0, PAGE_SIZE); + ntfs_debug("Read outside i_size - truncated?"); + goto done; + } + /* + * This can potentially happen because we clear PageUptodate() during + * ntfs_writepage() of MstProtected() attributes. + */ + if (PageUptodate(page)) { + unlock_page(page); + return 0; + } + ni = NTFS_I(vi); + /* + * Only $DATA attributes can be encrypted and only unnamed $DATA + * attributes can be compressed. Index root can have the flags set but + * this means to create compressed/encrypted files, not that the + * attribute is compressed/encrypted. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If attribute is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + BUG_ON(ni->type != AT_DATA); + err = -EACCES; + goto err_out; + } + /* Compressed data streams are handled in compress.c. */ + if (NInoNonResident(ni) && NInoCompressed(ni)) { + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + return ntfs_read_compressed_block(page); + } + } + /* NInoNonResident() == NInoIndexAllocPresent() */ + if (NInoNonResident(ni)) { + /* Normal, non-resident data stream. */ + return ntfs_read_block(page); + } + /* + * Attribute is resident, implying it is not compressed or encrypted. + * This also means the attribute is smaller than an mft record and + * hence smaller than a page, so can simply zero out any pages with + * index above 0. Note the attribute can actually be marked compressed + * but if it is resident the actual data is not compressed so we are + * ok to ignore the compressed flag here. + */ + if (unlikely(page->index > 0)) { + zero_user(page, 0, PAGE_SIZE); + goto done; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + mrec = map_mft_record(base_ni); + if (IS_ERR(mrec)) { + err = PTR_ERR(mrec); + goto err_out; + } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the read_folio. + */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_readpage; + } + ctx = ntfs_attr_get_search_ctx(base_ni, mrec); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto put_unm_err_out; + attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); + read_lock_irqsave(&ni->size_lock, flags); + if (unlikely(attr_len > ni->initialized_size)) + attr_len = ni->initialized_size; + i_size = i_size_read(vi); + read_unlock_irqrestore(&ni->size_lock, flags); + if (unlikely(attr_len > i_size)) { + /* Race with shrinking truncate. */ + attr_len = i_size; + } + addr = kmap_atomic(page); + /* Copy the data to the page. */ + memcpy(addr, (u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), + attr_len); + /* Zero the remainder of the page. */ + memset(addr + attr_len, 0, PAGE_SIZE - attr_len); + flush_dcache_page(page); + kunmap_atomic(addr); +put_unm_err_out: + ntfs_attr_put_search_ctx(ctx); +unm_err_out: + unmap_mft_record(base_ni); +done: + SetPageUptodate(page); +err_out: + unlock_page(page); + return err; +} + +#ifdef NTFS_RW + +/** + * ntfs_write_block - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This function is for writing pages belonging to non-resident, non-mst + * protected attributes to their backing store. + * + * For a page with buffers, map and write the dirty buffers asynchronously + * under page writeback. For a page without buffers, create buffers for the + * page, then proceed as above. + * + * If a page doesn't have buffers the page dirty state is definitive. If a page + * does have buffers, the page dirty state is just a hint, and the buffer dirty + * state is definitive. (A hint which has rules: dirty buffers against a clean + * page is illegal. Other combinations are legal and need to be handled. In + * particular a dirty page containing clean buffers for example.) + * + * Return 0 on success and -errno on error. + * + * Based on ntfs_read_block() and __block_write_full_page(). + */ +static int ntfs_write_block(struct page *page, struct writeback_control *wbc) +{ + VCN vcn; + LCN lcn; + s64 initialized_size; + loff_t i_size; + sector_t block, dblock, iblock; + struct inode *vi; + ntfs_inode *ni; + ntfs_volume *vol; + runlist_element *rl; + struct buffer_head *bh, *head; + unsigned long flags; + unsigned int blocksize, vcn_ofs; + int err; + bool need_end_writeback; + unsigned char blocksize_bits; + + vi = page->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", ni->mft_no, ni->type, page->index); + + BUG_ON(!NInoNonResident(ni)); + BUG_ON(NInoMstProtected(ni)); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + if (!page_has_buffers(page)) { + BUG_ON(!PageUptodate(page)); + create_empty_buffers(page, blocksize, + (1 << BH_Uptodate) | (1 << BH_Dirty)); + if (unlikely(!page_has_buffers(page))) { + ntfs_warning(vol->sb, "Error allocating page " + "buffers. Redirtying page so we try " + "again later."); + /* + * Put the page back on mapping->dirty_pages, but leave + * its buffers' dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + } + bh = head = page_buffers(page); + BUG_ON(!bh); + + /* NOTE: Different naming scheme to ntfs_read_block()! */ + + /* The first block in the page. */ + block = (s64)page->index << (PAGE_SHIFT - blocksize_bits); + + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(vi); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + + /* The first out of bounds block for the data size. */ + dblock = (i_size + blocksize - 1) >> blocksize_bits; + + /* The last (fully or partially) initialized block. */ + iblock = initialized_size >> blocksize_bits; + + /* + * Be very careful. We have no exclusion from block_dirty_folio + * here, and the (potentially unmapped) buffers may become dirty at + * any time. If a buffer becomes dirty here after we've inspected it + * then we just miss that fact, and the page stays dirty. + * + * Buffers outside i_size may be dirtied by block_dirty_folio; + * handle that here by just cleaning them. + */ + + /* + * Loop through all the buffers in the page, mapping all the dirty + * buffers to disk addresses and handling any aliases from the + * underlying block device's mapping. + */ + rl = NULL; + err = 0; + do { + bool is_retry = false; + + if (unlikely(block >= dblock)) { + /* + * Mapped buffers outside i_size will occur, because + * this page can be outside i_size when there is a + * truncate in progress. The contents of such buffers + * were zeroed by ntfs_writepage(). + * + * FIXME: What about the small race window where + * ntfs_writepage() has not done any clearing because + * the page was within i_size but before we get here, + * vmtruncate() modifies i_size? + */ + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + + /* Clean buffers are not written out, so no need to map them. */ + if (!buffer_dirty(bh)) + continue; + + /* Make sure we have enough initialized size. */ + if (unlikely((block >= iblock) && + (initialized_size < i_size))) { + /* + * If this page is fully outside initialized + * size, zero out all pages between the current + * initialized size and the current page. Just + * use ntfs_read_folio() to do the zeroing + * transparently. + */ + if (block > iblock) { + // TODO: + // For each page do: + // - read_cache_page() + // Again for each page do: + // - wait_on_page_locked() + // - Check (PageUptodate(page) && + // !PageError(page)) + // Update initialized size in the attribute and + // in the inode. + // Again, for each page do: + // block_dirty_folio(); + // put_page() + // We don't need to wait on the writes. + // Update iblock. + } + /* + * The current page straddles initialized size. Zero + * all non-uptodate buffers and set them uptodate (and + * dirty?). Note, there aren't any non-uptodate buffers + * if the page is uptodate. + * FIXME: For an uptodate page, the buffers may need to + * be written out because they were not initialized on + * disk before. + */ + if (!PageUptodate(page)) { + // TODO: + // Zero any non-uptodate buffers up to i_size. + // Set them uptodate and dirty. + } + // TODO: + // Update initialized size in the attribute and in the + // inode (up to i_size). + // Update iblock. + // FIXME: This is inefficient. Try to batch the two + // size changes to happen in one go. + ntfs_error(vol->sb, "Writing beyond initialized size " + "is not supported yet. Sorry."); + err = -EOPNOTSUPP; + break; + // Do NOT set_buffer_new() BUT DO clear buffer range + // outside write request range. + // set_buffer_uptodate() on complete buffers as well as + // set_buffer_dirty(). + } + + /* No need to map buffers that are already mapped. */ + if (buffer_mapped(bh)) + continue; + + /* Unmapped, dirty buffer. Need to map it. */ + bh->b_bdev = vol->sb->s_bdev; + + /* Convert block into corresponding vcn and offset. */ + vcn = (VCN)block << blocksize_bits; + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (lcn >= 0) { + /* Setup buffer head to point to correct block. */ + bh->b_blocknr = ((lcn << vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + continue; + } + /* It is a hole, need to instantiate it. */ + if (lcn == LCN_HOLE) { + u8 *kaddr; + unsigned long *bpos, *bend; + + /* Check if the buffer is zero. */ + kaddr = kmap_atomic(page); + bpos = (unsigned long *)(kaddr + bh_offset(bh)); + bend = (unsigned long *)((u8*)bpos + blocksize); + do { + if (unlikely(*bpos)) + break; + } while (likely(++bpos < bend)); + kunmap_atomic(kaddr); + if (bpos == bend) { + /* + * Buffer is zero and sparse, no need to write + * it. + */ + bh->b_blocknr = -1; + clear_buffer_dirty(bh); + continue; + } + // TODO: Instantiate the hole. + // clear_buffer_new(bh); + // clean_bdev_bh_alias(bh); + ntfs_error(vol->sb, "Writing into sparse regions is " + "not supported yet. Sorry."); + err = -EOPNOTSUPP; + break; + } + /* If first try and runlist unmapped, map and retry. */ + if (!is_retry && lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping lock for + * the duration. + */ + up_read(&ni->runlist.lock); + err = ntfs_map_runlist(ni, vcn); + if (likely(!err)) + goto lock_retry_remap; + rl = NULL; + } else if (!rl) + up_read(&ni->runlist.lock); + /* + * If buffer is outside the runlist, truncate has cut it out + * of the runlist. Just clean and clear the buffer and set it + * uptodate so it can get discarded by the VM. + */ + if (err == -ENOENT || lcn == LCN_ENOENT) { + bh->b_blocknr = -1; + clear_buffer_dirty(bh); + zero_user(page, bh_offset(bh), blocksize); + set_buffer_uptodate(bh); + err = 0; + continue; + } + /* Failed to map the buffer, even after retrying. */ + if (!err) + err = -EIO; + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, offset 0x%x " + "because its location on disk could not be " + "determined%s (error code %i).", ni->mft_no, + ni->type, (unsigned long long)vcn, + vcn_ofs, is_retry ? " even after " + "retrying" : "", err); + break; + } while (block++, (bh = bh->b_this_page) != head); + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* For the error case, need to reset bh to the beginning. */ + bh = head; + + /* Just an optimization, so ->read_folio() is not called later. */ + if (unlikely(!PageUptodate(page))) { + int uptodate = 1; + do { + if (!buffer_uptodate(bh)) { + uptodate = 0; + bh = head; + break; + } + } while ((bh = bh->b_this_page) != head); + if (uptodate) + SetPageUptodate(page); + } + + /* Setup all mapped, dirty buffers for async write i/o. */ + do { + if (buffer_mapped(bh) && buffer_dirty(bh)) { + lock_buffer(bh); + if (test_clear_buffer_dirty(bh)) { + BUG_ON(!buffer_uptodate(bh)); + mark_buffer_async_write(bh); + } else + unlock_buffer(bh); + } else if (unlikely(err)) { + /* + * For the error case. The buffer may have been set + * dirty during attachment to a dirty page. + */ + if (err != -ENOMEM) + clear_buffer_dirty(bh); + } + } while ((bh = bh->b_this_page) != head); + + if (unlikely(err)) { + // TODO: Remove the -EOPNOTSUPP check later on... + if (unlikely(err == -EOPNOTSUPP)) + err = 0; + else if (err == -ENOMEM) { + ntfs_warning(vol->sb, "Error allocating memory. " + "Redirtying page so we try again " + "later."); + /* + * Put the page back on mapping->dirty_pages, but + * leave its buffer's dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + err = 0; + } else + SetPageError(page); + } + + BUG_ON(PageWriteback(page)); + set_page_writeback(page); /* Keeps try_to_free_buffers() away. */ + + /* Submit the prepared buffers for i/o. */ + need_end_writeback = true; + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(REQ_OP_WRITE, bh); + need_end_writeback = false; + } + bh = next; + } while (bh != head); + unlock_page(page); + + /* If no i/o was started, need to end_page_writeback(). */ + if (unlikely(need_end_writeback)) + end_page_writeback(page); + + ntfs_debug("Done."); + return err; +} + +/** + * ntfs_write_mst_block - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This function is for writing pages belonging to non-resident, mst protected + * attributes to their backing store. The only supported attributes are index + * allocation and $MFT/$DATA. Both directory inodes and index inodes are + * supported for the index allocation case. + * + * The page must remain locked for the duration of the write because we apply + * the mst fixups, write, and then undo the fixups, so if we were to unlock the + * page before undoing the fixups, any other user of the page will see the + * page contents as corrupt. + * + * We clear the page uptodate flag for the duration of the function to ensure + * exclusion for the $MFT/$DATA case against someone mapping an mft record we + * are about to apply the mst fixups to. + * + * Return 0 on success and -errno on error. + * + * Based on ntfs_write_block(), ntfs_mft_writepage(), and + * write_mft_record_nolock(). + */ +static int ntfs_write_mst_block(struct page *page, + struct writeback_control *wbc) +{ + sector_t block, dblock, rec_block; + struct inode *vi = page->mapping->host; + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + u8 *kaddr; + unsigned int rec_size = ni->itype.index.block_size; + ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE]; + struct buffer_head *bh, *head, *tbh, *rec_start_bh; + struct buffer_head *bhs[MAX_BUF_PER_PAGE]; + runlist_element *rl; + int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2; + unsigned bh_size, rec_size_bits; + bool sync, is_mft, page_is_dirty, rec_is_dirty; + unsigned char bh_size_bits; + + if (WARN_ON(rec_size < NTFS_BLOCK_SIZE)) + return -EINVAL; + + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " + "0x%lx.", vi->i_ino, ni->type, page->index); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(!NInoMstProtected(ni)); + is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino); + /* + * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page + * in its page cache were to be marked dirty. However this should + * never happen with the current driver and considering we do not + * handle this case here we do want to BUG(), at least for now. + */ + BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || + (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); + bh_size = vol->sb->s_blocksize; + bh_size_bits = vol->sb->s_blocksize_bits; + max_bhs = PAGE_SIZE / bh_size; + BUG_ON(!max_bhs); + BUG_ON(max_bhs > MAX_BUF_PER_PAGE); + + /* Were we called for sync purposes? */ + sync = (wbc->sync_mode == WB_SYNC_ALL); + + /* Make sure we have mapped buffers. */ + bh = head = page_buffers(page); + BUG_ON(!bh); + + rec_size_bits = ni->itype.index.block_size_bits; + BUG_ON(!(PAGE_SIZE >> rec_size_bits)); + bhs_per_rec = rec_size >> bh_size_bits; + BUG_ON(!bhs_per_rec); + + /* The first block in the page. */ + rec_block = block = (sector_t)page->index << + (PAGE_SHIFT - bh_size_bits); + + /* The first out of bounds block for the data size. */ + dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits; + + rl = NULL; + err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0; + page_is_dirty = rec_is_dirty = false; + rec_start_bh = NULL; + do { + bool is_retry = false; + + if (likely(block < rec_block)) { + if (unlikely(block >= dblock)) { + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + continue; + } + /* + * This block is not the first one in the record. We + * ignore the buffer's dirty state because we could + * have raced with a parallel mark_ntfs_record_dirty(). + */ + if (!rec_is_dirty) + continue; + if (unlikely(err2)) { + if (err2 != -ENOMEM) + clear_buffer_dirty(bh); + continue; + } + } else /* if (block == rec_block) */ { + BUG_ON(block > rec_block); + /* This block is the first one in the record. */ + rec_block += bhs_per_rec; + err2 = 0; + if (unlikely(block >= dblock)) { + clear_buffer_dirty(bh); + continue; + } + if (!buffer_dirty(bh)) { + /* Clean records are not written out. */ + rec_is_dirty = false; + continue; + } + rec_is_dirty = true; + rec_start_bh = bh; + } + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = (VCN)block << bh_size_bits; + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + /* Successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> bh_size_bits; + set_buffer_mapped(bh); + } else { + /* + * Remap failed. Retry to map the runlist once + * unless we are working on $MFT which always + * has the whole of its runlist in memory. + */ + if (!is_mft && !is_retry && + lcn == LCN_RL_NOT_MAPPED) { + is_retry = true; + /* + * Attempt to map runlist, dropping + * lock for the duration. + */ + up_read(&ni->runlist.lock); + err2 = ntfs_map_runlist(ni, vcn); + if (likely(!err2)) + goto lock_retry_remap; + if (err2 == -ENOMEM) + page_is_dirty = true; + lcn = err2; + } else { + err2 = -EIO; + if (!rl) + up_read(&ni->runlist.lock); + } + /* Hard error. Abort writing this record. */ + if (!err || err == -ENOMEM) + err = err2; + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write ntfs record " + "0x%llx (inode 0x%lx, " + "attribute type 0x%x) because " + "its location on disk could " + "not be determined (error " + "code %lli).", + (long long)block << + bh_size_bits >> + vol->mft_record_size_bits, + ni->mft_no, ni->type, + (long long)lcn); + /* + * If this is not the first buffer, remove the + * buffers in this record from the list of + * buffers to write and clear their dirty bit + * if not error -ENOMEM. + */ + if (rec_start_bh != bh) { + while (bhs[--nr_bhs] != rec_start_bh) + ; + if (err2 != -ENOMEM) { + do { + clear_buffer_dirty( + rec_start_bh); + } while ((rec_start_bh = + rec_start_bh-> + b_this_page) != + bh); + } + } + continue; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + } while (block++, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&ni->runlist.lock); + /* If there were no dirty buffers, we are done. */ + if (!nr_bhs) + goto done; + /* Map the page so we can access its contents. */ + kaddr = kmap(page); + /* Clear the page uptodate flag whilst the mst fixups are applied. */ + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + for (i = 0; i < nr_bhs; i++) { + unsigned int ofs; + + /* Skip buffers which are not at the beginning of records. */ + if (i % bhs_per_rec) + continue; + tbh = bhs[i]; + ofs = bh_offset(tbh); + if (is_mft) { + ntfs_inode *tni; + unsigned long mft_no; + + /* Get the mft record number. */ + mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) + >> rec_size_bits; + /* Check whether to write this mft record. */ + tni = NULL; + if (!ntfs_may_write_mft_record(vol, mft_no, + (MFT_RECORD*)(kaddr + ofs), &tni)) { + /* + * The record should not be written. This + * means we need to redirty the page before + * returning. + */ + page_is_dirty = true; + /* + * Remove the buffers in this mft record from + * the list of buffers to write. + */ + do { + bhs[i] = NULL; + } while (++i % bhs_per_rec); + continue; + } + /* + * The record should be written. If a locked ntfs + * inode was returned, add it to the array of locked + * ntfs inodes. + */ + if (tni) + locked_nis[nr_locked_nis++] = tni; + } + /* Apply the mst protection fixups. */ + err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs), + rec_size); + if (unlikely(err2)) { + if (!err || err == -ENOMEM) + err = -EIO; + ntfs_error(vol->sb, "Failed to apply mst fixups " + "(inode 0x%lx, attribute type 0x%x, " + "page index 0x%lx, page offset 0x%x)!" + " Unmount and run chkdsk.", vi->i_ino, + ni->type, page->index, ofs); + /* + * Mark all the buffers in this record clean as we do + * not want to write corrupt data to disk. + */ + do { + clear_buffer_dirty(bhs[i]); + bhs[i] = NULL; + } while (++i % bhs_per_rec); + continue; + } + nr_recs++; + } + /* If no records are to be written out, we are done. */ + if (!nr_recs) + goto unm_done; + flush_dcache_page(page); + /* Lock buffers and start synchronous write i/o on them. */ + for (i = 0; i < nr_bhs; i++) { + tbh = bhs[i]; + if (!tbh) + continue; + if (!trylock_buffer(tbh)) + BUG(); + /* The buffer dirty state is now irrelevant, just clean it. */ + clear_buffer_dirty(tbh); + BUG_ON(!buffer_uptodate(tbh)); + BUG_ON(!buffer_mapped(tbh)); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (is_mft && !sync) + goto do_mirror; +do_wait: + /* Wait on i/o completion of buffers. */ + for (i = 0; i < nr_bhs; i++) { + tbh = bhs[i]; + if (!tbh) + continue; + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + ntfs_error(vol->sb, "I/O error while writing ntfs " + "record buffer (inode 0x%lx, " + "attribute type 0x%x, page index " + "0x%lx, page offset 0x%lx)! Unmount " + "and run chkdsk.", vi->i_ino, ni->type, + page->index, bh_offset(tbh)); + if (!err || err == -ENOMEM) + err = -EIO; + /* + * Set the buffer uptodate so the page and buffer + * states do not become out of sync. + */ + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (is_mft && sync) { +do_mirror: + for (i = 0; i < nr_bhs; i++) { + unsigned long mft_no; + unsigned int ofs; + + /* + * Skip buffers which are not at the beginning of + * records. + */ + if (i % bhs_per_rec) + continue; + tbh = bhs[i]; + /* Skip removed buffers (and hence records). */ + if (!tbh) + continue; + ofs = bh_offset(tbh); + /* Get the mft record number. */ + mft_no = (((s64)page->index << PAGE_SHIFT) + ofs) + >> rec_size_bits; + if (mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, mft_no, + (MFT_RECORD*)(kaddr + ofs), + sync); + } + if (!sync) + goto do_wait; + } + /* Remove the mst protection fixups again. */ + for (i = 0; i < nr_bhs; i++) { + if (!(i % bhs_per_rec)) { + tbh = bhs[i]; + if (!tbh) + continue; + post_write_mst_fixup((NTFS_RECORD*)(kaddr + + bh_offset(tbh))); + } + } + flush_dcache_page(page); +unm_done: + /* Unlock any locked inodes. */ + while (nr_locked_nis-- > 0) { + ntfs_inode *tni, *base_tni; + + tni = locked_nis[nr_locked_nis]; + /* Get the base inode. */ + mutex_lock(&tni->extent_lock); + if (tni->nr_extents >= 0) + base_tni = tni; + else { + base_tni = tni->ext.base_ntfs_ino; + BUG_ON(!base_tni); + } + mutex_unlock(&tni->extent_lock); + ntfs_debug("Unlocking %s inode 0x%lx.", + tni == base_tni ? "base" : "extent", + tni->mft_no); + mutex_unlock(&tni->mrec_lock); + atomic_dec(&tni->count); + iput(VFS_I(base_tni)); + } + SetPageUptodate(page); + kunmap(page); +done: + if (unlikely(err && err != -ENOMEM)) { + /* + * Set page error if there is only one ntfs record in the page. + * Otherwise we would loose per-record granularity. + */ + if (ni->itype.index.block_size == PAGE_SIZE) + SetPageError(page); + NVolSetErrors(vol); + } + if (page_is_dirty) { + ntfs_debug("Page still contains one or more dirty ntfs " + "records. Redirtying the page starting at " + "record 0x%lx.", page->index << + (PAGE_SHIFT - rec_size_bits)); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + } else { + /* + * Keep the VM happy. This must be done otherwise the + * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though + * the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + } + if (likely(!err)) + ntfs_debug("Done."); + return err; +} + +/** + * ntfs_writepage - write a @page to the backing store + * @page: page cache page to write out + * @wbc: writeback control structure + * + * This is called from the VM when it wants to have a dirty ntfs page cache + * page cleaned. The VM has already locked the page and marked it clean. + * + * For non-resident attributes, ntfs_writepage() writes the @page by calling + * the ntfs version of the generic block_write_full_page() function, + * ntfs_write_block(), which in turn if necessary creates and writes the + * buffers associated with the page asynchronously. + * + * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying + * the data to the mft record (which at this stage is most likely in memory). + * The mft record is then marked dirty and written out asynchronously via the + * vfs inode dirty code path for the inode the mft record belongs to or via the + * vm page dirty code path for the page the mft record is in. + * + * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page(). + * + * Return 0 on success and -errno on error. + */ +static int ntfs_writepage(struct page *page, struct writeback_control *wbc) +{ + loff_t i_size; + struct inode *vi = page->mapping->host; + ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); + char *addr; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; + u32 attr_len; + int err; + +retry_writepage: + BUG_ON(!PageLocked(page)); + i_size = i_size_read(vi); + /* Is the page fully outside i_size? (truncate in progress) */ + if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> + PAGE_SHIFT)) { + struct folio *folio = page_folio(page); + /* + * The page may have dirty, unmapped buffers. Make them + * freeable here, so the page does not leak. + */ + block_invalidate_folio(folio, 0, folio_size(folio)); + folio_unlock(folio); + ntfs_debug("Write outside i_size - truncated?"); + return 0; + } + /* + * Only $DATA attributes can be encrypted and only unnamed $DATA + * attributes can be compressed. Index root can have the flags set but + * this means to create compressed/encrypted files, not that the + * attribute is compressed/encrypted. Note we need to check for + * AT_INDEX_ALLOCATION since this is the type of both directory and + * index inodes. + */ + if (ni->type != AT_INDEX_ALLOCATION) { + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + unlock_page(page); + BUG_ON(ni->type != AT_DATA); + ntfs_debug("Denying write access to encrypted file."); + return -EACCES; + } + /* Compressed data streams are handled in compress.c. */ + if (NInoNonResident(ni) && NInoCompressed(ni)) { + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + // TODO: Implement and replace this with + // return ntfs_write_compressed_block(page); + unlock_page(page); + ntfs_error(vi->i_sb, "Writing to compressed files is " + "not supported yet. Sorry."); + return -EOPNOTSUPP; + } + // TODO: Implement and remove this check. + if (NInoNonResident(ni) && NInoSparse(ni)) { + unlock_page(page); + ntfs_error(vi->i_sb, "Writing to sparse files is not " + "supported yet. Sorry."); + return -EOPNOTSUPP; + } + } + /* NInoNonResident() == NInoIndexAllocPresent() */ + if (NInoNonResident(ni)) { + /* We have to zero every time due to mmap-at-end-of-file. */ + if (page->index >= (i_size >> PAGE_SHIFT)) { + /* The page straddles i_size. */ + unsigned int ofs = i_size & ~PAGE_MASK; + zero_user_segment(page, ofs, PAGE_SIZE); + } + /* Handle mst protected attributes. */ + if (NInoMstProtected(ni)) + return ntfs_write_mst_block(page, wbc); + /* Normal, non-resident data stream. */ + return ntfs_write_block(page, wbc); + } + /* + * Attribute is resident, implying it is not compressed, encrypted, or + * mst protected. This also means the attribute is smaller than an mft + * record and hence smaller than a page, so can simply return error on + * any pages with index above 0. Note the attribute can actually be + * marked compressed but if it is resident the actual data is not + * compressed so we are ok to ignore the compressed flag here. + */ + BUG_ON(page_has_buffers(page)); + BUG_ON(!PageUptodate(page)); + if (unlikely(page->index > 0)) { + ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. " + "Aborting write.", page->index); + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + return -EIO; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + /* + * If a parallel write made the attribute non-resident, drop the mft + * record and retry the writepage. + */ + if (unlikely(NInoNonResident(ni))) { + unmap_mft_record(base_ni); + goto retry_writepage; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto err_out; + /* + * Keep the VM happy. This must be done otherwise the radix-tree tag + * PAGECACHE_TAG_DIRTY remains set even though the page is clean. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + unlock_page(page); + attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); + i_size = i_size_read(vi); + if (unlikely(attr_len > i_size)) { + /* Race with shrinking truncate or a failed truncate. */ + attr_len = i_size; + /* + * If the truncate failed, fix it up now. If a concurrent + * truncate, we do its job, so it does not have to do anything. + */ + err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr, + attr_len); + /* Shrinking cannot fail. */ + BUG_ON(err); + } + addr = kmap_atomic(page); + /* Copy the data from the page to the mft record. */ + memcpy((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), + addr, attr_len); + /* Zero out of bounds area in the page cache page. */ + memset(addr + attr_len, 0, PAGE_SIZE - attr_len); + kunmap_atomic(addr); + flush_dcache_page(page); + flush_dcache_mft_record_page(ctx->ntfs_ino); + /* We are done with the page. */ + end_page_writeback(page); + /* Finally, mark the mft record dirty, so it gets written back. */ + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + return 0; +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " + "page so we try again later."); + /* + * Put the page back on mapping->dirty_pages, but leave its + * buffers' dirty state as-is. + */ + redirty_page_for_writepage(wbc, page); + err = 0; + } else { + ntfs_error(vi->i_sb, "Resident attribute write failed with " + "error %i.", err); + SetPageError(page); + NVolSetErrors(ni->vol); + } + unlock_page(page); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + return err; +} + +#endif /* NTFS_RW */ + +/** + * ntfs_bmap - map logical file block to physical device block + * @mapping: address space mapping to which the block to be mapped belongs + * @block: logical block to map to its physical device block + * + * For regular, non-resident files (i.e. not compressed and not encrypted), map + * the logical @block belonging to the file described by the address space + * mapping @mapping to its physical device block. + * + * The size of the block is equal to the @s_blocksize field of the super block + * of the mounted file system which is guaranteed to be smaller than or equal + * to the cluster size thus the block is guaranteed to fit entirely inside the + * cluster which means we do not need to care how many contiguous bytes are + * available after the beginning of the block. + * + * Return the physical device block if the mapping succeeded or 0 if the block + * is sparse or there was an error. + * + * Note: This is a problem if someone tries to run bmap() on $Boot system file + * as that really is in block zero but there is nothing we can do. bmap() is + * just broken in that respect (just like it cannot distinguish sparse from + * not available or error). + */ +static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) +{ + s64 ofs, size; + loff_t i_size; + LCN lcn; + unsigned long blocksize, flags; + ntfs_inode *ni = NTFS_I(mapping->host); + ntfs_volume *vol = ni->vol; + unsigned delta; + unsigned char blocksize_bits, cluster_size_shift; + + ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.", + ni->mft_no, (unsigned long long)block); + if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) { + ntfs_error(vol->sb, "BMAP does not make sense for %s " + "attributes, returning 0.", + (ni->type != AT_DATA) ? "non-data" : + (!NInoNonResident(ni) ? "resident" : + "encrypted")); + return 0; + } + /* None of these can happen. */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoMstProtected(ni)); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + ofs = (s64)block << blocksize_bits; + read_lock_irqsave(&ni->size_lock, flags); + size = ni->initialized_size; + i_size = i_size_read(VFS_I(ni)); + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * If the offset is outside the initialized size or the block straddles + * the initialized size then pretend it is a hole unless the + * initialized size equals the file size. + */ + if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size))) + goto hole; + cluster_size_shift = vol->cluster_size_bits; + down_read(&ni->runlist.lock); + lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false); + up_read(&ni->runlist.lock); + if (unlikely(lcn < LCN_HOLE)) { + /* + * Step down to an integer to avoid gcc doing a long long + * comparision in the switch when we know @lcn is between + * LCN_HOLE and LCN_EIO (i.e. -1 to -5). + * + * Otherwise older gcc (at least on some architectures) will + * try to use __cmpdi2() which is of course not available in + * the kernel. + */ + switch ((int)lcn) { + case LCN_ENOENT: + /* + * If the offset is out of bounds then pretend it is a + * hole. + */ + goto hole; + case LCN_ENOMEM: + ntfs_error(vol->sb, "Not enough memory to complete " + "mapping for inode 0x%lx. " + "Returning 0.", ni->mft_no); + break; + default: + ntfs_error(vol->sb, "Failed to complete mapping for " + "inode 0x%lx. Run chkdsk. " + "Returning 0.", ni->mft_no); + break; + } + return 0; + } + if (lcn < 0) { + /* It is a hole. */ +hole: + ntfs_debug("Done (returning hole)."); + return 0; + } + /* + * The block is really allocated and fullfils all our criteria. + * Convert the cluster to units of block size and return the result. + */ + delta = ofs & vol->cluster_size_mask; + if (unlikely(sizeof(block) < sizeof(lcn))) { + block = lcn = ((lcn << cluster_size_shift) + delta) >> + blocksize_bits; + /* If the block number was truncated return 0. */ + if (unlikely(block != lcn)) { + ntfs_error(vol->sb, "Physical block 0x%llx is too " + "large to be returned, returning 0.", + (long long)lcn); + return 0; + } + } else + block = ((lcn << cluster_size_shift) + delta) >> + blocksize_bits; + ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn); + return block; +} + +/** + * ntfs_normal_aops - address space operations for normal inodes and attributes + * + * Note these are not used for compressed or mst protected inodes and + * attributes. + */ +const struct address_space_operations ntfs_normal_aops = { + .read_folio = ntfs_read_folio, +#ifdef NTFS_RW + .writepage = ntfs_writepage, + .dirty_folio = block_dirty_folio, +#endif /* NTFS_RW */ + .bmap = ntfs_bmap, + .migrate_folio = buffer_migrate_folio, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +/** + * ntfs_compressed_aops - address space operations for compressed inodes + */ +const struct address_space_operations ntfs_compressed_aops = { + .read_folio = ntfs_read_folio, +#ifdef NTFS_RW + .writepage = ntfs_writepage, + .dirty_folio = block_dirty_folio, +#endif /* NTFS_RW */ + .migrate_folio = buffer_migrate_folio, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +/** + * ntfs_mst_aops - general address space operations for mst protecteed inodes + * and attributes + */ +const struct address_space_operations ntfs_mst_aops = { + .read_folio = ntfs_read_folio, /* Fill page with data. */ +#ifdef NTFS_RW + .writepage = ntfs_writepage, /* Write dirty page to disk. */ + .dirty_folio = filemap_dirty_folio, +#endif /* NTFS_RW */ + .migrate_folio = buffer_migrate_folio, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +#ifdef NTFS_RW + +/** + * mark_ntfs_record_dirty - mark an ntfs record dirty + * @page: page containing the ntfs record to mark dirty + * @ofs: byte offset within @page at which the ntfs record begins + * + * Set the buffers and the page in which the ntfs record is located dirty. + * + * The latter also marks the vfs inode the ntfs record belongs to dirty + * (I_DIRTY_PAGES only). + * + * If the page does not have buffers, we create them and set them uptodate. + * The page may not be locked which is why we need to handle the buffers under + * the mapping->private_lock. Once the buffers are marked dirty we no longer + * need the lock since try_to_free_buffers() does not free dirty buffers. + */ +void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { + struct address_space *mapping = page->mapping; + ntfs_inode *ni = NTFS_I(mapping->host); + struct buffer_head *bh, *head, *buffers_to_free = NULL; + unsigned int end, bh_size, bh_ofs; + + BUG_ON(!PageUptodate(page)); + end = ofs + ni->itype.index.block_size; + bh_size = VFS_I(ni)->i_sb->s_blocksize; + spin_lock(&mapping->private_lock); + if (unlikely(!page_has_buffers(page))) { + spin_unlock(&mapping->private_lock); + bh = head = alloc_page_buffers(page, bh_size, true); + spin_lock(&mapping->private_lock); + if (likely(!page_has_buffers(page))) { + struct buffer_head *tail; + + do { + set_buffer_uptodate(bh); + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_private(page, head); + } else + buffers_to_free = bh; + } + bh = head = page_buffers(page); + BUG_ON(!bh); + do { + bh_ofs = bh_offset(bh); + if (bh_ofs + bh_size <= ofs) + continue; + if (unlikely(bh_ofs >= end)) + break; + set_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + spin_unlock(&mapping->private_lock); + filemap_dirty_folio(mapping, page_folio(page)); + if (unlikely(buffers_to_free)) { + do { + bh = buffers_to_free->b_this_page; + free_buffer_head(buffers_to_free); + buffers_to_free = bh; + } while (buffers_to_free); + } +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h new file mode 100644 index 000000000..0cac5458c --- /dev/null +++ b/fs/ntfs/aops.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/** + * aops.h - Defines for NTFS kernel address space operations and page cache + * handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_AOPS_H +#define _LINUX_NTFS_AOPS_H + +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/fs.h> + +#include "inode.h" + +/** + * ntfs_unmap_page - release a page that was mapped using ntfs_map_page() + * @page: the page to release + * + * Unpin, unmap and release a page that was obtained from ntfs_map_page(). + */ +static inline void ntfs_unmap_page(struct page *page) +{ + kunmap(page); + put_page(page); +} + +/** + * ntfs_map_page - map a page into accessible memory, reading it if necessary + * @mapping: address space for which to obtain the page + * @index: index into the page cache for @mapping of the page to map + * + * Read a page from the page cache of the address space @mapping at position + * @index, where @index is in units of PAGE_SIZE, and not in bytes. + * + * If the page is not in memory it is loaded from disk first using the + * read_folio method defined in the address space operations of @mapping + * and the page is added to the page cache of @mapping in the process. + * + * If the page belongs to an mst protected attribute and it is marked as such + * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no + * error checking is performed. This means the caller has to verify whether + * the ntfs record(s) contained in the page are valid or not using one of the + * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are + * expecting to see. (For details of the macros, see fs/ntfs/layout.h.) + * + * If the page is in high memory it is mapped into memory directly addressible + * by the kernel. + * + * Finally the page count is incremented, thus pinning the page into place. + * + * The above means that page_address(page) can be used on all pages obtained + * with ntfs_map_page() to get the kernel virtual address of the page. + * + * When finished with the page, the caller has to call ntfs_unmap_page() to + * unpin, unmap and release the page. + * + * Note this does not grant exclusive access. If such is desired, the caller + * must provide it independently of the ntfs_{un}map_page() calls by using + * a {rw_}semaphore or other means of serialization. A spin lock cannot be + * used as ntfs_map_page() can block. + * + * The unlocked and uptodate page is returned on success or an encoded error + * on failure. Caller has to test for error using the IS_ERR() macro on the + * return value. If that evaluates to 'true', the negative error code can be + * obtained using PTR_ERR() on the return value of ntfs_map_page(). + */ +static inline struct page *ntfs_map_page(struct address_space *mapping, + unsigned long index) +{ + struct page *page = read_mapping_page(mapping, index, NULL); + + if (!IS_ERR(page)) + kmap(page); + return page; +} + +#ifdef NTFS_RW + +extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_AOPS_H */ diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c new file mode 100644 index 000000000..a3865bc4a --- /dev/null +++ b/fs/ntfs/attrib.c @@ -0,0 +1,2624 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2002 Richard Russon + */ + +#include <linux/buffer_head.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/writeback.h> + +#include "attrib.h" +#include "debug.h" +#include "layout.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" +#include "types.h" + +/** + * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * @ctx: active attribute search context if present or NULL if not + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock() + * will perform the necessary mapping and unmapping. + * + * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and + * restores it before returning. Thus, @ctx will be left pointing to the same + * attribute on return as on entry. However, the actual pointers in @ctx may + * point to different memory locations on return, so you must remember to reset + * any cached pointers from the @ctx, i.e. after the call to + * ntfs_map_runlist_nolock(), you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Note the runlist can be NULL after this function returns if @vcn is zero and + * the attribute has zero allocated size, i.e. there simply is no runlist. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist will be modified. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx) +{ + VCN end_vcn; + unsigned long flags; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + runlist_element *rl; + struct page *put_this_page = NULL; + int err = 0; + bool ctx_is_temporary, ctx_needs_reset; + ntfs_attr_search_ctx old_ctx = { NULL, }; + + ntfs_debug("Mapping runlist part containing vcn 0x%llx.", + (unsigned long long)vcn); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + if (!ctx) { + ctx_is_temporary = ctx_needs_reset = true; + m = map_mft_record(base_ni); + if (IS_ERR(m)) + return PTR_ERR(m); + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + } else { + VCN allocated_size_vcn; + + BUG_ON(IS_ERR(ctx->mrec)); + a = ctx->attr; + BUG_ON(!a->non_resident); + ctx_is_temporary = false; + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + read_lock_irqsave(&ni->size_lock, flags); + allocated_size_vcn = ni->allocated_size >> + ni->vol->cluster_size_bits; + read_unlock_irqrestore(&ni->size_lock, flags); + if (!a->data.non_resident.lowest_vcn && end_vcn <= 0) + end_vcn = allocated_size_vcn - 1; + /* + * If we already have the attribute extent containing @vcn in + * @ctx, no need to look it up again. We slightly cheat in + * that if vcn exceeds the allocated size, we will refuse to + * map the runlist below, so there is definitely no need to get + * the right attribute extent. + */ + if (vcn >= allocated_size_vcn || (a->type == ni->type && + a->name_length == ni->name_len && + !memcmp((u8*)a + le16_to_cpu(a->name_offset), + ni->name, ni->name_len) && + sle64_to_cpu(a->data.non_resident.lowest_vcn) + <= vcn && end_vcn >= vcn)) + ctx_needs_reset = false; + else { + /* Save the old search context. */ + old_ctx = *ctx; + /* + * If the currently mapped (extent) inode is not the + * base inode we will unmap it when we reinitialize the + * search context which means we need to get a + * reference to the page containing the mapped mft + * record so we do not accidentally drop changes to the + * mft record when it has not been marked dirty yet. + */ + if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino != + old_ctx.base_ntfs_ino) { + put_this_page = old_ctx.ntfs_ino->page; + get_page(put_this_page); + } + /* + * Reinitialize the search context so we can lookup the + * needed attribute extent. + */ + ntfs_attr_reinit_search_ctx(ctx); + ctx_needs_reset = true; + } + } + if (ctx_needs_reset) { + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + BUG_ON(!ctx->attr->non_resident); + } + a = ctx->attr; + /* + * Only decompress the mapping pairs if @vcn is inside it. Otherwise + * we get into problems when we try to map an out of bounds vcn because + * we then try to map the already mapped runlist fragment and + * ntfs_mapping_pairs_decompress() fails. + */ + end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1; + if (unlikely(vcn && vcn >= end_vcn)) { + err = -ENOENT; + goto err_out; + } + rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl); + if (IS_ERR(rl)) + err = PTR_ERR(rl); + else + ni->runlist.rl = rl; +err_out: + if (ctx_is_temporary) { + if (likely(ctx)) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + } else if (ctx_needs_reset) { + /* + * If there is no attribute list, restoring the search context + * is accomplished simply by copying the saved context back over + * the caller supplied context. If there is an attribute list, + * things are more complicated as we need to deal with mapping + * of mft records and resulting potential changes in pointers. + */ + if (NInoAttrList(base_ni)) { + /* + * If the currently mapped (extent) inode is not the + * one we had before, we need to unmap it and map the + * old one. + */ + if (ctx->ntfs_ino != old_ctx.ntfs_ino) { + /* + * If the currently mapped inode is not the + * base inode, unmap it. + */ + if (ctx->base_ntfs_ino && ctx->ntfs_ino != + ctx->base_ntfs_ino) { + unmap_extent_mft_record(ctx->ntfs_ino); + ctx->mrec = ctx->base_mrec; + BUG_ON(!ctx->mrec); + } + /* + * If the old mapped inode is not the base + * inode, map it. + */ + if (old_ctx.base_ntfs_ino && + old_ctx.ntfs_ino != + old_ctx.base_ntfs_ino) { +retry_map: + ctx->mrec = map_mft_record( + old_ctx.ntfs_ino); + /* + * Something bad has happened. If out + * of memory retry till it succeeds. + * Any other errors are fatal and we + * return the error code in ctx->mrec. + * Let the caller deal with it... We + * just need to fudge things so the + * caller can reinit and/or put the + * search context safely. + */ + if (IS_ERR(ctx->mrec)) { + if (PTR_ERR(ctx->mrec) == + -ENOMEM) { + schedule(); + goto retry_map; + } else + old_ctx.ntfs_ino = + old_ctx. + base_ntfs_ino; + } + } + } + /* Update the changed pointers in the saved context. */ + if (ctx->mrec != old_ctx.mrec) { + if (!IS_ERR(ctx->mrec)) + old_ctx.attr = (ATTR_RECORD*)( + (u8*)ctx->mrec + + ((u8*)old_ctx.attr - + (u8*)old_ctx.mrec)); + old_ctx.mrec = ctx->mrec; + } + } + /* Restore the search context to the saved one. */ + *ctx = old_ctx; + /* + * We drop the reference on the page we took earlier. In the + * case that IS_ERR(ctx->mrec) is true this means we might lose + * some changes to the mft record that had been made between + * the last time it was marked dirty/written out and now. This + * at this stage is not a problem as the mapping error is fatal + * enough that the mft record cannot be written out anyway and + * the caller is very likely to shutdown the whole inode + * immediately and mark the volume dirty for chkdsk to pick up + * the pieces anyway. + */ + if (put_this_page) + put_page(put_this_page); + } + return err; +} + +/** + * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode + * @ni: ntfs inode for which to map (part of) a runlist + * @vcn: map runlist part containing this vcn + * + * Map the part of a runlist containing the @vcn of the ntfs inode @ni. + * + * Return 0 on success and -errno on error. There is one special error code + * which is not an error as such. This is -ENOENT. It means that @vcn is out + * of bounds of the runlist. + * + * Locking: - The runlist must be unlocked on entry and is unlocked on return. + * - This function takes the runlist lock for writing and may modify + * the runlist. + */ +int ntfs_map_runlist(ntfs_inode *ni, VCN vcn) +{ + int err = 0; + + down_write(&ni->runlist.lock); + /* Make sure someone else didn't do the work while we were sleeping. */ + if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <= + LCN_RL_NOT_MAPPED)) + err = ntfs_map_runlist_nolock(ni, vcn, NULL); + up_write(&ni->runlist.lock); + return err; +} + +/** + * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode + * @ni: ntfs inode of the attribute whose runlist to search + * @vcn: vcn to convert + * @write_locked: true if the runlist is locked for writing + * + * Find the virtual cluster number @vcn in the runlist of the ntfs attribute + * described by the ntfs inode @ni and return the corresponding logical cluster + * number (lcn). + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @write_locked is true the caller has locked the runlist for writing and + * if false for reading. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ========================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds. + * LCN_ENOMEM Not enough memory to map runlist. + * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc). + * + * Locking: - The runlist must be locked on entry and is left locked on return. + * - If @write_locked is 'false', i.e. the runlist is locked for reading, + * the lock may be dropped inside the function so you cannot rely on + * the runlist still being the same when this function returns. + */ +LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const bool write_locked) +{ + LCN lcn; + unsigned long flags; + bool is_retry = false; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.", + ni->mft_no, (unsigned long long)vcn, + write_locked ? "write" : "read"); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); + if (!ni->runlist.rl) { + read_lock_irqsave(&ni->size_lock, flags); + if (!ni->allocated_size) { + read_unlock_irqrestore(&ni->size_lock, flags); + return LCN_ENOENT; + } + read_unlock_irqrestore(&ni->size_lock, flags); + } +retry_remap: + /* Convert vcn to lcn. If that fails map the runlist and retry once. */ + lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn); + if (likely(lcn >= LCN_HOLE)) { + ntfs_debug("Done, lcn 0x%llx.", (long long)lcn); + return lcn; + } + if (lcn != LCN_RL_NOT_MAPPED) { + if (lcn != LCN_ENOENT) + lcn = LCN_EIO; + } else if (!is_retry) { + int err; + + if (!write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) != + LCN_RL_NOT_MAPPED)) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + goto retry_remap; + } + } + err = ntfs_map_runlist_nolock(ni, vcn, NULL); + if (!write_locked) { + up_write(&ni->runlist.lock); + down_read(&ni->runlist.lock); + } + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + if (err == -ENOENT) + lcn = LCN_ENOENT; + else if (err == -ENOMEM) + lcn = LCN_ENOMEM; + else + lcn = LCN_EIO; + } + if (lcn != LCN_ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %lli.", + (long long)lcn); + return lcn; +} + +/** + * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode + * @ni: ntfs inode describing the runlist to search + * @vcn: vcn to find + * @ctx: active attribute search context if present or NULL if not + * + * Find the virtual cluster number @vcn in the runlist described by the ntfs + * inode @ni and return the address of the runlist element containing the @vcn. + * + * If the @vcn is not mapped yet, the attempt is made to map the attribute + * extent containing the @vcn and the vcn to lcn conversion is retried. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock() + * will perform the necessary mapping and unmapping. + * + * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and + * restores it before returning. Thus, @ctx will be left pointing to the same + * attribute on return as on entry. However, the actual pointers in @ctx may + * point to different memory locations on return, so you must remember to reset + * any cached pointers from the @ctx, i.e. after the call to + * ntfs_attr_find_vcn_nolock(), you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * Note you need to distinguish between the lcn of the returned runlist element + * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on + * read and allocate clusters on write. + * + * Return the runlist element containing the @vcn on success and + * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR() + * to decide if the return is success or failure and PTR_ERR() to get to the + * error code if IS_ERR() is true. + * + * The possible error return codes are: + * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds. + * -ENOMEM - Not enough memory to map runlist. + * -EIO - Critical error (runlist/file is corrupt, i/o error, etc). + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn, + ntfs_attr_search_ctx *ctx) +{ + unsigned long flags; + runlist_element *rl; + int err = 0; + bool is_retry = false; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.", + ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out"); + BUG_ON(!NInoNonResident(ni)); + BUG_ON(vcn < 0); + if (!ni->runlist.rl) { + read_lock_irqsave(&ni->size_lock, flags); + if (!ni->allocated_size) { + read_unlock_irqrestore(&ni->size_lock, flags); + return ERR_PTR(-ENOENT); + } + read_unlock_irqrestore(&ni->size_lock, flags); + } +retry_remap: + rl = ni->runlist.rl; + if (likely(rl && vcn >= rl[0].vcn)) { + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) { + ntfs_debug("Done."); + return rl; + } + break; + } + rl++; + } + if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) { + if (likely(rl->lcn == LCN_ENOENT)) + err = -ENOENT; + else + err = -EIO; + } + } + if (!err && !is_retry) { + /* + * If the search context is invalid we cannot map the unmapped + * region. + */ + if (IS_ERR(ctx->mrec)) + err = PTR_ERR(ctx->mrec); + else { + /* + * The @vcn is in an unmapped region, map the runlist + * and retry. + */ + err = ntfs_map_runlist_nolock(ni, vcn, ctx); + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + } + if (err == -EINVAL) + err = -EIO; + } else if (!err) + err = -EIO; + if (err != -ENOENT) + ntfs_error(ni->vol->sb, "Failed with error code %i.", err); + return ERR_PTR(err); +} + +/** + * ntfs_attr_find - find (next) attribute in mft record + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * You should not need to call this function directly. Use ntfs_attr_lookup() + * instead. + * + * ntfs_attr_find() takes a search context @ctx as parameter and searches the + * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an + * attribute of @type, optionally @name and @val. + * + * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will + * point to the found attribute. + * + * If the attribute is not found, ntfs_attr_find() returns -ENOENT and + * @ctx->attr will point to the attribute before which the attribute being + * searched for would need to be inserted if such an action were to be desired. + * + * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is + * undefined and in particular do not rely on it not changing. + * + * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it + * is 'false', the search begins after @ctx->attr. + * + * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and + * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record + * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at + * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case + * sensitive. When @name is present, @name_len is the @name length in Unicode + * characters. + * + * If @name is not present (NULL), we assume that the unnamed attribute is + * being searched for. + * + * Finally, the resident attribute value @val is looked for, if present. If + * @val is not present (NULL), @val_len is ignored. + * + * ntfs_attr_find() only searches the specified mft record and it ignores the + * presence of an attribute list attribute (unless it is the one being searched + * for, obviously). If you need to take attribute lists into consideration, + * use ntfs_attr_lookup() instead (see below). This also means that you cannot + * use ntfs_attr_find() to search for extent records of non-resident + * attributes, as extents with lowest_vcn != 0 are usually described by the + * attribute list attribute only. - Note that it is possible that the first + * extent is only in the attribute list while the last extent is in the base + * mft record, so do not rely on being able to find the first extent in the + * base mft record. + * + * Warning: Never use @val when looking for attribute types which can be + * non-resident as this most likely will result in a crash! + */ +static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +{ + ATTR_RECORD *a; + ntfs_volume *vol = ctx->ntfs_ino->vol; + ntfschar *upcase = vol->upcase; + u32 upcase_len = vol->upcase_len; + + /* + * Iterate over attributes in mft record starting at @ctx->attr, or the + * attribute following that, if @ctx->is_first is 'true'. + */ + if (ctx->is_first) { + a = ctx->attr; + ctx->is_first = false; + } else + a = (ATTR_RECORD*)((u8*)ctx->attr + + le32_to_cpu(ctx->attr->length)); + for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) { + u8 *mrec_end = (u8 *)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated); + u8 *name_end; + + /* check whether ATTR_RECORD wrap */ + if ((u8 *)a < (u8 *)ctx->mrec) + break; + + /* check whether Attribute Record Header is within bounds */ + if ((u8 *)a > mrec_end || + (u8 *)a + sizeof(ATTR_RECORD) > mrec_end) + break; + + /* check whether ATTR_RECORD's name is within bounds */ + name_end = (u8 *)a + le16_to_cpu(a->name_offset) + + a->name_length * sizeof(ntfschar); + if (name_end > mrec_end) + break; + + ctx->attr = a; + if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) || + a->type == AT_END)) + return -ENOENT; + if (unlikely(!a->length)) + break; + + /* check whether ATTR_RECORD's length wrap */ + if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a) + break; + /* check whether ATTR_RECORD's length is within bounds */ + if ((u8 *)a + le32_to_cpu(a->length) > mrec_end) + break; + + if (a->type != type) + continue; + /* + * If @name is present, compare the two names. If @name is + * missing, assume we want an unnamed attribute. + */ + if (!name) { + /* The search failed if the found attribute is named. */ + if (a->name_length) + return -ENOENT; + } else if (!ntfs_are_names_equal(name, name_len, + (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)), + a->name_length, ic, upcase, upcase_len)) { + register int rc; + + rc = ntfs_collate_names(name, name_len, + (ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), + a->name_length, 1, IGNORE_CASE, + upcase, upcase_len); + /* + * If @name collates before a->name, there is no + * matching attribute. + */ + if (rc == -1) + return -ENOENT; + /* If the strings are not equal, continue search. */ + if (rc) + continue; + rc = ntfs_collate_names(name, name_len, + (ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), + a->name_length, 1, CASE_SENSITIVE, + upcase, upcase_len); + if (rc == -1) + return -ENOENT; + if (rc) + continue; + } + /* + * The names match or @name not present and attribute is + * unnamed. If no @val specified, we have found the attribute + * and are done. + */ + if (!val) + return 0; + /* @val is present; compare values. */ + else { + register int rc; + + rc = memcmp(val, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + min_t(u32, val_len, le32_to_cpu( + a->data.resident.value_length))); + /* + * If @val collates before the current attribute's + * value, there is no matching attribute. + */ + if (!rc) { + register u32 avl; + + avl = le32_to_cpu( + a->data.resident.value_length); + if (val_len == avl) + return 0; + if (val_len < avl) + return -ENOENT; + } else if (rc < 0) + return -ENOENT; + } + } + ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk."); + NVolSetErrors(vol); + return -EIO; +} + +/** + * load_attribute_list - load an attribute list into memory + * @vol: ntfs volume from which to read + * @runlist: runlist of the attribute list + * @al_start: destination buffer + * @size: size of the destination buffer in bytes + * @initialized_size: initialized size of the attribute list + * + * Walk the runlist @runlist and load all clusters from it copying them into + * the linear buffer @al. The maximum number of bytes copied to @al is @size + * bytes. Note, @size does not need to be a multiple of the cluster size. If + * @initialized_size is less than @size, the region in @al between + * @initialized_size and @size will be zeroed and not read from disk. + * + * Return 0 on success or -errno on error. + */ +int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start, + const s64 size, const s64 initialized_size) +{ + LCN lcn; + u8 *al = al_start; + u8 *al_end = al + initialized_size; + runlist_element *rl; + struct buffer_head *bh; + struct super_block *sb; + unsigned long block_size; + unsigned long block, max_block; + int err = 0; + unsigned char block_size_bits; + + ntfs_debug("Entering."); + if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 || + initialized_size > size) + return -EINVAL; + if (!initialized_size) { + memset(al, 0, size); + return 0; + } + sb = vol->sb; + block_size = sb->s_blocksize; + block_size_bits = sb->s_blocksize_bits; + down_read(&runlist->lock); + rl = runlist->rl; + if (!rl) { + ntfs_error(sb, "Cannot read attribute list since runlist is " + "missing."); + goto err_out; + } + /* Read all clusters specified by the runlist one run at a time. */ + while (rl->length) { + lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn); + ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", + (unsigned long long)rl->vcn, + (unsigned long long)lcn); + /* The attribute list cannot be sparse. */ + if (lcn < 0) { + ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot " + "read attribute list."); + goto err_out; + } + block = lcn << vol->cluster_size_bits >> block_size_bits; + /* Read the run from device in chunks of block_size bytes. */ + max_block = block + (rl->length << vol->cluster_size_bits >> + block_size_bits); + ntfs_debug("max_block = 0x%lx.", max_block); + do { + ntfs_debug("Reading block = 0x%lx.", block); + bh = sb_bread(sb, block); + if (!bh) { + ntfs_error(sb, "sb_bread() failed. Cannot " + "read attribute list."); + goto err_out; + } + if (al + block_size >= al_end) + goto do_final; + memcpy(al, bh->b_data, block_size); + brelse(bh); + al += block_size; + } while (++block < max_block); + rl++; + } + if (initialized_size < size) { +initialize: + memset(al_start + initialized_size, 0, size - initialized_size); + } +done: + up_read(&runlist->lock); + return err; +do_final: + if (al < al_end) { + /* + * Partial block. + * + * Note: The attribute list can be smaller than its allocation + * by multiple clusters. This has been encountered by at least + * two people running Windows XP, thus we cannot do any + * truncation sanity checking here. (AIA) + */ + memcpy(al, bh->b_data, al_end - al); + brelse(bh); + if (initialized_size < size) + goto initialize; + goto done; + } + brelse(bh); + /* Real overflow! */ + ntfs_error(sb, "Attribute list buffer overflow. Read attribute list " + "is truncated."); +err_out: + err = -EIO; + goto done; +} + +/** + * ntfs_external_attr_find - find an attribute in the attribute list of an inode + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * You should not need to call this function directly. Use ntfs_attr_lookup() + * instead. + * + * Find an attribute by searching the attribute list for the corresponding + * attribute list entry. Having found the entry, map the mft record if the + * attribute is in a different mft record/inode, ntfs_attr_find() the attribute + * in there and return it. + * + * On first search @ctx->ntfs_ino must be the base mft record and @ctx must + * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent + * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is + * then the base inode). + * + * After finishing with the attribute/mft record you need to call + * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any + * mapped inodes, etc). + * + * If the attribute is found, ntfs_external_attr_find() returns 0 and + * @ctx->attr will point to the found attribute. @ctx->mrec will point to the + * mft record in which @ctx->attr is located and @ctx->al_entry will point to + * the attribute list entry for the attribute. + * + * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and + * @ctx->attr will point to the attribute in the base mft record before which + * the attribute being searched for would need to be inserted if such an action + * were to be desired. @ctx->mrec will point to the mft record in which + * @ctx->attr is located and @ctx->al_entry will point to the attribute list + * entry of the attribute before which the attribute being searched for would + * need to be inserted if such an action were to be desired. + * + * Thus to insert the not found attribute, one wants to add the attribute to + * @ctx->mrec (the base mft record) and if there is not enough space, the + * attribute should be placed in a newly allocated extent mft record. The + * attribute list entry for the inserted attribute should be inserted in the + * attribute list attribute at @ctx->al_entry. + * + * On actual error, ntfs_external_attr_find() returns -EIO. In this case + * @ctx->attr is undefined and in particular do not rely on it not changing. + */ +static int ntfs_external_attr_find(const ATTR_TYPE type, + const ntfschar *name, const u32 name_len, + const IGNORE_CASE_BOOL ic, const VCN lowest_vcn, + const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx) +{ + ntfs_inode *base_ni, *ni; + ntfs_volume *vol; + ATTR_LIST_ENTRY *al_entry, *next_al_entry; + u8 *al_start, *al_end; + ATTR_RECORD *a; + ntfschar *al_name; + u32 al_name_len; + int err = 0; + static const char *es = " Unmount and run chkdsk."; + + ni = ctx->ntfs_ino; + base_ni = ctx->base_ntfs_ino; + ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type); + if (!base_ni) { + /* First call happens with the base mft record. */ + base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino; + ctx->base_mrec = ctx->mrec; + } + if (ni == base_ni) + ctx->base_attr = ctx->attr; + if (type == AT_END) + goto not_found; + vol = base_ni->vol; + al_start = base_ni->attr_list; + al_end = al_start + base_ni->attr_list_size; + if (!ctx->al_entry) + ctx->al_entry = (ATTR_LIST_ENTRY*)al_start; + /* + * Iterate over entries in attribute list starting at @ctx->al_entry, + * or the entry following that, if @ctx->is_first is 'true'. + */ + if (ctx->is_first) { + al_entry = ctx->al_entry; + ctx->is_first = false; + } else + al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry + + le16_to_cpu(ctx->al_entry->length)); + for (;; al_entry = next_al_entry) { + /* Out of bounds check. */ + if ((u8*)al_entry < base_ni->attr_list || + (u8*)al_entry > al_end) + break; /* Inode is corrupt. */ + ctx->al_entry = al_entry; + /* Catch the end of the attribute list. */ + if ((u8*)al_entry == al_end) + goto not_found; + if (!al_entry->length) + break; + if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + + le16_to_cpu(al_entry->length) > al_end) + break; + next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + le16_to_cpu(al_entry->length)); + if (le32_to_cpu(al_entry->type) > le32_to_cpu(type)) + goto not_found; + if (type != al_entry->type) + continue; + /* + * If @name is present, compare the two names. If @name is + * missing, assume we want an unnamed attribute. + */ + al_name_len = al_entry->name_length; + al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset); + if (!name) { + if (al_name_len) + goto not_found; + } else if (!ntfs_are_names_equal(al_name, al_name_len, name, + name_len, ic, vol->upcase, vol->upcase_len)) { + register int rc; + + rc = ntfs_collate_names(name, name_len, al_name, + al_name_len, 1, IGNORE_CASE, + vol->upcase, vol->upcase_len); + /* + * If @name collates before al_name, there is no + * matching attribute. + */ + if (rc == -1) + goto not_found; + /* If the strings are not equal, continue search. */ + if (rc) + continue; + /* + * FIXME: Reverse engineering showed 0, IGNORE_CASE but + * that is inconsistent with ntfs_attr_find(). The + * subsequent rc checks were also different. Perhaps I + * made a mistake in one of the two. Need to recheck + * which is correct or at least see what is going on... + * (AIA) + */ + rc = ntfs_collate_names(name, name_len, al_name, + al_name_len, 1, CASE_SENSITIVE, + vol->upcase, vol->upcase_len); + if (rc == -1) + goto not_found; + if (rc) + continue; + } + /* + * The names match or @name not present and attribute is + * unnamed. Now check @lowest_vcn. Continue search if the + * next attribute list entry still fits @lowest_vcn. Otherwise + * we have reached the right one or the search has failed. + */ + if (lowest_vcn && (u8*)next_al_entry >= al_start && + (u8*)next_al_entry + 6 < al_end && + (u8*)next_al_entry + le16_to_cpu( + next_al_entry->length) <= al_end && + sle64_to_cpu(next_al_entry->lowest_vcn) <= + lowest_vcn && + next_al_entry->type == al_entry->type && + next_al_entry->name_length == al_name_len && + ntfs_are_names_equal((ntfschar*)((u8*) + next_al_entry + + next_al_entry->name_offset), + next_al_entry->name_length, + al_name, al_name_len, CASE_SENSITIVE, + vol->upcase, vol->upcase_len)) + continue; + if (MREF_LE(al_entry->mft_reference) == ni->mft_no) { + if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) { + ntfs_error(vol->sb, "Found stale mft " + "reference in attribute list " + "of base inode 0x%lx.%s", + base_ni->mft_no, es); + err = -EIO; + break; + } + } else { /* Mft references do not match. */ + /* If there is a mapped record unmap it first. */ + if (ni != base_ni) + unmap_extent_mft_record(ni); + /* Do we want the base record back? */ + if (MREF_LE(al_entry->mft_reference) == + base_ni->mft_no) { + ni = ctx->ntfs_ino = base_ni; + ctx->mrec = ctx->base_mrec; + } else { + /* We want an extent record. */ + ctx->mrec = map_extent_mft_record(base_ni, + le64_to_cpu( + al_entry->mft_reference), &ni); + if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to map " + "extent mft record " + "0x%lx of base inode " + "0x%lx.%s", + MREF_LE(al_entry-> + mft_reference), + base_ni->mft_no, es); + err = PTR_ERR(ctx->mrec); + if (err == -ENOENT) + err = -EIO; + /* Cause @ctx to be sanitized below. */ + ni = NULL; + break; + } + ctx->ntfs_ino = ni; + } + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + } + /* + * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the + * mft record containing the attribute represented by the + * current al_entry. + */ + /* + * We could call into ntfs_attr_find() to find the right + * attribute in this mft record but this would be less + * efficient and not quite accurate as ntfs_attr_find() ignores + * the attribute instance numbers for example which become + * important when one plays with attribute lists. Also, + * because a proper match has been found in the attribute list + * entry above, the comparison can now be optimized. So it is + * worth re-implementing a simplified ntfs_attr_find() here. + */ + a = ctx->attr; + /* + * Use a manual loop so we can still use break and continue + * with the same meanings as above. + */ +do_next_attr_loop: + if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_allocated)) + break; + if (a->type == AT_END) + break; + if (!a->length) + break; + if (al_entry->instance != a->instance) + goto do_next_attr; + /* + * If the type and/or the name are mismatched between the + * attribute list entry and the attribute record, there is + * corruption so we break and return error EIO. + */ + if (al_entry->type != a->type) + break; + if (!ntfs_are_names_equal((ntfschar*)((u8*)a + + le16_to_cpu(a->name_offset)), a->name_length, + al_name, al_name_len, CASE_SENSITIVE, + vol->upcase, vol->upcase_len)) + break; + ctx->attr = a; + /* + * If no @val specified or @val specified and it matches, we + * have found it! + */ + if (!val || (!a->non_resident && le32_to_cpu( + a->data.resident.value_length) == val_len && + !memcmp((u8*)a + + le16_to_cpu(a->data.resident.value_offset), + val, val_len))) { + ntfs_debug("Done, found."); + return 0; + } +do_next_attr: + /* Proceed to the next attribute in the current mft record. */ + a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length)); + goto do_next_attr_loop; + } + if (!err) { + ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt " + "attribute list attribute.%s", base_ni->mft_no, + es); + err = -EIO; + } + if (ni != base_ni) { + if (ni) + unmap_extent_mft_record(ni); + ctx->ntfs_ino = base_ni; + ctx->mrec = ctx->base_mrec; + ctx->attr = ctx->base_attr; + } + if (err != -ENOMEM) + NVolSetErrors(vol); + return err; +not_found: + /* + * If we were looking for AT_END, we reset the search context @ctx and + * use ntfs_attr_find() to seek to the end of the base mft record. + */ + if (type == AT_END) { + ntfs_attr_reinit_search_ctx(ctx); + return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len, + ctx); + } + /* + * The attribute was not found. Before we return, we want to ensure + * @ctx->mrec and @ctx->attr indicate the position at which the + * attribute should be inserted in the base mft record. Since we also + * want to preserve @ctx->al_entry we cannot reinitialize the search + * context using ntfs_attr_reinit_search_ctx() as this would set + * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see + * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve + * @ctx->al_entry as the remaining fields (base_*) are identical to + * their non base_ counterparts and we cannot set @ctx->base_attr + * correctly yet as we do not know what @ctx->attr will be set to by + * the call to ntfs_attr_find() below. + */ + if (ni != base_ni) + unmap_extent_mft_record(ni); + ctx->mrec = ctx->base_mrec; + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + ctx->is_first = true; + ctx->ntfs_ino = base_ni; + ctx->base_ntfs_ino = NULL; + ctx->base_mrec = NULL; + ctx->base_attr = NULL; + /* + * In case there are multiple matches in the base mft record, need to + * keep enumerating until we get an attribute not found response (or + * another error), otherwise we would keep returning the same attribute + * over and over again and all programs using us for enumeration would + * lock up in a tight loop. + */ + do { + err = ntfs_attr_find(type, name, name_len, ic, val, val_len, + ctx); + } while (!err); + ntfs_debug("Done, not found."); + return err; +} + +/** + * ntfs_attr_lookup - find an attribute in an ntfs inode + * @type: attribute type to find + * @name: attribute name to find (optional, i.e. NULL means don't care) + * @name_len: attribute name length (only needed if @name present) + * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present) + * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only) + * @val: attribute value to find (optional, resident attributes only) + * @val_len: attribute value length + * @ctx: search context with mft record and attribute to search from + * + * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must + * be the base mft record and @ctx must have been obtained from a call to + * ntfs_attr_get_search_ctx(). + * + * This function transparently handles attribute lists and @ctx is used to + * continue searches where they were left off at. + * + * After finishing with the attribute/mft record you need to call + * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any + * mapped inodes, etc). + * + * Return 0 if the search was successful and -errno if not. + * + * When 0, @ctx->attr is the found attribute and it is in mft record + * @ctx->mrec. If an attribute list attribute is present, @ctx->al_entry is + * the attribute list entry of the found attribute. + * + * When -ENOENT, @ctx->attr is the attribute which collates just after the + * attribute being searched for, i.e. if one wants to add the attribute to the + * mft record this is the correct place to insert it into. If an attribute + * list attribute is present, @ctx->al_entry is the attribute list entry which + * collates just after the attribute list entry of the attribute being searched + * for, i.e. if one wants to add the attribute to the mft record this is the + * correct place to insert its attribute list entry into. + * + * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is + * then undefined and in particular you should not rely on it not changing. + */ +int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const VCN lowest_vcn, const u8 *val, const u32 val_len, + ntfs_attr_search_ctx *ctx) +{ + ntfs_inode *base_ni; + + ntfs_debug("Entering."); + BUG_ON(IS_ERR(ctx->mrec)); + if (ctx->base_ntfs_ino) + base_ni = ctx->base_ntfs_ino; + else + base_ni = ctx->ntfs_ino; + /* Sanity check, just for debugging really. */ + BUG_ON(!base_ni); + if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST) + return ntfs_attr_find(type, name, name_len, ic, val, val_len, + ctx); + return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn, + val, val_len, ctx); +} + +/** + * ntfs_attr_init_search_ctx - initialize an attribute search context + * @ctx: attribute search context to initialize + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context + * + * Initialize the attribute search context @ctx with @ni and @mrec. + */ +static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx, + ntfs_inode *ni, MFT_RECORD *mrec) +{ + *ctx = (ntfs_attr_search_ctx) { + .mrec = mrec, + /* Sanity checks are performed elsewhere. */ + .attr = (ATTR_RECORD*)((u8*)mrec + + le16_to_cpu(mrec->attrs_offset)), + .is_first = true, + .ntfs_ino = ni, + }; +} + +/** + * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context + * @ctx: attribute search context to reinitialize + * + * Reinitialize the attribute search context @ctx, unmapping an associated + * extent mft record if present, and initialize the search context again. + * + * This is used when a search for a new attribute is being started to reset + * the search context to the beginning. + */ +void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx) +{ + if (likely(!ctx->base_ntfs_ino)) { + /* No attribute list. */ + ctx->is_first = true; + /* Sanity checks are performed elsewhere. */ + ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec + + le16_to_cpu(ctx->mrec->attrs_offset)); + /* + * This needs resetting due to ntfs_external_attr_find() which + * can leave it set despite having zeroed ctx->base_ntfs_ino. + */ + ctx->al_entry = NULL; + return; + } /* Attribute list. */ + if (ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->ntfs_ino); + ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec); + return; +} + +/** + * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context + * @ni: ntfs inode with which to initialize the search context + * @mrec: mft record with which to initialize the search context + * + * Allocate a new attribute search context, initialize it with @ni and @mrec, + * and return it. Return NULL if allocation failed. + */ +ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec) +{ + ntfs_attr_search_ctx *ctx; + + ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS); + if (ctx) + ntfs_attr_init_search_ctx(ctx, ni, mrec); + return ctx; +} + +/** + * ntfs_attr_put_search_ctx - release an attribute search context + * @ctx: attribute search context to free + * + * Release the attribute search context @ctx, unmapping an associated extent + * mft record if present. + */ +void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx) +{ + if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino) + unmap_extent_mft_record(ctx->ntfs_ino); + kmem_cache_free(ntfs_attr_ctx_cache, ctx); + return; +} + +#ifdef NTFS_RW + +/** + * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to find + * + * Search for the attribute definition record corresponding to the attribute + * @type in the $AttrDef system file. + * + * Return the attribute type definition record if found and NULL if not found. + */ +static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol, + const ATTR_TYPE type) +{ + ATTR_DEF *ad; + + BUG_ON(!vol->attrdef); + BUG_ON(!type); + for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef < + vol->attrdef_size && ad->type; ++ad) { + /* We have not found it yet, carry on searching. */ + if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type))) + continue; + /* We found the attribute; return it. */ + if (likely(ad->type == type)) + return ad; + /* We have gone too far already. No point in continuing. */ + break; + } + /* Attribute not found. */ + ntfs_debug("Attribute type 0x%x not found in $AttrDef.", + le32_to_cpu(type)); + return NULL; +} + +/** + * ntfs_attr_size_bounds_check - check a size of an attribute type for validity + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * @size: size which to check + * + * Check whether the @size in bytes is valid for an attribute of @type on the + * ntfs volume @vol. This information is obtained from $AttrDef system file. + * + * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not + * listed in $AttrDef. + */ +int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type, + const s64 size) +{ + ATTR_DEF *ad; + + BUG_ON(size < 0); + /* + * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not + * listed in $AttrDef. + */ + if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024)) + return -ERANGE; + /* Get the $AttrDef entry for the attribute @type. */ + ad = ntfs_attr_find_in_attrdef(vol, type); + if (unlikely(!ad)) + return -ENOENT; + /* Do the bounds check. */ + if (((sle64_to_cpu(ad->min_size) > 0) && + size < sle64_to_cpu(ad->min_size)) || + ((sle64_to_cpu(ad->max_size) > 0) && size > + sle64_to_cpu(ad->max_size))) + return -ERANGE; + return 0; +} + +/** + * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * + * Check whether the attribute of @type on the ntfs volume @vol is allowed to + * be non-resident. This information is obtained from $AttrDef system file. + * + * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and + * -ENOENT if the attribute is not listed in $AttrDef. + */ +int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type) +{ + ATTR_DEF *ad; + + /* Find the attribute definition record in $AttrDef. */ + ad = ntfs_attr_find_in_attrdef(vol, type); + if (unlikely(!ad)) + return -ENOENT; + /* Check the flags and return the result. */ + if (ad->flags & ATTR_DEF_RESIDENT) + return -EPERM; + return 0; +} + +/** + * ntfs_attr_can_be_resident - check if an attribute can be resident + * @vol: ntfs volume to which the attribute belongs + * @type: attribute type which to check + * + * Check whether the attribute of @type on the ntfs volume @vol is allowed to + * be resident. This information is derived from our ntfs knowledge and may + * not be completely accurate, especially when user defined attributes are + * present. Basically we allow everything to be resident except for index + * allocation and $EA attributes. + * + * Return 0 if the attribute is allowed to be non-resident and -EPERM if not. + * + * Warning: In the system file $MFT the attribute $Bitmap must be non-resident + * otherwise windows will not boot (blue screen of death)! We cannot + * check for this here as we do not know which inode's $Bitmap is + * being asked about so the caller needs to special case this. + */ +int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type) +{ + if (type == AT_INDEX_ALLOCATION) + return -EPERM; + return 0; +} + +/** + * ntfs_attr_record_resize - resize an attribute record + * @m: mft record containing attribute record + * @a: attribute record to resize + * @new_size: new size in bytes to which to resize the attribute record @a + * + * Resize the attribute record @a, i.e. the resident part of the attribute, in + * the mft record @m to @new_size bytes. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -ENOSPC - Not enough space in the mft record @m to perform the resize. + * + * Note: On error, no modifications have been performed whatsoever. + * + * Warning: If you make a record smaller without having copied all the data you + * are interested in the data may be overwritten. + */ +int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size) +{ + ntfs_debug("Entering for new_size %u.", new_size); + /* Align to 8 bytes if it is not already done. */ + if (new_size & 7) + new_size = (new_size + 7) & ~7; + /* If the actual attribute length has changed, move things around. */ + if (new_size != le32_to_cpu(a->length)) { + u32 new_muse = le32_to_cpu(m->bytes_in_use) - + le32_to_cpu(a->length) + new_size; + /* Not enough space in this mft record. */ + if (new_muse > le32_to_cpu(m->bytes_allocated)) + return -ENOSPC; + /* Move attributes following @a to their new location. */ + memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length), + le32_to_cpu(m->bytes_in_use) - ((u8*)a - + (u8*)m) - le32_to_cpu(a->length)); + /* Adjust @m to reflect the change in used space. */ + m->bytes_in_use = cpu_to_le32(new_muse); + /* Adjust @a to reflect the new size. */ + if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length)) + a->length = cpu_to_le32(new_size); + } + return 0; +} + +/** + * ntfs_resident_attr_value_resize - resize the value of a resident attribute + * @m: mft record containing attribute record + * @a: attribute record whose value to resize + * @new_size: new size in bytes to which to resize the attribute value of @a + * + * Resize the value of the attribute @a in the mft record @m to @new_size bytes. + * If the value is made bigger, the newly allocated space is cleared. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -ENOSPC - Not enough space in the mft record @m to perform the resize. + * + * Note: On error, no modifications have been performed whatsoever. + * + * Warning: If you make a record smaller without having copied all the data you + * are interested in the data may be overwritten. + */ +int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, + const u32 new_size) +{ + u32 old_size; + + /* Resize the resident part of the attribute record. */ + if (ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.resident.value_offset) + new_size)) + return -ENOSPC; + /* + * The resize succeeded! If we made the attribute value bigger, clear + * the area between the old size and @new_size. + */ + old_size = le32_to_cpu(a->data.resident.value_length); + if (new_size > old_size) + memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + old_size, 0, new_size - old_size); + /* Finally update the length of the attribute value. */ + a->data.resident.value_length = cpu_to_le32(new_size); + return 0; +} + +/** + * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute + * @ni: ntfs inode describing the attribute to convert + * @data_size: size of the resident data to copy to the non-resident attribute + * + * Convert the resident ntfs attribute described by the ntfs inode @ni to a + * non-resident one. + * + * @data_size must be equal to the attribute value size. This is needed since + * we need to know the size before we can map the mft record and our callers + * always know it. The reason we cannot simply read the size from the vfs + * inode i_size is that this is not necessarily uptodate. This happens when + * ntfs_attr_make_non_resident() is called in the ->truncate call path(s). + * + * Return 0 on success and -errno on error. The following error return codes + * are defined: + * -EPERM - The attribute is not allowed to be non-resident. + * -ENOMEM - Not enough memory. + * -ENOSPC - Not enough disk space. + * -EINVAL - Attribute not defined on the volume. + * -EIO - I/o error or other error. + * Note that -ENOSPC is also returned in the case that there is not enough + * space in the mft record to do the conversion. This can happen when the mft + * record is already very full. The caller is responsible for trying to make + * space in the mft record and trying again. FIXME: Do we need a separate + * error return code for this kind of -ENOSPC or is it always worth trying + * again in case the attribute may then fit in a resident state so no need to + * make it non-resident at all? Ho-hum... (AIA) + * + * NOTE to self: No changes in the attribute list are required to move from + * a resident to a non-resident attribute. + * + * Locking: - The caller must hold i_mutex on the inode. + */ +int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) +{ + s64 new_size; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + struct page *page; + runlist_element *rl; + u8 *kaddr; + unsigned long flags; + int mp_size, mp_ofs, name_ofs, arec_size, err, err2; + u32 attr_size; + u8 old_res_attr_flags; + + /* Check that the attribute is allowed to be non-resident. */ + err = ntfs_attr_can_be_non_resident(vol, ni->type); + if (unlikely(err)) { + if (err == -EPERM) + ntfs_debug("Attribute is not allowed to be " + "non-resident."); + else + ntfs_debug("Attribute not defined on the NTFS " + "volume!"); + return err; + } + /* + * FIXME: Compressed and encrypted attributes are not supported when + * writing and we should never have gotten here for them. + */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoEncrypted(ni)); + /* + * The size needs to be aligned to a cluster boundary for allocation + * purposes. + */ + new_size = (data_size + vol->cluster_size - 1) & + ~(vol->cluster_size - 1); + if (new_size > 0) { + /* + * Will need the page later and since the page lock nests + * outside all ntfs locks, we need to get the page now. + */ + page = find_or_create_page(vi->i_mapping, 0, + mapping_gfp_mask(vi->i_mapping)); + if (unlikely(!page)) + return -ENOMEM; + /* Start by allocating clusters to hold the attribute value. */ + rl = ntfs_cluster_alloc(vol, 0, new_size >> + vol->cluster_size_bits, -1, DATA_ZONE, true); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + ntfs_debug("Failed to allocate cluster%s, error code " + "%i.", (new_size >> + vol->cluster_size_bits) > 1 ? "s" : "", + err); + goto page_err_out; + } + } else { + rl = NULL; + page = NULL; + } + /* Determine the size of the mapping pairs array. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1); + if (unlikely(mp_size < 0)) { + err = mp_size; + ntfs_debug("Failed to get size for mapping pairs array, error " + "code %i.", err); + goto rl_err_out; + } + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(NInoNonResident(ni)); + BUG_ON(a->non_resident); + /* + * Calculate new offsets for the name and the mapping pairs array. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + + sizeof(a->data.non_resident.compressed_size) + + 7) & ~7; + else + name_ofs = (offsetof(ATTR_REC, + data.non_resident.compressed_size) + 7) & ~7; + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + /* + * Determine the size of the resident part of the now non-resident + * attribute record. + */ + arec_size = (mp_ofs + mp_size + 7) & ~7; + /* + * If the page is not uptodate bring it uptodate by copying from the + * attribute value. + */ + attr_size = le32_to_cpu(a->data.resident.value_length); + BUG_ON(attr_size != data_size); + if (page && !PageUptodate(page)) { + kaddr = kmap_atomic(page); + memcpy(kaddr, (u8*)a + + le16_to_cpu(a->data.resident.value_offset), + attr_size); + memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size); + kunmap_atomic(kaddr); + flush_dcache_page(page); + SetPageUptodate(page); + } + /* Backup the attribute flag. */ + old_res_attr_flags = a->data.resident.flags; + /* Resize the resident part of the attribute record. */ + err = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err)) + goto err_out; + /* + * Convert the resident part of the attribute record to describe a + * non-resident attribute. + */ + a->non_resident = 1; + /* Move the attribute name if it exists and update the offset. */ + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + a->name_offset = cpu_to_le16(name_ofs); + /* Setup the fields specific to non-resident attributes. */ + a->data.non_resident.lowest_vcn = 0; + a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >> + vol->cluster_size_bits); + a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs); + memset(&a->data.non_resident.reserved, 0, + sizeof(a->data.non_resident.reserved)); + a->data.non_resident.allocated_size = cpu_to_sle64(new_size); + a->data.non_resident.data_size = + a->data.non_resident.initialized_size = + cpu_to_sle64(attr_size); + if (NInoSparse(ni) || NInoCompressed(ni)) { + a->data.non_resident.compression_unit = 0; + if (NInoCompressed(ni) || vol->major_ver < 3) + a->data.non_resident.compression_unit = 4; + a->data.non_resident.compressed_size = + a->data.non_resident.allocated_size; + } else + a->data.non_resident.compression_unit = 0; + /* Generate the mapping pairs array into the attribute record. */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs, + arec_size - mp_ofs, rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_debug("Failed to build mapping pairs, error code %i.", + err); + goto undo_err_out; + } + /* Setup the in-memory attribute structure to be non-resident. */ + ni->runlist.rl = rl; + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_size; + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size = ni->allocated_size; + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << (a->data. + non_resident.compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed.block_size) - + 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident.compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = 0; + ni->itype.compressed.block_clusters = 0; + } + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = ni->allocated_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * This needs to be last since the address space operations ->read_folio + * and ->writepage can run concurrently with us as they are not + * serialized on i_mutex. Note, we are not allowed to fail once we flip + * this switch, which is another reason to do this last. + */ + NInoSetNonResident(ni); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + if (page) { + set_page_dirty(page); + unlock_page(page); + put_page(page); + } + ntfs_debug("Done."); + return 0; +undo_err_out: + /* Convert the attribute back into a resident attribute. */ + a->non_resident = 0; + /* Move the attribute name if it exists and update the offset. */ + name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) + + sizeof(a->data.resident.reserved) + 7) & ~7; + if (a->name_length) + memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset), + a->name_length * sizeof(ntfschar)); + mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7; + a->name_offset = cpu_to_le16(name_ofs); + arec_size = (mp_ofs + attr_size + 7) & ~7; + /* Resize the resident part of the attribute record. */ + err2 = ntfs_attr_record_resize(m, a, arec_size); + if (unlikely(err2)) { + /* + * This cannot happen (well if memory corruption is at work it + * could happen in theory), but deal with it as well as we can. + * If the old size is too small, truncate the attribute, + * otherwise simply give it a larger allocated size. + * FIXME: Should check whether chkdsk complains when the + * allocated size is much bigger than the resident value size. + */ + arec_size = le32_to_cpu(a->length); + if ((mp_ofs + attr_size) > arec_size) { + err2 = attr_size; + attr_size = arec_size - mp_ofs; + ntfs_error(vol->sb, "Failed to undo partial resident " + "to non-resident attribute " + "conversion. Truncating inode 0x%lx, " + "attribute type 0x%x from %i bytes to " + "%i bytes to maintain metadata " + "consistency. THIS MEANS YOU ARE " + "LOSING %i BYTES DATA FROM THIS %s.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + err2, attr_size, err2 - attr_size, + ((ni->type == AT_DATA) && + !ni->name_len) ? "FILE": "ATTRIBUTE"); + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = attr_size; + i_size_write(vi, attr_size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + } + /* Setup the fields specific to resident attributes. */ + a->data.resident.value_length = cpu_to_le32(attr_size); + a->data.resident.value_offset = cpu_to_le16(mp_ofs); + a->data.resident.flags = old_res_attr_flags; + memset(&a->data.resident.reserved, 0, + sizeof(a->data.resident.reserved)); + /* Copy the data from the page back to the attribute value. */ + if (page) { + kaddr = kmap_atomic(page); + memcpy((u8*)a + mp_ofs, kaddr, attr_size); + kunmap_atomic(kaddr); + } + /* Setup the allocated size in the ntfs inode in case it changed. */ + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = arec_size - mp_ofs; + write_unlock_irqrestore(&ni->size_lock, flags); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ni->runlist.rl = NULL; + up_write(&ni->runlist.lock); +rl_err_out: + if (rl) { + if (ntfs_cluster_free_from_rl(vol, rl) < 0) { + ntfs_error(vol->sb, "Failed to release allocated " + "cluster(s) in error code path. Run " + "chkdsk to recover the lost " + "cluster(s)."); + NVolSetErrors(vol); + } + ntfs_free(rl); +page_err_out: + unlock_page(page); + put_page(page); + } + if (err == -EINVAL) + err = -EIO; + return err; +} + +/** + * ntfs_attr_extend_allocation - extend the allocated space of an attribute + * @ni: ntfs inode of the attribute whose allocation to extend + * @new_alloc_size: new size in bytes to which to extend the allocation to + * @new_data_size: new size in bytes to which to extend the data to + * @data_start: beginning of region which is required to be non-sparse + * + * Extend the allocated space of an attribute described by the ntfs inode @ni + * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be + * implemented as a hole in the file (as long as both the volume and the ntfs + * inode @ni have sparse support enabled). If @data_start is >= 0, then the + * region between the old allocated size and @data_start - 1 may be made sparse + * but the regions between @data_start and @new_alloc_size must be backed by + * actual clusters. + * + * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size + * of the attribute is extended to @new_data_size. Note that the i_size of the + * vfs inode is not updated. Only the data size in the base attribute record + * is updated. The caller has to update i_size separately if this is required. + * WARNING: It is a BUG() for @new_data_size to be smaller than the old data + * size as well as for @new_data_size to be greater than @new_alloc_size. + * + * For resident attributes this involves resizing the attribute record and if + * necessary moving it and/or other attributes into extent mft records and/or + * converting the attribute to a non-resident attribute which in turn involves + * extending the allocation of a non-resident attribute as described below. + * + * For non-resident attributes this involves allocating clusters in the data + * zone on the volume (except for regions that are being made sparse) and + * extending the run list to describe the allocated clusters as well as + * updating the mapping pairs array of the attribute. This in turn involves + * resizing the attribute record and if necessary moving it and/or other + * attributes into extent mft records and/or splitting the attribute record + * into multiple extent attribute records. + * + * Also, the attribute list attribute is updated if present and in some of the + * above cases (the ones where extent mft records/attributes come into play), + * an attribute list attribute is created if not already present. + * + * Return the new allocated size on success and -errno on error. In the case + * that an error is encountered but a partial extension at least up to + * @data_start (if present) is possible, the allocation is partially extended + * and this is returned. This means the caller must check the returned size to + * determine if the extension was partial. If @data_start is -1 then partial + * allocations are not performed. + * + * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA. + * + * Locking: This function takes the runlist lock of @ni for writing as well as + * locking the mft record of the base ntfs inode. These locks are maintained + * throughout execution of the function. These locks are required so that the + * attribute can be resized safely and so that it can for example be converted + * from resident to non-resident safely. + * + * TODO: At present attribute list attribute handling is not implemented. + * + * TODO: At present it is not safe to call this function for anything other + * than the $DATA attribute(s) of an uncompressed and unencrypted file. + */ +s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, + const s64 new_data_size, const s64 data_start) +{ + VCN vcn; + s64 ll, allocated_size, start = data_start; + struct inode *vi = VFS_I(ni); + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + runlist_element *rl, *rl2; + unsigned long flags; + int err, mp_size; + u32 attr_len = 0; /* Silence stupid gcc warning. */ + bool mp_rebuilt; + +#ifdef DEBUG + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "old_allocated_size 0x%llx, " + "new_allocated_size 0x%llx, new_data_size 0x%llx, " + "data_start 0x%llx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)allocated_size, + (unsigned long long)new_alloc_size, + (unsigned long long)new_data_size, + (unsigned long long)start); +#endif +retry_extend: + /* + * For non-resident attributes, @start and @new_size need to be aligned + * to cluster boundaries for allocation purposes. + */ + if (NInoNonResident(ni)) { + if (start > 0) + start &= ~(s64)vol->cluster_size_mask; + new_alloc_size = (new_alloc_size + vol->cluster_size - 1) & + ~(s64)vol->cluster_size_mask; + } + BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size); + /* Check if new size is allowed in $AttrDef. */ + err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size); + if (unlikely(err)) { + /* Only emit errors when the write will fail completely. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) { + if (err == -ERANGE) { + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because the new " + "allocation would exceed the " + "maximum allowed size for " + "this attribute type.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } else { + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because this " + "attribute type is not " + "defined on the NTFS volume. " + "Possible corruption! You " + "should run chkdsk!", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } + } + /* Translate error code to be POSIX conformant for write(2). */ + if (err == -ERANGE) + err = -EFBIG; + else + err = -EIO; + return err; + } + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* + * We will be modifying both the runlist (if non-resident) and the mft + * record so lock them both down. + */ + down_write(&ni->runlist.lock); + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * If non-resident, seek to the last extent. If resident, there is + * only one extent, so seek to that. + */ + vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits : + 0; + /* + * Abort if someone did the work whilst we waited for the locks. If we + * just converted the attribute from resident to non-resident it is + * likely that exactly this has happened already. We cannot quite + * abort if we need to update the data size. + */ + if (unlikely(new_alloc_size <= allocated_size)) { + ntfs_debug("Allocated size already exceeds requested size."); + new_alloc_size = allocated_size; + if (new_data_size < 0) + goto done; + /* + * We want the first attribute extent so that we can update the + * data size. + */ + vcn = 0; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, vcn, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + /* Use goto to reduce indentation. */ + if (a->non_resident) + goto do_non_resident_extend; + BUG_ON(NInoNonResident(ni)); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + /* + * Extend the attribute record to be able to store the new attribute + * size. ntfs_attr_record_resize() will not do anything if the size is + * not changing. + */ + if (new_alloc_size < vol->mft_record_size && + !ntfs_attr_record_resize(m, a, + le16_to_cpu(a->data.resident.value_offset) + + new_alloc_size)) { + /* The resize succeeded! */ + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + write_unlock_irqrestore(&ni->size_lock, flags); + if (new_data_size >= 0) { + BUG_ON(new_data_size < attr_len); + a->data.resident.value_length = + cpu_to_le32((u32)new_data_size); + } + goto flush_done; + } + /* + * We have to drop all the locks so we can call + * ntfs_attr_make_non_resident(). This could be optimised by try- + * locking the first page cache page and only if that fails dropping + * the locks, locking the page, and redoing all the locking and + * lookups. While this would be a huge optimisation, it is not worth + * it as this is definitely a slow code path. + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * Not enough space in the mft record, try to make the attribute + * non-resident and if successful restart the extension process. + */ + err = ntfs_attr_make_non_resident(ni, attr_len); + if (likely(!err)) + goto retry_extend; + /* + * Could not make non-resident. If this is due to this not being + * permitted for this attribute type or there not being enough space, + * try to make other attributes non-resident. Otherwise fail. + */ + if (unlikely(err != -EPERM && err != -ENOSPC)) { + /* Only emit errors when the write will fail completely. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the conversion from resident " + "to non-resident attribute failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + goto conv_err_out; + } + /* TODO: Not implemented from here, abort. */ + read_lock_irqsave(&ni->size_lock, flags); + allocated_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (start < 0 || start >= allocated_size) { + if (err == -ENOSPC) + ntfs_error(vol->sb, "Not enough space in the mft " + "record/on disk for the non-resident " + "attribute value. This case is not " + "implemented yet."); + else /* if (err == -EPERM) */ + ntfs_error(vol->sb, "This attribute type may not be " + "non-resident. This case is not " + "implemented yet."); + } + err = -EOPNOTSUPP; + goto conv_err_out; +#if 0 + // TODO: Attempt to make other attributes non-resident. + if (!err) + goto do_resident_extend; + /* + * Both the attribute list attribute and the standard information + * attribute must remain in the base inode. Thus, if this is one of + * these attributes, we have to try to move other attributes out into + * extent mft records instead. + */ + if (ni->type == AT_ATTRIBUTE_LIST || + ni->type == AT_STANDARD_INFORMATION) { + // TODO: Attempt to move other attributes into extent mft + // records. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + goto err_out; + } + // TODO: Attempt to move this attribute to an extent mft record, but + // only if it is not already the only attribute in an mft record in + // which case there would be nothing to gain. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + /* There is nothing we can do to make enough space. )-: */ + goto err_out; +#endif +do_non_resident_extend: + BUG_ON(!NInoNonResident(ni)); + if (new_alloc_size == allocated_size) { + BUG_ON(vcn); + goto alloc_done; + } + /* + * If the data starts after the end of the old allocation, this is a + * $DATA attribute and sparse attributes are enabled on the volume and + * for this inode, then create a sparse region between the old + * allocated size and the start of the data. Otherwise simply proceed + * with filling the whole space between the old allocated size and the + * new allocated size with clusters. + */ + if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA || + !NVolSparseEnabled(vol) || NInoSparseDisabled(ni)) + goto skip_sparse; + // TODO: This is not implemented yet. We just fill in with real + // clusters for now... + ntfs_debug("Inserting holes is not-implemented yet. Falling back to " + "allocating real clusters instead."); +skip_sparse: + rl = ni->runlist.rl; + if (likely(rl)) { + /* Seek to the end of the runlist. */ + while (rl->length) + rl++; + } + /* If this attribute extent is not mapped, map it now. */ + if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED || + (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl && + (rl-1)->lcn == LCN_RL_NOT_MAPPED))) { + if (!rl && !allocated_size) + goto first_alloc; + rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation " + "of inode 0x%lx, attribute " + "type 0x%x, because the " + "mapping of a runlist " + "fragment failed with error " + "code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + err); + if (err != -ENOMEM) + err = -EIO; + goto err_out; + } + ni->runlist.rl = rl; + /* Seek to the end of the runlist. */ + while (rl->length) + rl++; + } + /* + * We now know the runlist of the last extent is mapped and @rl is at + * the end of the runlist. We want to begin allocating clusters + * starting at the last allocated cluster to reduce fragmentation. If + * there are no valid LCNs in the attribute we let the cluster + * allocator choose the starting cluster. + */ + /* If the last LCN is a hole or simillar seek back to last real LCN. */ + while (rl->lcn < 0 && rl > ni->runlist.rl) + rl--; +first_alloc: + // FIXME: Need to implement partial allocations so at least part of the + // write can be performed when start >= 0. (Needed for POSIX write(2) + // conformance.) + rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits, + (new_alloc_size - allocated_size) >> + vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ? + rl->lcn + rl->length : -1, DATA_ZONE, true); + if (IS_ERR(rl2)) { + err = PTR_ERR(rl2); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the allocation of clusters " + "failed with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM && err != -ENOSPC) + err = -EIO; + goto err_out; + } + rl = ntfs_runlists_merge(ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because the runlist merge failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to release allocated " + "cluster(s) in error code path. Run " + "chkdsk to recover the lost " + "cluster(s)."); + NVolSetErrors(vol); + } + ntfs_free(rl2); + goto err_out; + } + ni->runlist.rl = rl; + ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size - + allocated_size) >> vol->cluster_size_bits); + /* Find the runlist element with which the attribute extent starts. */ + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + rl2 = ntfs_rl_find_vcn_nolock(rl, ll); + BUG_ON(!rl2); + BUG_ON(!rl2->length); + BUG_ON(rl2->lcn < LCN_HOLE); + mp_rebuilt = false; + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + err = mp_size; + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because determining the size for the " + "mapping pairs failed with error code " + "%i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + goto undo_alloc; + } + /* Extend the attribute record to fit the bigger mapping pairs array. */ + attr_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(err)) { + BUG_ON(err != -ENOSPC); + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record, + // possibly by extending this extent partially and filling it + // and creating a new extent for the remainder, or by making + // other attributes non-resident and/or by moving other + // attributes out of this mft record. + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Not enough space in the mft " + "record for the extended attribute " + "record. This case is not " + "implemented yet."); + err = -EOPNOTSUPP; + goto undo_alloc; + } + mp_rebuilt = true; + /* Generate the mapping pairs array directly into the attr record. */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(err)) { + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot extend allocation of " + "inode 0x%lx, attribute type 0x%x, " + "because building the mapping pairs " + "failed with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> + vol->cluster_size_bits) - 1); + /* + * We now have extended the allocated size of the attribute. Reflect + * this in the ntfs_inode structure and the attribute record. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto restore_undo_alloc; + /* @m is not used any more so no need to set it. */ + a = ctx->attr; + } + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); + /* + * FIXME: This would fail if @ni is a directory, $MFT, or an index, + * since those can have sparse/compressed set. For example can be + * set compressed even though it is not compressed itself and in that + * case the bit means that files are to be created compressed in the + * directory... At present this is ok as this code is only called for + * regular files, and only for their $DATA attribute(s). + * FIXME: The calculation is wrong if we created a hole above. For now + * it does not matter as we never create holes. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size += new_alloc_size - allocated_size; + a->data.non_resident.compressed_size = + cpu_to_sle64(ni->itype.compressed.size); + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); +alloc_done: + if (new_data_size >= 0) { + BUG_ON(new_data_size < + sle64_to_cpu(a->data.non_resident.data_size)); + a->data.non_resident.data_size = cpu_to_sle64(new_data_size); + } +flush_done: + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +done: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + ntfs_debug("Done, new_allocated_size 0x%llx.", + (unsigned long long)new_alloc_size); + return new_alloc_size; +restore_undo_alloc: + if (start < 0 || start >= allocated_size) + ntfs_error(vol->sb, "Cannot complete extension of allocation " + "of inode 0x%lx, attribute type 0x%x, because " + "lookup of first attribute extent failed with " + "error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err == -ENOENT) + err = -EIO; + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE, + allocated_size >> vol->cluster_size_bits, NULL, 0, + ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "attribute in error code path. Run chkdsk to " + "recover."); + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + /* + * FIXME: This would fail if @ni is a directory... See above. + * FIXME: The calculation is wrong if we created a hole above. + * For now it does not matter as we never create holes. + */ + if (NInoSparse(ni) || NInoCompressed(ni)) { + ni->itype.compressed.size += new_alloc_size - + allocated_size; + vi->i_blocks = ni->itype.compressed.size >> 9; + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * The only thing that is now wrong is the allocated size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return err; + } + ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64( + (allocated_size >> vol->cluster_size_bits) - 1); +undo_alloc: + ll = allocated_size >> vol->cluster_size_bits; + if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) { + ntfs_error(vol->sb, "Failed to release allocated cluster(s) " + "in error code path. Run chkdsk to recover " + "the lost cluster(s)."); + NVolSetErrors(vol); + } + m = ctx->mrec; + a = ctx->attr; + /* + * If the runlist truncation fails and/or the search context is no + * longer valid, we cannot resize the attribute record or build the + * mapping pairs array thus we mark the inode bad so that no access to + * the freed clusters can happen. + */ + if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) { + ntfs_error(vol->sb, "Failed to %s in error code path. Run " + "chkdsk to recover.", IS_ERR(m) ? + "restore attribute search context" : + "truncate attribute runlist"); + NVolSetErrors(vol); + } else if (mp_rebuilt) { + if (ntfs_attr_record_resize(m, a, attr_len)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record in error code path. Run " + "chkdsk to recover."); + NVolSetErrors(vol); + } else /* if (success) */ { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident. + mapping_pairs_offset), attr_len - + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), rl2, ll, -1, + NULL)) { + ntfs_error(vol->sb, "Failed to restore " + "mapping pairs array in error " + "code path. Run chkdsk to " + "recover."); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + } +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +conv_err_out: + ntfs_debug("Failed. Returning error code %i.", err); + return err; +} + +/** + * ntfs_attr_set - fill (a part of) an attribute with a byte + * @ni: ntfs inode describing the attribute to fill + * @ofs: offset inside the attribute at which to start to fill + * @cnt: number of bytes to fill + * @val: the unsigned 8-bit value with which to fill the attribute + * + * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at + * byte offset @ofs inside the attribute with the constant byte @val. + * + * This function is effectively like memset() applied to an ntfs attribute. + * Note thie function actually only operates on the page cache pages belonging + * to the ntfs attribute and it marks them dirty after doing the memset(). + * Thus it relies on the vm dirty page write code paths to cause the modified + * pages to be written to the mft record/disk. + * + * Return 0 on success and -errno on error. An error code of -ESPIPE means + * that @ofs + @cnt were outside the end of the attribute and no write was + * performed. + */ +int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val) +{ + ntfs_volume *vol = ni->vol; + struct address_space *mapping; + struct page *page; + u8 *kaddr; + pgoff_t idx, end; + unsigned start_ofs, end_ofs, size; + + ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.", + (long long)ofs, (long long)cnt, val); + BUG_ON(ofs < 0); + BUG_ON(cnt < 0); + if (!cnt) + goto done; + /* + * FIXME: Compressed and encrypted attributes are not supported when + * writing and we should never have gotten here for them. + */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoEncrypted(ni)); + mapping = VFS_I(ni)->i_mapping; + /* Work out the starting index and page offset. */ + idx = ofs >> PAGE_SHIFT; + start_ofs = ofs & ~PAGE_MASK; + /* Work out the ending index and page offset. */ + end = ofs + cnt; + end_ofs = end & ~PAGE_MASK; + /* If the end is outside the inode size return -ESPIPE. */ + if (unlikely(end > i_size_read(VFS_I(ni)))) { + ntfs_error(vol->sb, "Request exceeds end of attribute."); + return -ESPIPE; + } + end >>= PAGE_SHIFT; + /* If there is a first partial page, need to do it the slow way. */ + if (start_ofs) { + page = read_mapping_page(mapping, idx, NULL); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read first partial " + "page (error, index 0x%lx).", idx); + return PTR_ERR(page); + } + /* + * If the last page is the same as the first page, need to + * limit the write to the end offset. + */ + size = PAGE_SIZE; + if (idx == end) + size = end_ofs; + kaddr = kmap_atomic(page); + memset(kaddr + start_ofs, val, size - start_ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr); + set_page_dirty(page); + put_page(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + if (idx == end) + goto done; + idx++; + } + /* Do the whole pages the fast way. */ + for (; idx < end; idx++) { + /* Find or create the current page. (The page is locked.) */ + page = grab_cache_page(mapping, idx); + if (unlikely(!page)) { + ntfs_error(vol->sb, "Insufficient memory to grab " + "page (index 0x%lx).", idx); + return -ENOMEM; + } + kaddr = kmap_atomic(page); + memset(kaddr, val, PAGE_SIZE); + flush_dcache_page(page); + kunmap_atomic(kaddr); + /* + * If the page has buffers, mark them uptodate since buffer + * state and not page state is definitive in 2.6 kernels. + */ + if (page_has_buffers(page)) { + struct buffer_head *bh, *head; + + bh = head = page_buffers(page); + do { + set_buffer_uptodate(bh); + } while ((bh = bh->b_this_page) != head); + } + /* Now that buffers are uptodate, set the page uptodate, too. */ + SetPageUptodate(page); + /* + * Set the page and all its buffers dirty and mark the inode + * dirty, too. The VM will write the page later on. + */ + set_page_dirty(page); + /* Finally unlock and release the page. */ + unlock_page(page); + put_page(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } + /* If there is a last partial page, need to do it the slow way. */ + if (end_ofs) { + page = read_mapping_page(mapping, idx, NULL); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read last partial page " + "(error, index 0x%lx).", idx); + return PTR_ERR(page); + } + kaddr = kmap_atomic(page); + memset(kaddr, val, end_ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr); + set_page_dirty(page); + put_page(page); + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } +done: + ntfs_debug("Done."); + return 0; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h new file mode 100644 index 000000000..fe0890d3d --- /dev/null +++ b/fs/ntfs/attrib.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * attrib.h - Defines for attribute handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_ATTRIB_H +#define _LINUX_NTFS_ATTRIB_H + +#include "endian.h" +#include "types.h" +#include "layout.h" +#include "inode.h" +#include "runlist.h" +#include "volume.h" + +/** + * ntfs_attr_search_ctx - used in attribute search functions + * @mrec: buffer containing mft record to search + * @attr: attribute record in @mrec where to begin/continue search + * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after + * + * Structure must be initialized to zero before the first call to one of the + * attribute search functions. Initialize @mrec to point to the mft record to + * search, and @attr to point to the first attribute within @mrec (not necessary + * if calling the _first() functions), and set @is_first to 'true' (not necessary + * if calling the _first() functions). + * + * If @is_first is 'true', the search begins with @attr. If @is_first is 'false', + * the search begins after @attr. This is so that, after the first call to one + * of the search attribute functions, we can call the function again, without + * any modification of the search context, to automagically get the next + * matching attribute. + */ +typedef struct { + MFT_RECORD *mrec; + ATTR_RECORD *attr; + bool is_first; + ntfs_inode *ntfs_ino; + ATTR_LIST_ENTRY *al_entry; + ntfs_inode *base_ntfs_ino; + MFT_RECORD *base_mrec; + ATTR_RECORD *base_attr; +} ntfs_attr_search_ctx; + +extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, + ntfs_attr_search_ctx *ctx); +extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn); + +extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn, + const bool write_locked); + +extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, + const VCN vcn, ntfs_attr_search_ctx *ctx); + +int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name, + const u32 name_len, const IGNORE_CASE_BOOL ic, + const VCN lowest_vcn, const u8 *val, const u32 val_len, + ntfs_attr_search_ctx *ctx); + +extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start, + const s64 size, const s64 initialized_size); + +static inline s64 ntfs_attr_size(const ATTR_RECORD *a) +{ + if (!a->non_resident) + return (s64)le32_to_cpu(a->data.resident.value_length); + return sle64_to_cpu(a->data.non_resident.data_size); +} + +extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx); +extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, + MFT_RECORD *mrec); +extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx); + +#ifdef NTFS_RW + +extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol, + const ATTR_TYPE type, const s64 size); +extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, + const ATTR_TYPE type); +extern int ntfs_attr_can_be_resident(const ntfs_volume *vol, + const ATTR_TYPE type); + +extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size); +extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a, + const u32 new_size); + +extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size); + +extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size, + const s64 new_data_size, const s64 data_start); + +extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, + const u8 val); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_ATTRIB_H */ diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c new file mode 100644 index 000000000..0675b2400 --- /dev/null +++ b/fs/ntfs/bitmap.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + */ + +#ifdef NTFS_RW + +#include <linux/pagemap.h> + +#include "bitmap.h" +#include "debug.h" +#include "aops.h" +#include "ntfs.h" + +/** + * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * @value: value to set the bits to (i.e. 0 or 1) + * @is_rollback: if 'true' this is a rollback operation + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi to @value, where @value is either 0 or 1. + * + * @is_rollback should always be 'false', it is for internal use to rollback + * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead. + * + * Return 0 on success and -errno on error. + */ +int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, + const s64 count, const u8 value, const bool is_rollback) +{ + s64 cnt = count; + pgoff_t index, end_index; + struct address_space *mapping; + struct page *page; + u8 *kaddr; + int pos, len; + u8 bit; + + BUG_ON(!vi); + ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, " + "value %u.%s", vi->i_ino, (unsigned long long)start_bit, + (unsigned long long)cnt, (unsigned int)value, + is_rollback ? " (rollback)" : ""); + BUG_ON(start_bit < 0); + BUG_ON(cnt < 0); + BUG_ON(value > 1); + /* + * Calculate the indices for the pages containing the first and last + * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively. + */ + index = start_bit >> (3 + PAGE_SHIFT); + end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT); + + /* Get the page containing the first bit (@start_bit). */ + mapping = vi->i_mapping; + page = ntfs_map_page(mapping, index); + if (IS_ERR(page)) { + if (!is_rollback) + ntfs_error(vi->i_sb, "Failed to map first page (error " + "%li), aborting.", PTR_ERR(page)); + return PTR_ERR(page); + } + kaddr = page_address(page); + + /* Set @pos to the position of the byte containing @start_bit. */ + pos = (start_bit >> 3) & ~PAGE_MASK; + + /* Calculate the position of @start_bit in the first byte. */ + bit = start_bit & 7; + + /* If the first byte is partial, modify the appropriate bits in it. */ + if (bit) { + u8 *byte = kaddr + pos; + while ((bit & 7) && cnt) { + cnt--; + if (value) + *byte |= 1 << bit++; + else + *byte &= ~(1 << bit++); + } + /* If we are done, unmap the page and return success. */ + if (!cnt) + goto done; + + /* Update @pos to the new position. */ + pos++; + } + /* + * Depending on @value, modify all remaining whole bytes in the page up + * to @cnt. + */ + len = min_t(s64, cnt >> 3, PAGE_SIZE - pos); + memset(kaddr + pos, value ? 0xff : 0, len); + cnt -= len << 3; + + /* Update @len to point to the first not-done byte in the page. */ + if (cnt < 8) + len += pos; + + /* If we are not in the last page, deal with all subsequent pages. */ + while (index < end_index) { + BUG_ON(cnt <= 0); + + /* Update @index and get the next page. */ + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + page = ntfs_map_page(mapping, ++index); + if (IS_ERR(page)) + goto rollback; + kaddr = page_address(page); + /* + * Depending on @value, modify all remaining whole bytes in the + * page up to @cnt. + */ + len = min_t(s64, cnt >> 3, PAGE_SIZE); + memset(kaddr, value ? 0xff : 0, len); + cnt -= len << 3; + } + /* + * The currently mapped page is the last one. If the last byte is + * partial, modify the appropriate bits in it. Note, @len is the + * position of the last byte inside the page. + */ + if (cnt) { + u8 *byte; + + BUG_ON(cnt > 7); + + bit = cnt; + byte = kaddr + len; + while (bit--) { + if (value) + *byte |= 1 << bit; + else + *byte &= ~(1 << bit); + } + } +done: + /* We are done. Unmap the page and return success. */ + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + ntfs_debug("Done."); + return 0; +rollback: + /* + * Current state: + * - no pages are mapped + * - @count - @cnt is the number of bits that have been modified + */ + if (is_rollback) + return PTR_ERR(page); + if (count != cnt) + pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt, + value ? 0 : 1, true); + else + pos = 0; + if (!pos) { + /* Rollback was successful. */ + ntfs_error(vi->i_sb, "Failed to map subsequent page (error " + "%li), aborting.", PTR_ERR(page)); + } else { + /* Rollback failed. */ + ntfs_error(vi->i_sb, "Failed to map subsequent page (error " + "%li) and rollback failed (error %i). " + "Aborting and leaving inconsistent metadata. " + "Unmount and run chkdsk.", PTR_ERR(page), pos); + NVolSetErrors(NTFS_SB(vi->i_sb)); + } + return PTR_ERR(page); +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h new file mode 100644 index 000000000..9dd2224ca --- /dev/null +++ b/fs/ntfs/bitmap.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_BITMAP_H +#define _LINUX_NTFS_BITMAP_H + +#ifdef NTFS_RW + +#include <linux/fs.h> + +#include "types.h" + +extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit, + const s64 count, const u8 value, const bool is_rollback); + +/** + * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * @value: value to set the bits to (i.e. 0 or 1) + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi to @value, where @value is either 0 or 1. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi, + const s64 start_bit, const s64 count, const u8 value) +{ + return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value, + false); +} + +/** + * ntfs_bitmap_set_run - set a run of bits in a bitmap + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to set + * @count: number of bits to set + * + * Set @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit, + const s64 count) +{ + return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1); +} + +/** + * ntfs_bitmap_clear_run - clear a run of bits in a bitmap + * @vi: vfs inode describing the bitmap + * @start_bit: first bit to clear + * @count: number of bits to clear + * + * Clear @count bits starting at bit @start_bit in the bitmap described by the + * vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit, + const s64 count) +{ + return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0); +} + +/** + * ntfs_bitmap_set_bit - set a bit in a bitmap + * @vi: vfs inode describing the bitmap + * @bit: bit to set + * + * Set bit @bit in the bitmap described by the vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit) +{ + return ntfs_bitmap_set_run(vi, bit, 1); +} + +/** + * ntfs_bitmap_clear_bit - clear a bit in a bitmap + * @vi: vfs inode describing the bitmap + * @bit: bit to clear + * + * Clear bit @bit in the bitmap described by the vfs inode @vi. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit) +{ + return ntfs_bitmap_clear_run(vi, bit, 1); +} + +#endif /* NTFS_RW */ + +#endif /* defined _LINUX_NTFS_BITMAP_H */ diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c new file mode 100644 index 000000000..3ab6ec96a --- /dev/null +++ b/fs/ntfs/collate.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004 Anton Altaparmakov + */ + +#include "collate.h" +#include "debug.h" +#include "ntfs.h" + +static int ntfs_collate_binary(ntfs_volume *vol, + const void *data1, const int data1_len, + const void *data2, const int data2_len) +{ + int rc; + + ntfs_debug("Entering."); + rc = memcmp(data1, data2, min(data1_len, data2_len)); + if (!rc && (data1_len != data2_len)) { + if (data1_len < data2_len) + rc = -1; + else + rc = 1; + } + ntfs_debug("Done, returning %i", rc); + return rc; +} + +static int ntfs_collate_ntofs_ulong(ntfs_volume *vol, + const void *data1, const int data1_len, + const void *data2, const int data2_len) +{ + int rc; + u32 d1, d2; + + ntfs_debug("Entering."); + // FIXME: We don't really want to bug here. + BUG_ON(data1_len != data2_len); + BUG_ON(data1_len != 4); + d1 = le32_to_cpup(data1); + d2 = le32_to_cpup(data2); + if (d1 < d2) + rc = -1; + else { + if (d1 == d2) + rc = 0; + else + rc = 1; + } + ntfs_debug("Done, returning %i", rc); + return rc; +} + +typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int, + const void *, const int); + +static ntfs_collate_func_t ntfs_do_collate0x0[3] = { + ntfs_collate_binary, + NULL/*ntfs_collate_file_name*/, + NULL/*ntfs_collate_unicode_string*/, +}; + +static ntfs_collate_func_t ntfs_do_collate0x1[4] = { + ntfs_collate_ntofs_ulong, + NULL/*ntfs_collate_ntofs_sid*/, + NULL/*ntfs_collate_ntofs_security_hash*/, + NULL/*ntfs_collate_ntofs_ulongs*/, +}; + +/** + * ntfs_collate - collate two data items using a specified collation rule + * @vol: ntfs volume to which the data items belong + * @cr: collation rule to use when comparing the items + * @data1: first data item to collate + * @data1_len: length in bytes of @data1 + * @data2: second data item to collate + * @data2_len: length in bytes of @data2 + * + * Collate the two data items @data1 and @data2 using the collation rule @cr + * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before, + * to match, or to collate after @data2. + * + * For speed we use the collation rule @cr as an index into two tables of + * function pointers to call the appropriate collation function. + */ +int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, + const void *data1, const int data1_len, + const void *data2, const int data2_len) { + int i; + + ntfs_debug("Entering."); + /* + * FIXME: At the moment we only support COLLATION_BINARY and + * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now. + */ + BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG); + i = le32_to_cpu(cr); + BUG_ON(i < 0); + if (i <= 0x02) + return ntfs_do_collate0x0[i](vol, data1, data1_len, + data2, data2_len); + BUG_ON(i < 0x10); + i -= 0x10; + if (likely(i <= 3)) + return ntfs_do_collate0x1[i](vol, data1, data1_len, + data2, data2_len); + BUG(); + return 0; +} diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h new file mode 100644 index 000000000..f2255619b --- /dev/null +++ b/fs/ntfs/collate.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * collate.h - Defines for NTFS kernel collation handling. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_COLLATE_H +#define _LINUX_NTFS_COLLATE_H + +#include "types.h" +#include "volume.h" + +static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) { + int i; + + /* + * FIXME: At the moment we only support COLLATION_BINARY and + * COLLATION_NTOFS_ULONG, so we return false for everything else for + * now. + */ + if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG)) + return false; + i = le32_to_cpu(cr); + if (likely(((i >= 0) && (i <= 0x02)) || + ((i >= 0x10) && (i <= 0x13)))) + return true; + return false; +} + +extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr, + const void *data1, const int data1_len, + const void *data2, const int data2_len); + +#endif /* _LINUX_NTFS_COLLATE_H */ diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c new file mode 100644 index 000000000..587e9b187 --- /dev/null +++ b/fs/ntfs/compress.c @@ -0,0 +1,950 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * compress.c - NTFS kernel compressed attributes handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> + +#include "attrib.h" +#include "inode.h" +#include "debug.h" +#include "ntfs.h" + +/** + * ntfs_compression_constants - enum of constants used in the compression code + */ +typedef enum { + /* Token types and access mask. */ + NTFS_SYMBOL_TOKEN = 0, + NTFS_PHRASE_TOKEN = 1, + NTFS_TOKEN_MASK = 1, + + /* Compression sub-block constants. */ + NTFS_SB_SIZE_MASK = 0x0fff, + NTFS_SB_SIZE = 0x1000, + NTFS_SB_IS_COMPRESSED = 0x8000, + + /* + * The maximum compression block size is by definition 16 * the cluster + * size, with the maximum supported cluster size being 4kiB. Thus the + * maximum compression buffer size is 64kiB, so we use this when + * initializing the compression buffer. + */ + NTFS_MAX_CB_SIZE = 64 * 1024, +} ntfs_compression_constants; + +/** + * ntfs_compression_buffer - one buffer for the decompression engine + */ +static u8 *ntfs_compression_buffer; + +/** + * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer + */ +static DEFINE_SPINLOCK(ntfs_cb_lock); + +/** + * allocate_compression_buffers - allocate the decompression buffers + * + * Caller has to hold the ntfs_lock mutex. + * + * Return 0 on success or -ENOMEM if the allocations failed. + */ +int allocate_compression_buffers(void) +{ + BUG_ON(ntfs_compression_buffer); + + ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE); + if (!ntfs_compression_buffer) + return -ENOMEM; + return 0; +} + +/** + * free_compression_buffers - free the decompression buffers + * + * Caller has to hold the ntfs_lock mutex. + */ +void free_compression_buffers(void) +{ + BUG_ON(!ntfs_compression_buffer); + vfree(ntfs_compression_buffer); + ntfs_compression_buffer = NULL; +} + +/** + * zero_partial_compressed_page - zero out of bounds compressed page region + */ +static void zero_partial_compressed_page(struct page *page, + const s64 initialized_size) +{ + u8 *kp = page_address(page); + unsigned int kp_ofs; + + ntfs_debug("Zeroing page region outside initialized size."); + if (((s64)page->index << PAGE_SHIFT) >= initialized_size) { + clear_page(kp); + return; + } + kp_ofs = initialized_size & ~PAGE_MASK; + memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs); + return; +} + +/** + * handle_bounds_compressed_page - test for&handle out of bounds compressed page + */ +static inline void handle_bounds_compressed_page(struct page *page, + const loff_t i_size, const s64 initialized_size) +{ + if ((page->index >= (initialized_size >> PAGE_SHIFT)) && + (initialized_size < i_size)) + zero_partial_compressed_page(page, initialized_size); + return; +} + +/** + * ntfs_decompress - decompress a compression block into an array of pages + * @dest_pages: destination array of pages + * @completed_pages: scratch space to track completed pages + * @dest_index: current index into @dest_pages (IN/OUT) + * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT) + * @dest_max_index: maximum index into @dest_pages (IN) + * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN) + * @xpage: the target page (-1 if none) (IN) + * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT) + * @cb_start: compression block to decompress (IN) + * @cb_size: size of compression block @cb_start in bytes (IN) + * @i_size: file size when we started the read (IN) + * @initialized_size: initialized file size when we started the read (IN) + * + * The caller must have disabled preemption. ntfs_decompress() reenables it when + * the critical section is finished. + * + * This decompresses the compression block @cb_start into the array of + * destination pages @dest_pages starting at index @dest_index into @dest_pages + * and at offset @dest_pos into the page @dest_pages[@dest_index]. + * + * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1. + * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified. + * + * @cb_start is a pointer to the compression block which needs decompressing + * and @cb_size is the size of @cb_start in bytes (8-64kiB). + * + * Return 0 if success or -EOVERFLOW on error in the compressed stream. + * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was + * completed during the decompression of the compression block (@cb_start). + * + * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up + * unpredicatbly! You have been warned! + * + * Note to hackers: This function may not sleep until it has finished accessing + * the compression block @cb_start as it is a per-CPU buffer. + */ +static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], + int *dest_index, int *dest_ofs, const int dest_max_index, + const int dest_max_ofs, const int xpage, char *xpage_done, + u8 *const cb_start, const u32 cb_size, const loff_t i_size, + const s64 initialized_size) +{ + /* + * Pointers into the compressed data, i.e. the compression block (cb), + * and the therein contained sub-blocks (sb). + */ + u8 *cb_end = cb_start + cb_size; /* End of cb. */ + u8 *cb = cb_start; /* Current position in cb. */ + u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */ + u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ + + /* Variables for uncompressed data / destination. */ + struct page *dp; /* Current destination page being worked on. */ + u8 *dp_addr; /* Current pointer into dp. */ + u8 *dp_sb_start; /* Start of current sub-block in dp. */ + u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + + NTFS_SB_SIZE). */ + u16 do_sb_start; /* @dest_ofs when starting this sub-block. */ + u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + + NTFS_SB_SIZE). */ + + /* Variables for tag and token parsing. */ + u8 tag; /* Current tag. */ + int token; /* Loop counter for the eight tokens in tag. */ + int nr_completed_pages = 0; + + /* Default error code. */ + int err = -EOVERFLOW; + + ntfs_debug("Entering, cb_size = 0x%x.", cb_size); +do_next_sb: + ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.", + cb - cb_start); + /* + * Have we reached the end of the compression block or the end of the + * decompressed data? The latter can happen for example if the current + * position in the compression block is one byte before its end so the + * first two checks do not detect it. + */ + if (cb == cb_end || !le16_to_cpup((le16*)cb) || + (*dest_index == dest_max_index && + *dest_ofs == dest_max_ofs)) { + int i; + + ntfs_debug("Completed. Returning success (0)."); + err = 0; +return_error: + /* We can sleep from now on, so we drop lock. */ + spin_unlock(&ntfs_cb_lock); + /* Second stage: finalize completed pages. */ + if (nr_completed_pages > 0) { + for (i = 0; i < nr_completed_pages; i++) { + int di = completed_pages[i]; + + dp = dest_pages[di]; + /* + * If we are outside the initialized size, zero + * the out of bounds page range. + */ + handle_bounds_compressed_page(dp, i_size, + initialized_size); + flush_dcache_page(dp); + kunmap(dp); + SetPageUptodate(dp); + unlock_page(dp); + if (di == xpage) + *xpage_done = 1; + else + put_page(dp); + dest_pages[di] = NULL; + } + } + return err; + } + + /* Setup offsets for the current sub-block destination. */ + do_sb_start = *dest_ofs; + do_sb_end = do_sb_start + NTFS_SB_SIZE; + + /* Check that we are still within allowed boundaries. */ + if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs) + goto return_overflow; + + /* Does the minimum size of a compressed sb overflow valid range? */ + if (cb + 6 > cb_end) + goto return_overflow; + + /* Setup the current sub-block source pointers and validate range. */ + cb_sb_start = cb; + cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) + + 3; + if (cb_sb_end > cb_end) + goto return_overflow; + + /* Get the current destination page. */ + dp = dest_pages[*dest_index]; + if (!dp) { + /* No page present. Skip decompression of this sub-block. */ + cb = cb_sb_end; + + /* Advance destination position to next sub-block. */ + *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK; + if (!*dest_ofs && (++*dest_index > dest_max_index)) + goto return_overflow; + goto do_next_sb; + } + + /* We have a valid destination page. Setup the destination pointers. */ + dp_addr = (u8*)page_address(dp) + do_sb_start; + + /* Now, we are ready to process the current sub-block (sb). */ + if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { + ntfs_debug("Found uncompressed sub-block."); + /* This sb is not compressed, just copy it into destination. */ + + /* Advance source position to first data byte. */ + cb += 2; + + /* An uncompressed sb must be full size. */ + if (cb_sb_end - cb != NTFS_SB_SIZE) + goto return_overflow; + + /* Copy the block and advance the source position. */ + memcpy(dp_addr, cb, NTFS_SB_SIZE); + cb += NTFS_SB_SIZE; + + /* Advance destination position to next sub-block. */ + *dest_ofs += NTFS_SB_SIZE; + if (!(*dest_ofs &= ~PAGE_MASK)) { +finalize_page: + /* + * First stage: add current page index to array of + * completed pages. + */ + completed_pages[nr_completed_pages++] = *dest_index; + if (++*dest_index > dest_max_index) + goto return_overflow; + } + goto do_next_sb; + } + ntfs_debug("Found compressed sub-block."); + /* This sb is compressed, decompress it into destination. */ + + /* Setup destination pointers. */ + dp_sb_start = dp_addr; + dp_sb_end = dp_sb_start + NTFS_SB_SIZE; + + /* Forward to the first tag in the sub-block. */ + cb += 2; +do_next_tag: + if (cb == cb_sb_end) { + /* Check if the decompressed sub-block was not full-length. */ + if (dp_addr < dp_sb_end) { + int nr_bytes = do_sb_end - *dest_ofs; + + ntfs_debug("Filling incomplete sub-block with " + "zeroes."); + /* Zero remainder and update destination position. */ + memset(dp_addr, 0, nr_bytes); + *dest_ofs += nr_bytes; + } + /* We have finished the current sub-block. */ + if (!(*dest_ofs &= ~PAGE_MASK)) + goto finalize_page; + goto do_next_sb; + } + + /* Check we are still in range. */ + if (cb > cb_sb_end || dp_addr > dp_sb_end) + goto return_overflow; + + /* Get the next tag and advance to first token. */ + tag = *cb++; + + /* Parse the eight tokens described by the tag. */ + for (token = 0; token < 8; token++, tag >>= 1) { + u16 lg, pt, length, max_non_overlap; + register u16 i; + u8 *dp_back_addr; + + /* Check if we are done / still in range. */ + if (cb >= cb_sb_end || dp_addr > dp_sb_end) + break; + + /* Determine token type and parse appropriately.*/ + if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) { + /* + * We have a symbol token, copy the symbol across, and + * advance the source and destination positions. + */ + *dp_addr++ = *cb++; + ++*dest_ofs; + + /* Continue with the next token. */ + continue; + } + + /* + * We have a phrase token. Make sure it is not the first tag in + * the sb as this is illegal and would confuse the code below. + */ + if (dp_addr == dp_sb_start) + goto return_overflow; + + /* + * Determine the number of bytes to go back (p) and the number + * of bytes to copy (l). We use an optimized algorithm in which + * we first calculate log2(current destination position in sb), + * which allows determination of l and p in O(1) rather than + * O(n). We just need an arch-optimized log2() function now. + */ + lg = 0; + for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1) + lg++; + + /* Get the phrase token into i. */ + pt = le16_to_cpup((le16*)cb); + + /* + * Calculate starting position of the byte sequence in + * the destination using the fact that p = (pt >> (12 - lg)) + 1 + * and make sure we don't go too far back. + */ + dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1; + if (dp_back_addr < dp_sb_start) + goto return_overflow; + + /* Now calculate the length of the byte sequence. */ + length = (pt & (0xfff >> lg)) + 3; + + /* Advance destination position and verify it is in range. */ + *dest_ofs += length; + if (*dest_ofs > do_sb_end) + goto return_overflow; + + /* The number of non-overlapping bytes. */ + max_non_overlap = dp_addr - dp_back_addr; + + if (length <= max_non_overlap) { + /* The byte sequence doesn't overlap, just copy it. */ + memcpy(dp_addr, dp_back_addr, length); + + /* Advance destination pointer. */ + dp_addr += length; + } else { + /* + * The byte sequence does overlap, copy non-overlapping + * part and then do a slow byte by byte copy for the + * overlapping part. Also, advance the destination + * pointer. + */ + memcpy(dp_addr, dp_back_addr, max_non_overlap); + dp_addr += max_non_overlap; + dp_back_addr += max_non_overlap; + length -= max_non_overlap; + while (length--) + *dp_addr++ = *dp_back_addr++; + } + + /* Advance source position and continue with the next token. */ + cb += 2; + } + + /* No tokens left in the current tag. Continue with the next tag. */ + goto do_next_tag; + +return_overflow: + ntfs_error(NULL, "Failed. Returning -EOVERFLOW."); + goto return_error; +} + +/** + * ntfs_read_compressed_block - read a compressed block into the page cache + * @page: locked page in the compression block(s) we need to read + * + * When we are called the page has already been verified to be locked and the + * attribute is known to be non-resident, not encrypted, but compressed. + * + * 1. Determine which compression block(s) @page is in. + * 2. Get hold of all pages corresponding to this/these compression block(s). + * 3. Read the (first) compression block. + * 4. Decompress it into the corresponding pages. + * 5. Throw the compressed data away and proceed to 3. for the next compression + * block or return success if no more compression blocks left. + * + * Warning: We have to be careful what we do about existing pages. They might + * have been written to so that we would lose data if we were to just overwrite + * them with the out-of-date uncompressed data. + * + * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at + * the end of the file I think. We need to detect this case and zero the out + * of bounds remainder of the page in question and mark it as handled. At the + * moment we would just return -EIO on such a page. This bug will only become + * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte + * clusters so is probably not going to be seen by anyone. Still this should + * be fixed. (AIA) + * + * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in + * handling sparse and compressed cbs. (AIA) + * + * FIXME: At the moment we don't do any zeroing out in the case that + * initialized_size is less than data_size. This should be safe because of the + * nature of the compression algorithm used. Just in case we check and output + * an error message in read inode if the two sizes are not equal for a + * compressed file. (AIA) + */ +int ntfs_read_compressed_block(struct page *page) +{ + loff_t i_size; + s64 initialized_size; + struct address_space *mapping = page->mapping; + ntfs_inode *ni = NTFS_I(mapping->host); + ntfs_volume *vol = ni->vol; + struct super_block *sb = vol->sb; + runlist_element *rl; + unsigned long flags, block_size = sb->s_blocksize; + unsigned char block_size_bits = sb->s_blocksize_bits; + u8 *cb, *cb_pos, *cb_end; + struct buffer_head **bhs; + unsigned long offset, index = page->index; + u32 cb_size = ni->itype.compressed.block_size; + u64 cb_size_mask = cb_size - 1UL; + VCN vcn; + LCN lcn; + /* The first wanted vcn (minimum alignment is PAGE_SIZE). */ + VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >> + vol->cluster_size_bits; + /* + * The first vcn after the last wanted vcn (minimum alignment is again + * PAGE_SIZE. + */ + VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1) + & ~cb_size_mask) >> vol->cluster_size_bits; + /* Number of compression blocks (cbs) in the wanted vcn range. */ + unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits + >> ni->itype.compressed.block_size_bits; + /* + * Number of pages required to store the uncompressed data from all + * compression blocks (cbs) overlapping @page. Due to alignment + * guarantees of start_vcn and end_vcn, no need to round up here. + */ + unsigned int nr_pages = (end_vcn - start_vcn) << + vol->cluster_size_bits >> PAGE_SHIFT; + unsigned int xpage, max_page, cur_page, cur_ofs, i; + unsigned int cb_clusters, cb_max_ofs; + int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; + struct page **pages; + int *completed_pages; + unsigned char xpage_done = 0; + + ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " + "%i.", index, cb_size, nr_pages); + /* + * Bad things happen if we get here for anything that is not an + * unnamed $DATA attribute. + */ + BUG_ON(ni->type != AT_DATA); + BUG_ON(ni->name_len); + + pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); + completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS); + + /* Allocate memory to store the buffer heads we need. */ + bhs_size = cb_size / block_size * sizeof(struct buffer_head *); + bhs = kmalloc(bhs_size, GFP_NOFS); + + if (unlikely(!pages || !bhs || !completed_pages)) { + kfree(bhs); + kfree(pages); + kfree(completed_pages); + unlock_page(page); + ntfs_error(vol->sb, "Failed to allocate internal buffers."); + return -ENOMEM; + } + + /* + * We have already been given one page, this is the one we must do. + * Once again, the alignment guarantees keep it simple. + */ + offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT; + xpage = index - offset; + pages[xpage] = page; + /* + * The remaining pages need to be allocated and inserted into the page + * cache, alignment guarantees keep all the below much simpler. (-8 + */ + read_lock_irqsave(&ni->size_lock, flags); + i_size = i_size_read(VFS_I(ni)); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) - + offset; + /* Is the page fully outside i_size? (truncate in progress) */ + if (xpage >= max_page) { + kfree(bhs); + kfree(pages); + kfree(completed_pages); + zero_user(page, 0, PAGE_SIZE); + ntfs_debug("Compressed read outside i_size - truncated?"); + SetPageUptodate(page); + unlock_page(page); + return 0; + } + if (nr_pages < max_page) + max_page = nr_pages; + for (i = 0; i < max_page; i++, offset++) { + if (i != xpage) + pages[i] = grab_cache_page_nowait(mapping, offset); + page = pages[i]; + if (page) { + /* + * We only (re)read the page if it isn't already read + * in and/or dirty or we would be losing data or at + * least wasting our time. + */ + if (!PageDirty(page) && (!PageUptodate(page) || + PageError(page))) { + ClearPageError(page); + kmap(page); + continue; + } + unlock_page(page); + put_page(page); + pages[i] = NULL; + } + } + + /* + * We have the runlist, and all the destination pages we need to fill. + * Now read the first compression block. + */ + cur_page = 0; + cur_ofs = 0; + cb_clusters = ni->itype.compressed.block_clusters; +do_next_cb: + nr_cbs--; + nr_bhs = 0; + + /* Read all cb buffer heads one cluster at a time. */ + rl = NULL; + for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; + vcn++) { + bool is_retry = false; + + if (!rl) { +lock_retry_remap: + down_read(&ni->runlist.lock); + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + } else + lcn = LCN_RL_NOT_MAPPED; + ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", + (unsigned long long)vcn, + (unsigned long long)lcn); + if (lcn < 0) { + /* + * When we reach the first sparse cluster we have + * finished with the cb. + */ + if (lcn == LCN_HOLE) + break; + if (is_retry || lcn != LCN_RL_NOT_MAPPED) + goto rl_err; + is_retry = true; + /* + * Attempt to map runlist, dropping lock for the + * duration. + */ + up_read(&ni->runlist.lock); + if (!ntfs_map_runlist(ni, vcn)) + goto lock_retry_remap; + goto map_rl_err; + } + block = lcn << vol->cluster_size_bits >> block_size_bits; + /* Read the lcn from device in chunks of block_size bytes. */ + max_block = block + (vol->cluster_size >> block_size_bits); + do { + ntfs_debug("block = 0x%x.", block); + if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) + goto getblk_err; + nr_bhs++; + } while (++block < max_block); + } + + /* Release the lock if we took it. */ + if (rl) + up_read(&ni->runlist.lock); + + /* Setup and initiate io on all buffer heads. */ + for (i = 0; i < nr_bhs; i++) { + struct buffer_head *tbh = bhs[i]; + + if (!trylock_buffer(tbh)) + continue; + if (unlikely(buffer_uptodate(tbh))) { + unlock_buffer(tbh); + continue; + } + get_bh(tbh); + tbh->b_end_io = end_buffer_read_sync; + submit_bh(REQ_OP_READ, tbh); + } + + /* Wait for io completion on all buffer heads. */ + for (i = 0; i < nr_bhs; i++) { + struct buffer_head *tbh = bhs[i]; + + if (buffer_uptodate(tbh)) + continue; + wait_on_buffer(tbh); + /* + * We need an optimization barrier here, otherwise we start + * hitting the below fixup code when accessing a loopback + * mounted ntfs partition. This indicates either there is a + * race condition in the loop driver or, more likely, gcc + * overoptimises the code without the barrier and it doesn't + * do the Right Thing(TM). + */ + barrier(); + if (unlikely(!buffer_uptodate(tbh))) { + ntfs_warning(vol->sb, "Buffer is unlocked but not " + "uptodate! Unplugging the disk queue " + "and rescheduling."); + get_bh(tbh); + io_schedule(); + put_bh(tbh); + if (unlikely(!buffer_uptodate(tbh))) + goto read_err; + ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); + } + } + + /* + * Get the compression buffer. We must not sleep any more + * until we are finished with it. + */ + spin_lock(&ntfs_cb_lock); + cb = ntfs_compression_buffer; + + BUG_ON(!cb); + + cb_pos = cb; + cb_end = cb + cb_size; + + /* Copy the buffer heads into the contiguous buffer. */ + for (i = 0; i < nr_bhs; i++) { + memcpy(cb_pos, bhs[i]->b_data, block_size); + cb_pos += block_size; + } + + /* Just a precaution. */ + if (cb_pos + 2 <= cb + cb_size) + *(u16*)cb_pos = 0; + + /* Reset cb_pos back to the beginning. */ + cb_pos = cb; + + /* We now have both source (if present) and destination. */ + ntfs_debug("Successfully read the compression block."); + + /* The last page and maximum offset within it for the current cb. */ + cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size; + cb_max_ofs = cb_max_page & ~PAGE_MASK; + cb_max_page >>= PAGE_SHIFT; + + /* Catch end of file inside a compression block. */ + if (cb_max_page > max_page) + cb_max_page = max_page; + + if (vcn == start_vcn - cb_clusters) { + /* Sparse cb, zero out page range overlapping the cb. */ + ntfs_debug("Found sparse compression block."); + /* We can sleep from now on, so we drop lock. */ + spin_unlock(&ntfs_cb_lock); + if (cb_max_ofs) + cb_max_page--; + for (; cur_page < cb_max_page; cur_page++) { + page = pages[cur_page]; + if (page) { + if (likely(!cur_ofs)) + clear_page(page_address(page)); + else + memset(page_address(page) + cur_ofs, 0, + PAGE_SIZE - + cur_ofs); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + if (cur_page == xpage) + xpage_done = 1; + else + put_page(page); + pages[cur_page] = NULL; + } + cb_pos += PAGE_SIZE - cur_ofs; + cur_ofs = 0; + if (cb_pos >= cb_end) + break; + } + /* If we have a partial final page, deal with it now. */ + if (cb_max_ofs && cb_pos < cb_end) { + page = pages[cur_page]; + if (page) + memset(page_address(page) + cur_ofs, 0, + cb_max_ofs - cur_ofs); + /* + * No need to update cb_pos at this stage: + * cb_pos += cb_max_ofs - cur_ofs; + */ + cur_ofs = cb_max_ofs; + } + } else if (vcn == start_vcn) { + /* We can't sleep so we need two stages. */ + unsigned int cur2_page = cur_page; + unsigned int cur_ofs2 = cur_ofs; + u8 *cb_pos2 = cb_pos; + + ntfs_debug("Found uncompressed compression block."); + /* Uncompressed cb, copy it to the destination pages. */ + /* + * TODO: As a big optimization, we could detect this case + * before we read all the pages and use block_read_full_folio() + * on all full pages instead (we still have to treat partial + * pages especially but at least we are getting rid of the + * synchronous io for the majority of pages. + * Or if we choose not to do the read-ahead/-behind stuff, we + * could just return block_read_full_folio(pages[xpage]) as long + * as PAGE_SIZE <= cb_size. + */ + if (cb_max_ofs) + cb_max_page--; + /* First stage: copy data into destination pages. */ + for (; cur_page < cb_max_page; cur_page++) { + page = pages[cur_page]; + if (page) + memcpy(page_address(page) + cur_ofs, cb_pos, + PAGE_SIZE - cur_ofs); + cb_pos += PAGE_SIZE - cur_ofs; + cur_ofs = 0; + if (cb_pos >= cb_end) + break; + } + /* If we have a partial final page, deal with it now. */ + if (cb_max_ofs && cb_pos < cb_end) { + page = pages[cur_page]; + if (page) + memcpy(page_address(page) + cur_ofs, cb_pos, + cb_max_ofs - cur_ofs); + cb_pos += cb_max_ofs - cur_ofs; + cur_ofs = cb_max_ofs; + } + /* We can sleep from now on, so drop lock. */ + spin_unlock(&ntfs_cb_lock); + /* Second stage: finalize pages. */ + for (; cur2_page < cb_max_page; cur2_page++) { + page = pages[cur2_page]; + if (page) { + /* + * If we are outside the initialized size, zero + * the out of bounds page range. + */ + handle_bounds_compressed_page(page, i_size, + initialized_size); + flush_dcache_page(page); + kunmap(page); + SetPageUptodate(page); + unlock_page(page); + if (cur2_page == xpage) + xpage_done = 1; + else + put_page(page); + pages[cur2_page] = NULL; + } + cb_pos2 += PAGE_SIZE - cur_ofs2; + cur_ofs2 = 0; + if (cb_pos2 >= cb_end) + break; + } + } else { + /* Compressed cb, decompress it into the destination page(s). */ + unsigned int prev_cur_page = cur_page; + + ntfs_debug("Found compressed compression block."); + err = ntfs_decompress(pages, completed_pages, &cur_page, + &cur_ofs, cb_max_page, cb_max_ofs, xpage, + &xpage_done, cb_pos, cb_size - (cb_pos - cb), + i_size, initialized_size); + /* + * We can sleep from now on, lock already dropped by + * ntfs_decompress(). + */ + if (err) { + ntfs_error(vol->sb, "ntfs_decompress() failed in inode " + "0x%lx with error code %i. Skipping " + "this compression block.", + ni->mft_no, -err); + /* Release the unfinished pages. */ + for (; prev_cur_page < cur_page; prev_cur_page++) { + page = pages[prev_cur_page]; + if (page) { + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (prev_cur_page != xpage) + put_page(page); + pages[prev_cur_page] = NULL; + } + } + } + } + + /* Release the buffer heads. */ + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + + /* Do we have more work to do? */ + if (nr_cbs) + goto do_next_cb; + + /* We no longer need the list of buffer heads. */ + kfree(bhs); + + /* Clean up if we have any pages left. Should never happen. */ + for (cur_page = 0; cur_page < max_page; cur_page++) { + page = pages[cur_page]; + if (page) { + ntfs_error(vol->sb, "Still have pages left! " + "Terminating them with extreme " + "prejudice. Inode 0x%lx, page index " + "0x%lx.", ni->mft_no, page->index); + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (cur_page != xpage) + put_page(page); + pages[cur_page] = NULL; + } + } + + /* We no longer need the list of pages. */ + kfree(pages); + kfree(completed_pages); + + /* If we have completed the requested page, we return success. */ + if (likely(xpage_done)) + return 0; + + ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? + "EOVERFLOW" : (!err ? "EIO" : "unknown error")); + return err < 0 ? err : -EIO; + +read_err: + ntfs_error(vol->sb, "IO error while reading compressed data."); + /* Release the buffer heads. */ + for (i = 0; i < nr_bhs; i++) + brelse(bhs[i]); + goto err_out; + +map_rl_err: + ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " + "compression block."); + goto err_out; + +rl_err: + up_read(&ni->runlist.lock); + ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read " + "compression block."); + goto err_out; + +getblk_err: + up_read(&ni->runlist.lock); + ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); + +err_out: + kfree(bhs); + for (i = cur_page; i < max_page; i++) { + page = pages[i]; + if (page) { + flush_dcache_page(page); + kunmap(page); + unlock_page(page); + if (i != xpage) + put_page(page); + } + } + kfree(pages); + kfree(completed_pages); + return -EIO; +} diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c new file mode 100644 index 000000000..a3c1c5656 --- /dev/null +++ b/fs/ntfs/debug.c @@ -0,0 +1,159 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include "debug.h" + +/** + * __ntfs_warning - output a warning to the syslog + * @function: name of function outputting the warning + * @sb: super block of mounted ntfs filesystem + * @fmt: warning string containing format specifications + * @...: a variable number of arguments specified in @fmt + * + * Outputs a warning to the syslog for the mounted ntfs filesystem described + * by @sb. + * + * @fmt and the corresponding @... is printf style format string containing + * the warning string and the corresponding format arguments, respectively. + * + * @function is the name of the function from which __ntfs_warning is being + * called. + * + * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead + * as this provides the @function parameter automatically. + */ +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int flen = 0; + +#ifndef DEBUG + if (!printk_ratelimit()) + return; +#endif + if (function) + flen = strlen(function); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + pr_warn("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); + else + pr_warn("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); +} + +/** + * __ntfs_error - output an error to the syslog + * @function: name of function outputting the error + * @sb: super block of mounted ntfs filesystem + * @fmt: error string containing format specifications + * @...: a variable number of arguments specified in @fmt + * + * Outputs an error to the syslog for the mounted ntfs filesystem described + * by @sb. + * + * @fmt and the corresponding @... is printf style format string containing + * the error string and the corresponding format arguments, respectively. + * + * @function is the name of the function from which __ntfs_error is being + * called. + * + * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead + * as this provides the @function parameter automatically. + */ +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int flen = 0; + +#ifndef DEBUG + if (!printk_ratelimit()) + return; +#endif + if (function) + flen = strlen(function); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + if (sb) + pr_err("(device %s): %s(): %pV\n", + sb->s_id, flen ? function : "", &vaf); + else + pr_err("%s(): %pV\n", flen ? function : "", &vaf); + va_end(args); +} + +#ifdef DEBUG + +/* If 1, output debug messages, and if 0, don't. */ +int debug_msgs = 0; + +void __ntfs_debug(const char *file, int line, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int flen = 0; + + if (!debug_msgs) + return; + if (function) + flen = strlen(function); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf); + va_end(args); +} + +/* Dump a runlist. Caller has to provide synchronisation for @rl. */ +void ntfs_debug_dump_runlist(const runlist_element *rl) +{ + int i; + const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED", + "LCN_ENOENT ", "LCN_unknown " }; + + if (!debug_msgs) + return; + pr_debug("Dumping runlist (values in hex):\n"); + if (!rl) { + pr_debug("Run list not present.\n"); + return; + } + pr_debug("VCN LCN Run length\n"); + for (i = 0; ; i++) { + LCN lcn = (rl + i)->lcn; + + if (lcn < (LCN)0) { + int index = -lcn - 1; + + if (index > -LCN_ENOENT - 1) + index = 3; + pr_debug("%-16Lx %s %-16Lx%s\n", + (long long)(rl + i)->vcn, lcn_str[index], + (long long)(rl + i)->length, + (rl + i)->length ? "" : + " (runlist end)"); + } else + pr_debug("%-16Lx %-16Lx %-16Lx%s\n", + (long long)(rl + i)->vcn, + (long long)(rl + i)->lcn, + (long long)(rl + i)->length, + (rl + i)->length ? "" : + " (runlist end)"); + if (!(rl + i)->length) + break; + } +} + +#endif diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h new file mode 100644 index 000000000..6fdef388f --- /dev/null +++ b/fs/ntfs/debug.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_DEBUG_H +#define _LINUX_NTFS_DEBUG_H + +#include <linux/fs.h> + +#include "runlist.h" + +#ifdef DEBUG + +extern int debug_msgs; + +extern __printf(4, 5) +void __ntfs_debug(const char *file, int line, const char *function, + const char *format, ...); +/** + * ntfs_debug - write a debug level message to syslog + * @f: a printf format string containing the message + * @...: the variables to substitute into @f + * + * ntfs_debug() writes a DEBUG level message to the syslog but only if the + * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP. + */ +#define ntfs_debug(f, a...) \ + __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) + +extern void ntfs_debug_dump_runlist(const runlist_element *rl); + +#else /* !DEBUG */ + +#define ntfs_debug(fmt, ...) \ +do { \ + if (0) \ + no_printk(fmt, ##__VA_ARGS__); \ +} while (0) + +#define ntfs_debug_dump_runlist(rl) do {} while (0) + +#endif /* !DEBUG */ + +extern __printf(3, 4) +void __ntfs_warning(const char *function, const struct super_block *sb, + const char *fmt, ...); +#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) + +extern __printf(3, 4) +void __ntfs_error(const char *function, const struct super_block *sb, + const char *fmt, ...); +#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) + +#endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c new file mode 100644 index 000000000..cd96083a1 --- /dev/null +++ b/fs/ntfs/dir.c @@ -0,0 +1,1538 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/blkdev.h> + +#include "dir.h" +#include "aops.h" +#include "attrib.h" +#include "mft.h" +#include "debug.h" +#include "ntfs.h" + +/** + * The little endian Unicode string $I30 as a global constant. + */ +ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'), + cpu_to_le16('3'), cpu_to_le16('0'), 0 }; + +/** + * ntfs_lookup_inode_by_name - find an inode in a directory given its name + * @dir_ni: ntfs inode of the directory in which to search for the name + * @uname: Unicode name for which to search in the directory + * @uname_len: length of the name @uname in Unicode characters + * @res: return the found file name if necessary (see below) + * + * Look for an inode with name @uname in the directory with inode @dir_ni. + * ntfs_lookup_inode_by_name() walks the contents of the directory looking for + * the Unicode name. If the name is found in the directory, the corresponding + * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it + * is a 64-bit number containing the sequence number. + * + * On error, a negative value is returned corresponding to the error code. In + * particular if the inode is not found -ENOENT is returned. Note that you + * can't just check the return value for being negative, you have to check the + * inode number for being negative which you can extract using MREC(return + * value). + * + * Note, @uname_len does not include the (optional) terminating NULL character. + * + * Note, we look for a case sensitive match first but we also look for a case + * insensitive match at the same time. If we find a case insensitive match, we + * save that for the case that we don't find an exact match, where we return + * the case insensitive match and setup @res (which we allocate!) with the mft + * reference, the file name type, length and with a copy of the little endian + * Unicode file name itself. If we match a file name which is in the DOS name + * space, we only return the mft reference and file name type in @res. + * ntfs_lookup() then uses this to find the long file name in the inode itself. + * This is to avoid polluting the dcache with short file names. We want them to + * work but we don't care for how quickly one can access them. This also fixes + * the dcache aliasing issues. + * + * Locking: - Caller must hold i_mutex on the directory. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, + const int uname_len, ntfs_name **res) +{ + ntfs_volume *vol = dir_ni->vol; + struct super_block *sb = vol->sb; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end; + u64 mref; + ntfs_attr_search_ctx *ctx; + int err, rc; + VCN vcn, old_vcn; + struct address_space *ia_mapping; + struct page *page; + u8 *kaddr; + ntfs_name *name = NULL; + + BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode)); + BUG_ON(NInoAttr(dir_ni)); + /* Get hold of the mft record for the directory. */ + m = map_mft_record(dir_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return ERR_MREF(PTR_ERR(m)); + } + ctx = ntfs_attr_get_search_ctx(dir_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in " + "directory inode 0x%lx.", + dir_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it's been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) + goto dir_err_out; + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * We perform a case sensitive comparison and if that matches + * we are done and return the mft reference of the inode (i.e. + * the inode number together with the sequence number for + * consistency checking). We convert it to cpu format before + * returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { +found_it: + /* + * We have a perfect match, so we don't need to care + * about having matched imperfectly before, so we can + * free name and set *res to NULL. + * However, if the perfect match is a short file name, + * we need to signal this through *res, so that + * ntfs_lookup() can fix dcache aliasing issues. + * As an optimization we just reuse an existing + * allocation of *res. + */ + if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { + if (!name) { + name = kmalloc(sizeof(ntfs_name), + GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto err_out; + } + } + name->mref = le64_to_cpu( + ie->data.dir.indexed_file); + name->type = FILE_NAME_DOS; + name->len = 0; + *res = name; + } else { + kfree(name); + *res = NULL; + } + mref = le64_to_cpu(ie->data.dir.indexed_file); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return mref; + } + /* + * For a case insensitive mount, we also perform a case + * insensitive comparison (provided the file name is not in the + * POSIX namespace). If the comparison matches, and the name is + * in the WIN32 namespace, we cache the filename in *res so + * that the caller, ntfs_lookup(), can work on it. If the + * comparison matches, and the name is in the DOS namespace, we + * only cache the mft reference and the file name type (we set + * the name length to zero for simplicity). + */ + if (!NVolCaseSensitive(vol) && + ie->key.file_name.file_name_type && + ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, vol->upcase_len)) { + int name_size = sizeof(ntfs_name); + u8 type = ie->key.file_name.file_name_type; + u8 len = ie->key.file_name.file_name_length; + + /* Only one case insensitive matching name allowed. */ + if (name) { + ntfs_error(sb, "Found already allocated name " + "in phase 1. Please run chkdsk " + "and if that doesn't find any " + "errors please report you saw " + "this message to " + "linux-ntfs-dev@lists." + "sourceforge.net."); + goto dir_err_out; + } + + if (type != FILE_NAME_DOS) + name_size += len * sizeof(ntfschar); + name = kmalloc(name_size, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto err_out; + } + name->mref = le64_to_cpu(ie->data.dir.indexed_file); + name->type = type; + if (type != FILE_NAME_DOS) { + name->len = len; + memcpy(name->name, ie->key.file_name.file_name, + len * sizeof(ntfschar)); + } else + name->len = 0; + *res = name; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it; + } + /* + * We have finished with this index without success. Check for the + * presence of a child node and if not present return -ENOENT, unless + * we have got a matching name cached in name in which case return the + * mft reference associated with it. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + if (name) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return name->mref; + } + ntfs_debug("Entry not found."); + err = -ENOENT; + goto err_out; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(dir_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Directory inode 0x%lx is " + "corrupt or driver bug.", dir_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + ia_mapping = VFS_I(dir_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + m = NULL; + ctx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map directory index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", dir_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug.", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + dir_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + dir_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds check. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) { + ntfs_error(sb, "Index entry out of bounds in " + "directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * We perform a case sensitive comparison and if that matches + * we are done and return the mft reference of the inode (i.e. + * the inode number together with the sequence number for + * consistency checking). We convert it to cpu format before + * returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + CASE_SENSITIVE, vol->upcase, vol->upcase_len)) { +found_it2: + /* + * We have a perfect match, so we don't need to care + * about having matched imperfectly before, so we can + * free name and set *res to NULL. + * However, if the perfect match is a short file name, + * we need to signal this through *res, so that + * ntfs_lookup() can fix dcache aliasing issues. + * As an optimization we just reuse an existing + * allocation of *res. + */ + if (ie->key.file_name.file_name_type == FILE_NAME_DOS) { + if (!name) { + name = kmalloc(sizeof(ntfs_name), + GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto unm_err_out; + } + } + name->mref = le64_to_cpu( + ie->data.dir.indexed_file); + name->type = FILE_NAME_DOS; + name->len = 0; + *res = name; + } else { + kfree(name); + *res = NULL; + } + mref = le64_to_cpu(ie->data.dir.indexed_file); + unlock_page(page); + ntfs_unmap_page(page); + return mref; + } + /* + * For a case insensitive mount, we also perform a case + * insensitive comparison (provided the file name is not in the + * POSIX namespace). If the comparison matches, and the name is + * in the WIN32 namespace, we cache the filename in *res so + * that the caller, ntfs_lookup(), can work on it. If the + * comparison matches, and the name is in the DOS namespace, we + * only cache the mft reference and the file name type (we set + * the name length to zero for simplicity). + */ + if (!NVolCaseSensitive(vol) && + ie->key.file_name.file_name_type && + ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, + IGNORE_CASE, vol->upcase, vol->upcase_len)) { + int name_size = sizeof(ntfs_name); + u8 type = ie->key.file_name.file_name_type; + u8 len = ie->key.file_name.file_name_length; + + /* Only one case insensitive matching name allowed. */ + if (name) { + ntfs_error(sb, "Found already allocated name " + "in phase 2. Please run chkdsk " + "and if that doesn't find any " + "errors please report you saw " + "this message to " + "linux-ntfs-dev@lists." + "sourceforge.net."); + unlock_page(page); + ntfs_unmap_page(page); + goto dir_err_out; + } + + if (type != FILE_NAME_DOS) + name_size += len * sizeof(ntfschar); + name = kmalloc(name_size, GFP_NOFS); + if (!name) { + err = -ENOMEM; + goto unm_err_out; + } + name->mref = le64_to_cpu(ie->data.dir.indexed_file); + name->type = type; + if (type != FILE_NAME_DOS) { + name->len = len; + memcpy(name->name, ie->key.file_name.file_name, + len * sizeof(ntfschar)); + } else + name->len = 0; + *res = name; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it2; + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node. + */ + if (ie->flags & INDEX_ENTRY_NODE) { + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in " + "a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((sle64*)((u8*)ie + + le16_to_cpu(ie->length) - 8)); + if (vcn >= 0) { + /* If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in directory inode " + "0x%lx.", dir_ni->mft_no); + goto unm_err_out; + } + /* + * No child node present, return -ENOENT, unless we have got a matching + * name cached in name in which case return the mft reference + * associated with it. + */ + if (name) { + unlock_page(page); + ntfs_unmap_page(page); + return name->mref; + } + ntfs_debug("Entry not found."); + err = -ENOENT; +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(dir_ni); + if (name) { + kfree(name); + *res = NULL; + } + return ERR_MREF(err); +dir_err_out: + ntfs_error(sb, "Corrupt directory. Aborting lookup."); + goto err_out; +} + +#if 0 + +// TODO: (AIA) +// The algorithm embedded in this code will be required for the time when we +// want to support adding of entries to directories, where we require correct +// collation of file names in order not to cause corruption of the filesystem. + +/** + * ntfs_lookup_inode_by_name - find an inode in a directory given its name + * @dir_ni: ntfs inode of the directory in which to search for the name + * @uname: Unicode name for which to search in the directory + * @uname_len: length of the name @uname in Unicode characters + * + * Look for an inode with name @uname in the directory with inode @dir_ni. + * ntfs_lookup_inode_by_name() walks the contents of the directory looking for + * the Unicode name. If the name is found in the directory, the corresponding + * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it + * is a 64-bit number containing the sequence number. + * + * On error, a negative value is returned corresponding to the error code. In + * particular if the inode is not found -ENOENT is returned. Note that you + * can't just check the return value for being negative, you have to check the + * inode number for being negative which you can extract using MREC(return + * value). + * + * Note, @uname_len does not include the (optional) terminating NULL character. + */ +u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname, + const int uname_len) +{ + ntfs_volume *vol = dir_ni->vol; + struct super_block *sb = vol->sb; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end; + u64 mref; + ntfs_attr_search_ctx *ctx; + int err, rc; + IGNORE_CASE_BOOL ic; + VCN vcn, old_vcn; + struct address_space *ia_mapping; + struct page *page; + u8 *kaddr; + + /* Get hold of the mft record for the directory. */ + m = map_mft_record(dir_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return ERR_MREF(PTR_ERR(m)); + } + ctx = ntfs_attr_get_search_ctx(dir_ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in " + "directory inode 0x%lx.", + dir_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it's been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) + goto dir_err_out; + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * If the current entry has a name type of POSIX, the name is + * case sensitive and not otherwise. This has the effect of us + * not being able to access any POSIX file names which collate + * after the non-POSIX one when they only differ in case, but + * anyone doing screwy stuff like that deserves to burn in + * hell... Doing that kind of stuff on NT4 actually causes + * corruption on the partition even when using SP6a and Linux + * is not involved at all. + */ + ic = ie->key.file_name.file_name_type ? IGNORE_CASE : + CASE_SENSITIVE; + /* + * If the names match perfectly, we are done and return the + * mft reference of the inode (i.e. the inode number together + * with the sequence number for consistency checking. We + * convert it to cpu format before returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, ic, + vol->upcase, vol->upcase_len)) { +found_it: + mref = le64_to_cpu(ie->data.dir.indexed_file); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + return mref; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it; + } + /* + * We have finished with this index without success. Check for the + * presence of a child node. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + /* No child node, return -ENOENT. */ + err = -ENOENT; + goto err_out; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(dir_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Directory inode 0x%lx is " + "corrupt or driver bug.", dir_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); + ia_mapping = VFS_I(dir_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(dir_ni); + m = NULL; + ctx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map directory index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", dir_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug.", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)vcn, dir_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + dir_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + dir_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + dir_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + dir_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)vcn, dir_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds check. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end) { + ntfs_error(sb, "Index entry out of bounds in " + "directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a name. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* + * If the current entry has a name type of POSIX, the name is + * case sensitive and not otherwise. This has the effect of us + * not being able to access any POSIX file names which collate + * after the non-POSIX one when they only differ in case, but + * anyone doing screwy stuff like that deserves to burn in + * hell... Doing that kind of stuff on NT4 actually causes + * corruption on the partition even when using SP6a and Linux + * is not involved at all. + */ + ic = ie->key.file_name.file_name_type ? IGNORE_CASE : + CASE_SENSITIVE; + /* + * If the names match perfectly, we are done and return the + * mft reference of the inode (i.e. the inode number together + * with the sequence number for consistency checking. We + * convert it to cpu format before returning. + */ + if (ntfs_are_names_equal(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, ic, + vol->upcase, vol->upcase_len)) { +found_it2: + mref = le64_to_cpu(ie->data.dir.indexed_file); + unlock_page(page); + ntfs_unmap_page(page); + return mref; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + IGNORE_CASE, vol->upcase, vol->upcase_len); + /* + * If uname collates before the name of the current entry, there + * is definitely no such name in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* The names are not equal, continue the search. */ + if (rc) + continue; + /* + * Names match with case insensitive comparison, now try the + * case sensitive comparison, which is required for proper + * collation. + */ + rc = ntfs_collate_names(uname, uname_len, + (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, 1, + CASE_SENSITIVE, vol->upcase, vol->upcase_len); + if (rc == -1) + break; + if (rc) + continue; + /* + * Perfect match, this will never happen as the + * ntfs_are_names_equal() call will have gotten a match but we + * still treat it correctly. + */ + goto found_it2; + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node. + */ + if (ie->flags & INDEX_ENTRY_NODE) { + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in " + "a leaf node in directory inode 0x%lx.", + dir_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8); + if (vcn >= 0) { + /* If vcn is in the same page cache page as old_vcn we + * recycle the mapped page. */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in directory inode " + "0x%lx.", dir_ni->mft_no); + goto unm_err_out; + } + /* No child node, return -ENOENT. */ + ntfs_debug("Entry not found."); + err = -ENOENT; +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(dir_ni); + return ERR_MREF(err); +dir_err_out: + ntfs_error(sb, "Corrupt directory. Aborting lookup."); + goto err_out; +} + +#endif + +/** + * ntfs_filldir - ntfs specific filldir method + * @vol: current ntfs volume + * @ndir: ntfs inode of current directory + * @ia_page: page in which the index allocation buffer @ie is in resides + * @ie: current index entry + * @name: buffer to use for the converted name + * @actor: what to feed the entries to + * + * Convert the Unicode @name to the loaded NLS and pass it to the @filldir + * callback. + * + * If @ia_page is not NULL it is the locked page containing the index + * allocation block containing the index entry @ie. + * + * Note, we drop (and then reacquire) the page lock on @ia_page across the + * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup + * since ntfs_lookup() will lock the same page. As an optimization, we do not + * retake the lock if we are returning a non-zero value as ntfs_readdir() + * would need to drop the lock immediately anyway. + */ +static inline int ntfs_filldir(ntfs_volume *vol, + ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie, + u8 *name, struct dir_context *actor) +{ + unsigned long mref; + int name_len; + unsigned dt_type; + FILE_NAME_TYPE_FLAGS name_type; + + name_type = ie->key.file_name.file_name_type; + if (name_type == FILE_NAME_DOS) { + ntfs_debug("Skipping DOS name space entry."); + return 0; + } + if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) { + ntfs_debug("Skipping root directory self reference entry."); + return 0; + } + if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user && + !NVolShowSystemFiles(vol)) { + ntfs_debug("Skipping system file."); + return 0; + } + name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name, + ie->key.file_name.file_name_length, &name, + NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1); + if (name_len <= 0) { + ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.", + (long long)MREF_LE(ie->data.dir.indexed_file)); + return 0; + } + if (ie->key.file_name.file_attributes & + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT) + dt_type = DT_DIR; + else + dt_type = DT_REG; + mref = MREF_LE(ie->data.dir.indexed_file); + /* + * Drop the page lock otherwise we deadlock with NFS when it calls + * ->lookup since ntfs_lookup() will lock the same page. + */ + if (ia_page) + unlock_page(ia_page); + ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode " + "0x%lx, DT_%s.", name, name_len, actor->pos, mref, + dt_type == DT_DIR ? "DIR" : "REG"); + if (!dir_emit(actor, name, name_len, mref, dt_type)) + return 1; + /* Relock the page but not if we are aborting ->readdir. */ + if (ia_page) + lock_page(ia_page); + return 0; +} + +/* + * We use the same basic approach as the old NTFS driver, i.e. we parse the + * index root entries and then the index allocation entries that are marked + * as in use in the index bitmap. + * + * While this will return the names in random order this doesn't matter for + * ->readdir but OTOH results in a faster ->readdir. + * + * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS + * parts (e.g. ->f_pos and ->i_size, and it also protects against directory + * modifications). + * + * Locking: - Caller must hold i_mutex on the directory. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +static int ntfs_readdir(struct file *file, struct dir_context *actor) +{ + s64 ia_pos, ia_start, prev_ia_pos, bmp_pos; + loff_t i_size; + struct inode *bmp_vi, *vdir = file_inode(file); + struct super_block *sb = vdir->i_sb; + ntfs_inode *ndir = NTFS_I(vdir); + ntfs_volume *vol = NTFS_SB(sb); + MFT_RECORD *m; + INDEX_ROOT *ir = NULL; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *name = NULL; + int rc, err, ir_pos, cur_bmp_pos; + struct address_space *ia_mapping, *bmp_mapping; + struct page *bmp_page = NULL, *ia_page = NULL; + u8 *kaddr, *bmp, *index_end; + ntfs_attr_search_ctx *ctx; + + ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.", + vdir->i_ino, actor->pos); + rc = err = 0; + /* Are we at end of dir yet? */ + i_size = i_size_read(vdir); + if (actor->pos >= i_size + vol->mft_record_size) + return 0; + /* Emulate . and .. for all directories. */ + if (!dir_emit_dots(file, actor)) + return 0; + m = NULL; + ctx = NULL; + /* + * Allocate a buffer to store the current name being processed + * converted to format determined by current NLS. + */ + name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS); + if (unlikely(!name)) { + err = -ENOMEM; + goto err_out; + } + /* Are we jumping straight into the index allocation attribute? */ + if (actor->pos >= vol->mft_record_size) + goto skip_index_root; + /* Get hold of the mft record for the directory. */ + m = map_mft_record(ndir); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ndir, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + /* Get the offset into the index root attribute. */ + ir_pos = (s64)actor->pos; + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + ntfs_error(sb, "Index root attribute missing in directory " + "inode 0x%lx.", vdir->i_ino); + goto err_out; + } + /* + * Copy the index root attribute value to a buffer so that we can put + * the search context and unmap the mft record before calling the + * filldir() callback. We need to do this because of NFSd which calls + * ->lookup() from its filldir callback() and this causes NTFS to + * deadlock as ntfs_lookup() maps the mft record of the directory and + * we have got it mapped here already. The only solution is for us to + * unmap the mft record here so that a call to ntfs_lookup() is able to + * map the mft record without deadlocking. + */ + rc = le32_to_cpu(ctx->attr->data.resident.value_length); + ir = kmalloc(rc, GFP_NOFS); + if (unlikely(!ir)) { + err = -ENOMEM; + goto err_out; + } + /* Copy the index root value (it has been verified in read_inode). */ + memcpy(ir, (u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset), rc); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ndir); + ctx = NULL; + m = NULL; + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry or until filldir tells us it has had enough + * or signals an error (both covered by the rc test). + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir); + /* Bounds checks. */ + if (unlikely((u8*)ie < (u8*)ir || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end)) + goto err_out; + /* The last entry cannot contain a name. */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Skip index root entry if continuing previous readdir. */ + if (ir_pos > (u8*)ie - (u8*)ir) + continue; + /* Advance the position even if going to skip the entry. */ + actor->pos = (u8*)ie - (u8*)ir; + /* Submit the name to the filldir callback. */ + rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor); + if (rc) { + kfree(ir); + goto abort; + } + } + /* We are done with the index root and can free the buffer. */ + kfree(ir); + ir = NULL; + /* If there is no index allocation attribute we are finished. */ + if (!NInoIndexAllocPresent(ndir)) + goto EOD; + /* Advance fpos to the beginning of the index allocation. */ + actor->pos = vol->mft_record_size; +skip_index_root: + kaddr = NULL; + prev_ia_pos = -1LL; + /* Get the offset into the index allocation attribute. */ + ia_pos = (s64)actor->pos - vol->mft_record_size; + ia_mapping = vdir->i_mapping; + ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino); + bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4); + if (IS_ERR(bmp_vi)) { + ntfs_error(sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bmp_vi); + goto err_out; + } + bmp_mapping = bmp_vi->i_mapping; + /* Get the starting bitmap bit position and sanity check it. */ + bmp_pos = ia_pos >> ndir->itype.index.block_size_bits; + if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) { + ntfs_error(sb, "Current index allocation position exceeds " + "index bitmap size."); + goto iput_err_out; + } + /* Get the starting bit position in the current bitmap page. */ + cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1); + bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1); +get_next_bmp_page: + ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx", + (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT), + (unsigned long long)bmp_pos & + (unsigned long long)((PAGE_SIZE * 8) - 1)); + bmp_page = ntfs_map_page(bmp_mapping, + bmp_pos >> (3 + PAGE_SHIFT)); + if (IS_ERR(bmp_page)) { + ntfs_error(sb, "Reading index bitmap failed."); + err = PTR_ERR(bmp_page); + bmp_page = NULL; + goto iput_err_out; + } + bmp = (u8*)page_address(bmp_page); + /* Find next index block in use. */ + while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) { +find_next_index_buffer: + cur_bmp_pos++; + /* + * If we have reached the end of the bitmap page, get the next + * page, and put away the old one. + */ + if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) { + ntfs_unmap_page(bmp_page); + bmp_pos += PAGE_SIZE * 8; + cur_bmp_pos = 0; + goto get_next_bmp_page; + } + /* If we have reached the end of the bitmap, we are done. */ + if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size)) + goto unm_EOD; + ia_pos = (bmp_pos + cur_bmp_pos) << + ndir->itype.index.block_size_bits; + } + ntfs_debug("Handling index buffer 0x%llx.", + (unsigned long long)bmp_pos + cur_bmp_pos); + /* If the current index buffer is in the same page we reuse the page. */ + if ((prev_ia_pos & (s64)PAGE_MASK) != + (ia_pos & (s64)PAGE_MASK)) { + prev_ia_pos = ia_pos; + if (likely(ia_page != NULL)) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + /* + * Map the page cache page containing the current ia_pos, + * reading it from disk if necessary. + */ + ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT); + if (IS_ERR(ia_page)) { + ntfs_error(sb, "Reading index allocation data failed."); + err = PTR_ERR(ia_page); + ia_page = NULL; + goto err_out; + } + lock_page(ia_page); + kaddr = (u8*)page_address(ia_page); + } + /* Get the current index buffer. */ + ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK & + ~(s64)(ndir->itype.index.block_size - 1))); + /* Bounds checks. */ + if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) { + ntfs_error(sb, "Out of bounds check failed. Corrupt directory " + "inode 0x%lx or driver bug.", vdir->i_ino); + goto err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Directory index record with vcn 0x%llx is " + "corrupt. Corrupt inode 0x%lx. Run chkdsk.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos & + ~(s64)(ndir->itype.index.block_size - 1)) >> + ndir->itype.index.vcn_size_bits)) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). " + "Directory inode 0x%lx is corrupt or driver " + "bug. ", (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 != + ndir->itype.index.block_size)) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx has a size (%u) differing from the " + "directory specified size (%u). Directory " + "inode is corrupt or driver bug.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino, + le32_to_cpu(ia->index.allocated_size) + 0x18, + ndir->itype.index.block_size); + goto err_out; + } + index_end = (u8*)ia + ndir->itype.index.block_size; + if (unlikely(index_end > kaddr + PAGE_SIZE)) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode " + "0x%lx crosses page boundary. Impossible! " + "Cannot access! This is probably a bug in the " + "driver.", (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1); + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory " + "inode 0x%lx exceeds maximum size.", + (unsigned long long)ia_pos >> + ndir->itype.index.vcn_size_bits, vdir->i_ino); + goto err_out; + } + /* The first index entry in this index buffer. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry or until filldir tells us it has had enough + * or signals an error (both covered by the rc test). + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + ntfs_debug("In index allocation, offset 0x%llx.", + (unsigned long long)ia_start + + (unsigned long long)((u8*)ie - (u8*)ia)); + /* Bounds checks. */ + if (unlikely((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->key_length) > + index_end)) + goto err_out; + /* The last entry cannot contain a name. */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Skip index block entry if continuing previous readdir. */ + if (ia_pos - ia_start > (u8*)ie - (u8*)ia) + continue; + /* Advance the position even if going to skip the entry. */ + actor->pos = (u8*)ie - (u8*)ia + + (sle64_to_cpu(ia->index_block_vcn) << + ndir->itype.index.vcn_size_bits) + + vol->mft_record_size; + /* + * Submit the name to the @filldir callback. Note, + * ntfs_filldir() drops the lock on @ia_page but it retakes it + * before returning, unless a non-zero value is returned in + * which case the page is left unlocked. + */ + rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor); + if (rc) { + /* @ia_page is already unlocked in this case. */ + ntfs_unmap_page(ia_page); + ntfs_unmap_page(bmp_page); + iput(bmp_vi); + goto abort; + } + } + goto find_next_index_buffer; +unm_EOD: + if (ia_page) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + ntfs_unmap_page(bmp_page); + iput(bmp_vi); +EOD: + /* We are finished, set fpos to EOD. */ + actor->pos = i_size + vol->mft_record_size; +abort: + kfree(name); + return 0; +err_out: + if (bmp_page) { + ntfs_unmap_page(bmp_page); +iput_err_out: + iput(bmp_vi); + } + if (ia_page) { + unlock_page(ia_page); + ntfs_unmap_page(ia_page); + } + kfree(ir); + kfree(name); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ndir); + if (!err) + err = -EIO; + ntfs_debug("Failed. Returning error code %i.", -err); + return err; +} + +/** + * ntfs_dir_open - called when an inode is about to be opened + * @vi: inode to be opened + * @filp: file structure describing the inode + * + * Limit directory size to the page cache limit on architectures where unsigned + * long is 32-bits. This is the most we can do for now without overflowing the + * page cache page index. Doing it this way means we don't run into problems + * because of existing too large directories. It would be better to allow the + * user to read the accessible part of the directory but I doubt very much + * anyone is going to hit this check on a 32-bit architecture, so there is no + * point in adding the extra complexity required to support this. + * + * On 64-bit architectures, the check is hopefully optimized away by the + * compiler. + */ +static int ntfs_dir_open(struct inode *vi, struct file *filp) +{ + if (sizeof(unsigned long) < 8) { + if (i_size_read(vi) > MAX_LFS_FILESIZE) + return -EFBIG; + } + return 0; +} + +#ifdef NTFS_RW + +/** + * ntfs_dir_fsync - sync a directory to disk + * @filp: directory to be synced + * @dentry: dentry describing the directory to sync + * @datasync: if non-zero only flush user data and not metadata + * + * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and + * msync system calls. This function is based on file.c::ntfs_file_fsync(). + * + * Write the mft record and all associated extent mft records as well as the + * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device. + * + * If @datasync is true, we do not wait on the inode(s) to be written out + * but we always wait on the page cache pages to be written out. + * + * Note: In the past @filp could be NULL so we ignore it as we don't need it + * anyway. + * + * Locking: Caller must hold i_mutex on the inode. + * + * TODO: We should probably also write all attribute/index inodes associated + * with this inode but since we have no simple way of getting to them we ignore + * this problem for now. We do write the $BITMAP attribute if it is present + * which is the important one for a directory so things are not too bad. + */ +static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *bmp_vi, *vi = filp->f_mapping->host; + int err, ret; + ntfs_attr na; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + + err = file_write_and_wait_range(filp, start, end); + if (err) + return err; + inode_lock(vi); + + BUG_ON(!S_ISDIR(vi->i_mode)); + /* If the bitmap attribute inode is in memory sync it, too. */ + na.mft_no = vi->i_ino; + na.type = AT_BITMAP; + na.name = I30; + na.name_len = 4; + bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na); + if (bmp_vi) { + write_inode_now(bmp_vi, !datasync); + iput(bmp_vi); + } + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " + "%u.", datasync ? "data" : "", vi->i_ino, -ret); + inode_unlock(vi); + return ret; +} + +#endif /* NTFS_RW */ + +const struct file_operations ntfs_dir_ops = { + .llseek = generic_file_llseek, /* Seek inside directory. */ + .read = generic_read_dir, /* Return -EISDIR. */ + .iterate = ntfs_readdir, /* Read directory contents. */ +#ifdef NTFS_RW + .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */ +#endif /* NTFS_RW */ + /*.ioctl = ,*/ /* Perform function on the + mounted filesystem. */ + .open = ntfs_dir_open, /* Open directory. */ +}; diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h new file mode 100644 index 000000000..0e326753d --- /dev/null +++ b/fs/ntfs/dir.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2002-2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_DIR_H +#define _LINUX_NTFS_DIR_H + +#include "layout.h" +#include "inode.h" +#include "types.h" + +/* + * ntfs_name is used to return the file name to the caller of + * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup()) + * to be able to deal with dcache aliasing issues. + */ +typedef struct { + MFT_REF mref; + FILE_NAME_TYPE_FLAGS type; + u8 len; + ntfschar name[0]; +} __attribute__ ((__packed__)) ntfs_name; + +/* The little endian Unicode string $I30 as a global constant. */ +extern ntfschar I30[5]; + +extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, + const ntfschar *uname, const int uname_len, ntfs_name **res); + +#endif /* _LINUX_NTFS_FS_DIR_H */ diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h new file mode 100644 index 000000000..f30c139bf --- /dev/null +++ b/fs/ntfs/endian.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * endian.h - Defines for endianness handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_ENDIAN_H +#define _LINUX_NTFS_ENDIAN_H + +#include <asm/byteorder.h> +#include "types.h" + +/* + * Signed endianness conversion functions. + */ + +static inline s16 sle16_to_cpu(sle16 x) +{ + return le16_to_cpu((__force le16)x); +} + +static inline s32 sle32_to_cpu(sle32 x) +{ + return le32_to_cpu((__force le32)x); +} + +static inline s64 sle64_to_cpu(sle64 x) +{ + return le64_to_cpu((__force le64)x); +} + +static inline s16 sle16_to_cpup(sle16 *x) +{ + return le16_to_cpu(*(__force le16*)x); +} + +static inline s32 sle32_to_cpup(sle32 *x) +{ + return le32_to_cpu(*(__force le32*)x); +} + +static inline s64 sle64_to_cpup(sle64 *x) +{ + return le64_to_cpu(*(__force le64*)x); +} + +static inline sle16 cpu_to_sle16(s16 x) +{ + return (__force sle16)cpu_to_le16(x); +} + +static inline sle32 cpu_to_sle32(s32 x) +{ + return (__force sle32)cpu_to_le32(x); +} + +static inline sle64 cpu_to_sle64(s64 x) +{ + return (__force sle64)cpu_to_le64(x); +} + +static inline sle16 cpu_to_sle16p(s16 *x) +{ + return (__force sle16)cpu_to_le16(*x); +} + +static inline sle32 cpu_to_sle32p(s32 *x) +{ + return (__force sle32)cpu_to_le32(*x); +} + +static inline sle64 cpu_to_sle64p(s64 *x) +{ + return (__force sle64)cpu_to_le64(*x); +} + +#endif /* _LINUX_NTFS_ENDIAN_H */ diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c new file mode 100644 index 000000000..c481b14e4 --- /dev/null +++ b/fs/ntfs/file.c @@ -0,0 +1,2006 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. + */ + +#include <linux/blkdev.h> +#include <linux/backing-dev.h> +#include <linux/buffer_head.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/pagevec.h> +#include <linux/sched/signal.h> +#include <linux/swap.h> +#include <linux/uio.h> +#include <linux/writeback.h> + +#include <asm/page.h> +#include <linux/uaccess.h> + +#include "attrib.h" +#include "bitmap.h" +#include "inode.h" +#include "debug.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" + +/** + * ntfs_file_open - called when an inode is about to be opened + * @vi: inode to be opened + * @filp: file structure describing the inode + * + * Limit file size to the page cache limit on architectures where unsigned long + * is 32-bits. This is the most we can do for now without overflowing the page + * cache page index. Doing it this way means we don't run into problems because + * of existing too large files. It would be better to allow the user to read + * the beginning of the file but I doubt very much anyone is going to hit this + * check on a 32-bit architecture, so there is no point in adding the extra + * complexity required to support this. + * + * On 64-bit architectures, the check is hopefully optimized away by the + * compiler. + * + * After the check passes, just call generic_file_open() to do its work. + */ +static int ntfs_file_open(struct inode *vi, struct file *filp) +{ + if (sizeof(unsigned long) < 8) { + if (i_size_read(vi) > MAX_LFS_FILESIZE) + return -EOVERFLOW; + } + return generic_file_open(vi, filp); +} + +#ifdef NTFS_RW + +/** + * ntfs_attr_extend_initialized - extend the initialized size of an attribute + * @ni: ntfs inode of the attribute to extend + * @new_init_size: requested new initialized size in bytes + * + * Extend the initialized size of an attribute described by the ntfs inode @ni + * to @new_init_size bytes. This involves zeroing any non-sparse space between + * the old initialized size and @new_init_size both in the page cache and on + * disk (if relevant complete pages are already uptodate in the page cache then + * these are simply marked dirty). + * + * As a side-effect, the file size (vfs inode->i_size) may be incremented as, + * in the resident attribute case, it is tied to the initialized size and, in + * the non-resident attribute case, it may not fall below the initialized size. + * + * Note that if the attribute is resident, we do not need to touch the page + * cache at all. This is because if the page cache page is not uptodate we + * bring it uptodate later, when doing the write to the mft record since we + * then already have the page mapped. And if the page is uptodate, the + * non-initialized region will already have been zeroed when the page was + * brought uptodate and the region may in fact already have been overwritten + * with new data via mmap() based writes, so we cannot just zero it. And since + * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped + * is unspecified, we choose not to do zeroing and thus we do not need to touch + * the page at all. For a more detailed explanation see ntfs_truncate() in + * fs/ntfs/inode.c. + * + * Return 0 on success and -errno on error. In the case that an error is + * encountered it is possible that the initialized size will already have been + * incremented some way towards @new_init_size but it is guaranteed that if + * this is the case, the necessary zeroing will also have happened and that all + * metadata is self-consistent. + * + * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be + * held by the caller. + */ +static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size) +{ + s64 old_init_size; + loff_t old_i_size; + pgoff_t index, end_index; + unsigned long flags; + struct inode *vi = VFS_I(ni); + ntfs_inode *base_ni; + MFT_RECORD *m = NULL; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx = NULL; + struct address_space *mapping; + struct page *page = NULL; + u8 *kattr; + int err; + u32 attr_len; + + read_lock_irqsave(&ni->size_lock, flags); + old_init_size = ni->initialized_size; + old_i_size = i_size_read(vi); + BUG_ON(new_init_size > ni->allocated_size); + read_unlock_irqrestore(&ni->size_lock, flags); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " + "old_initialized_size 0x%llx, " + "new_initialized_size 0x%llx, i_size 0x%llx.", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + (unsigned long long)old_init_size, + (unsigned long long)new_init_size, old_i_size); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Use goto to reduce indentation and we need the label below anyway. */ + if (NInoNonResident(ni)) + goto do_non_resident_extend; + BUG_ON(old_init_size != old_i_size); + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(a->non_resident); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + BUG_ON(old_i_size != (loff_t)attr_len); + /* + * Do the zeroing in the mft record and update the attribute size in + * the mft record. + */ + kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); + memset(kattr + attr_len, 0, new_init_size - attr_len); + a->data.resident.value_length = cpu_to_le32((u32)new_init_size); + /* Finally, update the sizes in the vfs and ntfs inodes. */ + write_lock_irqsave(&ni->size_lock, flags); + i_size_write(vi, new_init_size); + ni->initialized_size = new_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); + goto done; +do_non_resident_extend: + /* + * If the new initialized size @new_init_size exceeds the current file + * size (vfs inode->i_size), we need to extend the file size to the + * new initialized size. + */ + if (new_init_size > old_i_size) { + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(!a->non_resident); + BUG_ON(old_i_size != (loff_t) + sle64_to_cpu(a->data.non_resident.data_size)); + a->data.non_resident.data_size = cpu_to_sle64(new_init_size); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* Update the file size in the vfs inode. */ + i_size_write(vi, new_init_size); + ntfs_attr_put_search_ctx(ctx); + ctx = NULL; + unmap_mft_record(base_ni); + m = NULL; + } + mapping = vi->i_mapping; + index = old_init_size >> PAGE_SHIFT; + end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + do { + /* + * Read the page. If the page is not present, this will zero + * the uninitialized regions for us. + */ + page = read_mapping_page(mapping, index, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto init_err_out; + } + /* + * Update the initialized size in the ntfs inode. This is + * enough to make ntfs_writepage() work. + */ + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT; + if (ni->initialized_size > new_init_size) + ni->initialized_size = new_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); + /* Set the page dirty so it gets written out. */ + set_page_dirty(page); + put_page(page); + /* + * Play nice with the vm and the rest of the system. This is + * very much needed as we can potentially be modifying the + * initialised size from a very small value to a really huge + * value, e.g. + * f = open(somefile, O_TRUNC); + * truncate(f, 10GiB); + * seek(f, 10GiB); + * write(f, 1); + * And this would mean we would be marking dirty hundreds of + * thousands of pages or as in the above example more than + * two and a half million pages! + * + * TODO: For sparse pages could optimize this workload by using + * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This + * would be set in read_folio for sparse pages and here we would + * not need to mark dirty any pages which have this bit set. + * The only caveat is that we have to clear the bit everywhere + * where we allocate any clusters that lie in the page or that + * contain the page. + * + * TODO: An even greater optimization would be for us to only + * call read_folio() on pages which are not in sparse regions as + * determined from the runlist. This would greatly reduce the + * number of pages we read and make dirty in the case of sparse + * files. + */ + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } while (++index < end_index); + read_lock_irqsave(&ni->size_lock, flags); + BUG_ON(ni->initialized_size != new_init_size); + read_unlock_irqrestore(&ni->size_lock, flags); + /* Now bring in sync the initialized_size in the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + goto init_err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto init_err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto init_err_out; + } + m = ctx->mrec; + a = ctx->attr; + BUG_ON(!a->non_resident); + a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size); +done: + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.", + (unsigned long long)new_init_size, i_size_read(vi)); + return 0; +init_err_out: + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = old_init_size; + write_unlock_irqrestore(&ni->size_lock, flags); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_debug("Failed. Returning error code %i.", err); + return err; +} + +static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, + struct iov_iter *from) +{ + loff_t pos; + s64 end, ll; + ssize_t err; + unsigned long flags; + struct file *file = iocb->ki_filp; + struct inode *vi = file_inode(file); + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " + "0x%llx, count 0x%zx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)iocb->ki_pos, + iov_iter_count(from)); + err = generic_write_checks(iocb, from); + if (unlikely(err <= 0)) + goto out; + /* + * All checks have passed. Before we start doing any writing we want + * to abort any totally illegal writes. + */ + BUG_ON(NInoMstProtected(ni)); + BUG_ON(ni->type != AT_DATA); + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + /* Only $DATA attributes can be encrypted. */ + /* + * Reminder for later: Encrypted files are _always_ + * non-resident so that the content can always be encrypted. + */ + ntfs_debug("Denying write access to encrypted file."); + err = -EACCES; + goto out; + } + if (NInoCompressed(ni)) { + /* Only unnamed $DATA attribute can be compressed. */ + BUG_ON(ni->name_len); + /* + * Reminder for later: If resident, the data is not actually + * compressed. Only on the switch to non-resident does + * compression kick in. This is in contrast to encrypted files + * (see above). + */ + ntfs_error(vi->i_sb, "Writing to compressed files is not " + "implemented yet. Sorry."); + err = -EOPNOTSUPP; + goto out; + } + err = file_remove_privs(file); + if (unlikely(err)) + goto out; + /* + * Our ->update_time method always succeeds thus file_update_time() + * cannot fail either so there is no need to check the return code. + */ + file_update_time(file); + pos = iocb->ki_pos; + /* The first byte after the last cluster being written to. */ + end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & + ~(u64)vol->cluster_size_mask; + /* + * If the write goes beyond the allocated size, extend the allocation + * to cover the whole of the write, rounded up to the nearest cluster. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (end > ll) { + /* + * Extend the allocation without changing the data size. + * + * Note we ensure the allocation is big enough to at least + * write some data but we do not require the allocation to be + * complete, i.e. it may be partial. + */ + ll = ntfs_attr_extend_allocation(ni, end, -1, pos); + if (likely(ll >= 0)) { + BUG_ON(pos >= ll); + /* If the extension was partial truncate the write. */ + if (end > ll) { + ntfs_debug("Truncating write to inode 0x%lx, " + "attribute type 0x%x, because " + "the allocation was only " + "partially extended.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + iov_iter_truncate(from, ll - pos); + } + } else { + err = ll; + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* Perform a partial write if possible or fail. */ + if (pos < ll) { + ntfs_debug("Truncating write to inode 0x%lx " + "attribute type 0x%x, because " + "extending the allocation " + "failed (error %d).", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type), + (int)-err); + iov_iter_truncate(from, ll - pos); + } else { + if (err != -ENOSPC) + ntfs_error(vi->i_sb, "Cannot perform " + "write to inode " + "0x%lx, attribute " + "type 0x%x, because " + "extending the " + "allocation failed " + "(error %ld).", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type), + (long)-err); + else + ntfs_debug("Cannot perform write to " + "inode 0x%lx, " + "attribute type 0x%x, " + "because there is not " + "space left.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + goto out; + } + } + } + /* + * If the write starts beyond the initialized size, extend it up to the + * beginning of the write and initialize all non-sparse space between + * the old initialized size and the new one. This automatically also + * increments the vfs inode->i_size to keep it above or equal to the + * initialized_size. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (pos > ll) { + /* + * Wait for ongoing direct i/o to complete before proceeding. + * New direct i/o cannot start as we hold i_mutex. + */ + inode_dio_wait(vi); + err = ntfs_attr_extend_initialized(ni, pos); + if (unlikely(err < 0)) + ntfs_error(vi->i_sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "extending the initialized size " + "failed (error %d).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (int)-err); + } +out: + return err; +} + +/** + * __ntfs_grab_cache_pages - obtain a number of locked pages + * @mapping: address space mapping from which to obtain page cache pages + * @index: starting index in @mapping at which to begin obtaining pages + * @nr_pages: number of page cache pages to obtain + * @pages: array of pages in which to return the obtained page cache pages + * @cached_page: allocated but as yet unused page + * + * Obtain @nr_pages locked page cache pages from the mapping @mapping and + * starting at index @index. + * + * If a page is newly created, add it to lru list + * + * Note, the page locks are obtained in ascending page index order. + */ +static inline int __ntfs_grab_cache_pages(struct address_space *mapping, + pgoff_t index, const unsigned nr_pages, struct page **pages, + struct page **cached_page) +{ + int err, nr; + + BUG_ON(!nr_pages); + err = nr = 0; + do { + pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | + FGP_ACCESSED); + if (!pages[nr]) { + if (!*cached_page) { + *cached_page = page_cache_alloc(mapping); + if (unlikely(!*cached_page)) { + err = -ENOMEM; + goto err_out; + } + } + err = add_to_page_cache_lru(*cached_page, mapping, + index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); + if (unlikely(err)) { + if (err == -EEXIST) + continue; + goto err_out; + } + pages[nr] = *cached_page; + *cached_page = NULL; + } + index++; + nr++; + } while (nr < nr_pages); +out: + return err; +err_out: + while (nr > 0) { + unlock_page(pages[--nr]); + put_page(pages[nr]); + } + goto out; +} + +static inline void ntfs_submit_bh_for_read(struct buffer_head *bh) +{ + lock_buffer(bh); + get_bh(bh); + bh->b_end_io = end_buffer_read_sync; + submit_bh(REQ_OP_READ, bh); +} + +/** + * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * This is called for non-resident attributes from ntfs_file_buffered_write() + * with i_mutex held on the inode (@pages[0]->mapping->host). There are + * @nr_pages pages in @pages which are locked but not kmap()ped. The source + * data has not yet been copied into the @pages. + * + * Need to fill any holes with actual clusters, allocate buffers if necessary, + * ensure all the buffers are mapped, and bring uptodate any buffers that are + * only partially being written to. + * + * If @nr_pages is greater than one, we are guaranteed that the cluster size is + * greater than PAGE_SIZE, that all pages in @pages are entirely inside + * the same cluster and that they are the entirety of that cluster, and that + * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole. + * + * i_size is not to be modified yet. + * + * Return 0 on success or -errno on error. + */ +static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, + unsigned nr_pages, s64 pos, size_t bytes) +{ + VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend; + LCN lcn; + s64 bh_pos, vcn_len, end, initialized_size; + sector_t lcn_block; + struct page *page; + struct inode *vi; + ntfs_inode *ni, *base_ni = NULL; + ntfs_volume *vol; + runlist_element *rl, *rl2; + struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *m = NULL; + ATTR_RECORD *a = NULL; + unsigned long flags; + u32 attr_rec_len = 0; + unsigned blocksize, u; + int err, mp_size; + bool rl_write_locked, was_hole, is_retry; + unsigned char blocksize_bits; + struct { + u8 runlist_merged:1; + u8 mft_attr_mapped:1; + u8 mp_rebuilt:1; + u8 attr_switched:1; + } status = { 0, 0, 0, 0 }; + + BUG_ON(!nr_pages); + BUG_ON(!pages); + BUG_ON(!*pages); + vi = pages[0]->mapping->host; + ni = NTFS_I(vi); + vol = ni->vol; + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " + "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", + vi->i_ino, ni->type, pages[0]->index, nr_pages, + (long long)pos, bytes); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + u = 0; + do { + page = pages[u]; + BUG_ON(!page); + /* + * create_empty_buffers() will create uptodate/dirty buffers if + * the page is uptodate/dirty. + */ + if (!page_has_buffers(page)) { + create_empty_buffers(page, blocksize, 0); + if (unlikely(!page_has_buffers(page))) + return -ENOMEM; + } + } while (++u < nr_pages); + rl_write_locked = false; + rl = NULL; + err = 0; + vcn = lcn = -1; + vcn_len = 0; + lcn_block = -1; + was_hole = false; + cpos = pos >> vol->cluster_size_bits; + end = pos + bytes; + cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; + /* + * Loop over each page and for each page over each buffer. Use goto to + * reduce indentation. + */ + u = 0; +do_next_page: + page = pages[u]; + bh_pos = (s64)page->index << PAGE_SHIFT; + bh = head = page_buffers(page); + do { + VCN cdelta; + s64 bh_end; + unsigned bh_cofs; + + /* Clear buffer_new on all buffers to reinitialise state. */ + if (buffer_new(bh)) + clear_buffer_new(bh); + bh_end = bh_pos + blocksize; + bh_cpos = bh_pos >> vol->cluster_size_bits; + bh_cofs = bh_pos & vol->cluster_size_mask; + if (buffer_mapped(bh)) { + /* + * The buffer is already mapped. If it is uptodate, + * ignore it. + */ + if (buffer_uptodate(bh)) + continue; + /* + * The buffer is not uptodate. If the page is uptodate + * set the buffer uptodate and otherwise ignore it. + */ + if (PageUptodate(page)) { + set_buffer_uptodate(bh); + continue; + } + /* + * Neither the page nor the buffer are uptodate. If + * the buffer is only partially being written to, we + * need to read it in before the write, i.e. now. + */ + if ((bh_pos < pos && bh_end > pos) || + (bh_pos < end && bh_end > end)) { + /* + * If the buffer is fully or partially within + * the initialized size, do an actual read. + * Otherwise, simply zero the buffer. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (bh_pos < initialized_size) { + ntfs_submit_bh_for_read(bh); + *wait_bh++ = bh; + } else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + continue; + } + /* Unmapped buffer. Need to map it. */ + bh->b_bdev = vol->sb->s_bdev; + /* + * If the current buffer is in the same clusters as the map + * cache, there is no need to check the runlist again. The + * map cache is made up of @vcn, which is the first cached file + * cluster, @vcn_len which is the number of cached file + * clusters, @lcn is the device cluster corresponding to @vcn, + * and @lcn_block is the block number corresponding to @lcn. + */ + cdelta = bh_cpos - vcn; + if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) { +map_buffer_cached: + BUG_ON(lcn < 0); + bh->b_blocknr = lcn_block + + (cdelta << (vol->cluster_size_bits - + blocksize_bits)) + + (bh_cofs >> blocksize_bits); + set_buffer_mapped(bh); + /* + * If the page is uptodate so is the buffer. If the + * buffer is fully outside the write, we ignore it if + * it was already allocated and we mark it dirty so it + * gets written out if we allocated it. On the other + * hand, if we allocated the buffer but we are not + * marking it dirty we set buffer_new so we can do + * error recovery. + */ + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (unlikely(was_hole)) { + /* We allocated the buffer. */ + clean_bdev_bh_alias(bh); + if (bh_end <= pos || bh_pos >= end) + mark_buffer_dirty(bh); + else + set_buffer_new(bh); + } + continue; + } + /* Page is _not_ uptodate. */ + if (likely(!was_hole)) { + /* + * Buffer was already allocated. If it is not + * uptodate and is only partially being written + * to, we need to read it in before the write, + * i.e. now. + */ + if (!buffer_uptodate(bh) && bh_pos < end && + bh_end > pos && + (bh_pos < pos || + bh_end > end)) { + /* + * If the buffer is fully or partially + * within the initialized size, do an + * actual read. Otherwise, simply zero + * the buffer. + */ + read_lock_irqsave(&ni->size_lock, + flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, + flags); + if (bh_pos < initialized_size) { + ntfs_submit_bh_for_read(bh); + *wait_bh++ = bh; + } else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + continue; + } + /* We allocated the buffer. */ + clean_bdev_bh_alias(bh); + /* + * If the buffer is fully outside the write, zero it, + * set it uptodate, and mark it dirty so it gets + * written out. If it is partially being written to, + * zero region surrounding the write but leave it to + * commit write to do anything else. Finally, if the + * buffer is fully being overwritten, do nothing. + */ + if (bh_end <= pos || bh_pos >= end) { + if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + mark_buffer_dirty(bh); + continue; + } + set_buffer_new(bh); + if (!buffer_uptodate(bh) && + (bh_pos < pos || bh_end > end)) { + u8 *kaddr; + unsigned pofs; + + kaddr = kmap_atomic(page); + if (bh_pos < pos) { + pofs = bh_pos & ~PAGE_MASK; + memset(kaddr + pofs, 0, pos - bh_pos); + } + if (bh_end > end) { + pofs = end & ~PAGE_MASK; + memset(kaddr + pofs, 0, bh_end - end); + } + kunmap_atomic(kaddr); + flush_dcache_page(page); + } + continue; + } + /* + * Slow path: this is the first buffer in the cluster. If it + * is outside allocated size and is not uptodate, zero it and + * set it uptodate. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (bh_pos > initialized_size) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), blocksize); + set_buffer_uptodate(bh); + } + continue; + } + is_retry = false; + if (!rl) { + down_read(&ni->runlist.lock); +retry_remap: + rl = ni->runlist.rl; + } + if (likely(rl != NULL)) { + /* Seek to element containing target cluster. */ + while (rl->length && rl[1].vcn <= bh_cpos) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos); + if (likely(lcn >= 0)) { + /* + * Successful remap, setup the map cache and + * use that to deal with the buffer. + */ + was_hole = false; + vcn = bh_cpos; + vcn_len = rl[1].vcn - vcn; + lcn_block = lcn << (vol->cluster_size_bits - + blocksize_bits); + cdelta = 0; + /* + * If the number of remaining clusters touched + * by the write is smaller or equal to the + * number of cached clusters, unlock the + * runlist as the map cache will be used from + * now on. + */ + if (likely(vcn + vcn_len >= cend)) { + if (rl_write_locked) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + } else + up_read(&ni->runlist.lock); + rl = NULL; + } + goto map_buffer_cached; + } + } else + lcn = LCN_RL_NOT_MAPPED; + /* + * If it is not a hole and not out of bounds, the runlist is + * probably unmapped so try to map it now. + */ + if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) { + if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) { + /* Attempt to map runlist. */ + if (!rl_write_locked) { + /* + * We need the runlist locked for + * writing, so if it is locked for + * reading relock it now and retry in + * case it changed whilst we dropped + * the lock. + */ + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + rl_write_locked = true; + goto retry_remap; + } + err = ntfs_map_runlist_nolock(ni, bh_cpos, + NULL); + if (likely(!err)) { + is_retry = true; + goto retry_remap; + } + /* + * If @vcn is out of bounds, pretend @lcn is + * LCN_ENOENT. As long as the buffer is out + * of bounds this will work fine. + */ + if (err == -ENOENT) { + lcn = LCN_ENOENT; + err = 0; + goto rl_not_mapped_enoent; + } + } else + err = -EIO; + /* Failed to map the buffer, even after retrying. */ + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Failed to write to inode 0x%lx, " + "attribute type 0x%x, vcn 0x%llx, " + "vcn offset 0x%x, because its " + "location on disk could not be " + "determined%s (error code %i).", + ni->mft_no, ni->type, + (unsigned long long)bh_cpos, + (unsigned)bh_pos & + vol->cluster_size_mask, + is_retry ? " even after retrying" : "", + err); + break; + } +rl_not_mapped_enoent: + /* + * The buffer is in a hole or out of bounds. We need to fill + * the hole, unless the buffer is in a cluster which is not + * touched by the write, in which case we just leave the buffer + * unmapped. This can only happen when the cluster size is + * less than the page cache size. + */ + if (unlikely(vol->cluster_size < PAGE_SIZE)) { + bh_cend = (bh_end + vol->cluster_size - 1) >> + vol->cluster_size_bits; + if ((bh_cend <= cpos || bh_cpos >= cend)) { + bh->b_blocknr = -1; + /* + * If the buffer is uptodate we skip it. If it + * is not but the page is uptodate, we can set + * the buffer uptodate. If the page is not + * uptodate, we can clear the buffer and set it + * uptodate. Whether this is worthwhile is + * debatable and this could be removed. + */ + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } else if (!buffer_uptodate(bh)) { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + continue; + } + } + /* + * Out of bounds buffer is invalid if it was not really out of + * bounds. + */ + BUG_ON(lcn != LCN_HOLE); + /* + * We need the runlist locked for writing, so if it is locked + * for reading relock it now and retry in case it changed + * whilst we dropped the lock. + */ + BUG_ON(!rl); + if (!rl_write_locked) { + up_read(&ni->runlist.lock); + down_write(&ni->runlist.lock); + rl_write_locked = true; + goto retry_remap; + } + /* Find the previous last allocated cluster. */ + BUG_ON(rl->lcn != LCN_HOLE); + lcn = -1; + rl2 = rl; + while (--rl2 >= ni->runlist.rl) { + if (rl2->lcn >= 0) { + lcn = rl2->lcn + rl2->length; + break; + } + } + rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE, + false); + if (IS_ERR(rl2)) { + err = PTR_ERR(rl2); + ntfs_debug("Failed to allocate cluster, error code %i.", + err); + break; + } + lcn = rl2->lcn; + rl = ntfs_runlists_merge(ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (err != -ENOMEM) + err = -EIO; + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to release " + "allocated cluster in error " + "code path. Run chkdsk to " + "recover the lost cluster."); + NVolSetErrors(vol); + } + ntfs_free(rl2); + break; + } + ni->runlist.rl = rl; + status.runlist_merged = 1; + ntfs_debug("Allocated cluster, lcn 0x%llx.", + (unsigned long long)lcn); + /* Map and lock the mft record and get the attribute record. */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + break; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + unmap_mft_record(base_ni); + break; + } + status.mft_attr_mapped = 1; + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, bh_cpos, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + break; + } + m = ctx->mrec; + a = ctx->attr; + /* + * Find the runlist element with which the attribute extent + * starts. Note, we cannot use the _attr_ version because we + * have mapped the mft record. That is ok because we know the + * runlist fragment must be mapped already to have ever gotten + * here, so we can just use the _rl_ version. + */ + vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn); + rl2 = ntfs_rl_find_vcn_nolock(rl, vcn); + BUG_ON(!rl2); + BUG_ON(!rl2->length); + BUG_ON(rl2->lcn < LCN_HOLE); + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + /* + * If @highest_vcn is zero, calculate the real highest_vcn + * (which can really be zero). + */ + if (!highest_vcn) + highest_vcn = (sle64_to_cpu( + a->data.non_resident.allocated_size) >> + vol->cluster_size_bits) - 1; + /* + * Determine the size of the mapping pairs array for the new + * extent, i.e. the old extent with the hole filled. + */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn, + highest_vcn); + if (unlikely(mp_size <= 0)) { + if (!(err = mp_size)) + err = -EIO; + ntfs_debug("Failed to get size for mapping pairs " + "array, error code %i.", err); + break; + } + /* + * Resize the attribute record to fit the new mapping pairs + * array. + */ + attr_rec_len = le32_to_cpu(a->length); + err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)); + if (unlikely(err)) { + BUG_ON(err != -ENOSPC); + // TODO: Deal with this by using the current attribute + // and fill it with as much of the mapping pairs + // array as possible. Then loop over each attribute + // extent rewriting the mapping pairs arrays as we go + // along and if when we reach the end we have not + // enough space, try to resize the last attribute + // extent and if even that fails, add a new attribute + // extent. + // We could also try to resize at each step in the hope + // that we will not need to rewrite every single extent. + // Note, we may need to decompress some extents to fill + // the runlist as we are walking the extents... + ntfs_error(vol->sb, "Not enough space in the mft " + "record for the extended attribute " + "record. This case is not " + "implemented yet."); + err = -EOPNOTSUPP; + break ; + } + status.mp_rebuilt = 1; + /* + * Generate the mapping pairs array directly into the attribute + * record. + */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, vcn, highest_vcn, NULL); + if (unlikely(err)) { + ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, " + "attribute type 0x%x, because building " + "the mapping pairs failed with error " + "code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + err = -EIO; + break; + } + /* Update the highest_vcn but only if it was not set. */ + if (unlikely(!a->data.non_resident.highest_vcn)) + a->data.non_resident.highest_vcn = + cpu_to_sle64(highest_vcn); + /* + * If the attribute is sparse/compressed, update the compressed + * size in the ntfs_inode structure and the attribute record. + */ + if (likely(NInoSparse(ni) || NInoCompressed(ni))) { + /* + * If we are not in the first attribute extent, switch + * to it, but first ensure the changes will make it to + * disk later. + */ + if (a->data.non_resident.lowest_vcn) { + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(ni->type, ni->name, + ni->name_len, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (unlikely(err)) { + status.attr_switched = 1; + break; + } + /* @m is not used any more so do not set it. */ + a = ctx->attr; + } + write_lock_irqsave(&ni->size_lock, flags); + ni->itype.compressed.size += vol->cluster_size; + a->data.non_resident.compressed_size = + cpu_to_sle64(ni->itype.compressed.size); + write_unlock_irqrestore(&ni->size_lock, flags); + } + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + /* Successfully filled the hole. */ + status.runlist_merged = 0; + status.mft_attr_mapped = 0; + status.mp_rebuilt = 0; + /* Setup the map cache and use that to deal with the buffer. */ + was_hole = true; + vcn = bh_cpos; + vcn_len = 1; + lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits); + cdelta = 0; + /* + * If the number of remaining clusters in the @pages is smaller + * or equal to the number of cached clusters, unlock the + * runlist as the map cache will be used from now on. + */ + if (likely(vcn + vcn_len >= cend)) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + rl = NULL; + } + goto map_buffer_cached; + } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); + /* If there are no errors, do the next page. */ + if (likely(!err && ++u < nr_pages)) + goto do_next_page; + /* If there are no errors, release the runlist lock if we took it. */ + if (likely(!err)) { + if (unlikely(rl_write_locked)) { + up_write(&ni->runlist.lock); + rl_write_locked = false; + } else if (unlikely(rl)) + up_read(&ni->runlist.lock); + rl = NULL; + } + /* If we issued read requests, let them complete. */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + while (wait_bh > wait) { + bh = *--wait_bh; + wait_on_buffer(bh); + if (likely(buffer_uptodate(bh))) { + page = bh->b_page; + bh_pos = ((s64)page->index << PAGE_SHIFT) + + bh_offset(bh); + /* + * If the buffer overflows the initialized size, need + * to zero the overflowing region. + */ + if (unlikely(bh_pos + blocksize > initialized_size)) { + int ofs = 0; + + if (likely(bh_pos < initialized_size)) + ofs = initialized_size - bh_pos; + zero_user_segment(page, bh_offset(bh) + ofs, + blocksize); + } + } else /* if (unlikely(!buffer_uptodate(bh))) */ + err = -EIO; + } + if (likely(!err)) { + /* Clear buffer_new on all buffers. */ + u = 0; + do { + bh = head = page_buffers(pages[u]); + do { + if (buffer_new(bh)) + clear_buffer_new(bh); + } while ((bh = bh->b_this_page) != head); + } while (++u < nr_pages); + ntfs_debug("Done."); + return err; + } + if (status.attr_switched) { + /* Get back to the attribute extent we modified. */ + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find required " + "attribute extent of attribute in " + "error code path. Run chkdsk to " + "recover."); + write_lock_irqsave(&ni->size_lock, flags); + ni->itype.compressed.size += vol->cluster_size; + write_unlock_irqrestore(&ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* + * The only thing that is now wrong is the compressed + * size of the base attribute extent which chkdsk + * should be able to fix. + */ + NVolSetErrors(vol); + } else { + m = ctx->mrec; + a = ctx->attr; + status.attr_switched = 0; + } + } + /* + * If the runlist has been modified, need to restore it by punching a + * hole into it and we then need to deallocate the on-disk cluster as + * well. Note, we only modify the runlist if we are able to generate a + * new mapping pairs array, i.e. only when the mapped attribute extent + * is not switched. + */ + if (status.runlist_merged && !status.attr_switched) { + BUG_ON(!rl_write_locked); + /* Make the file cluster we allocated sparse in the runlist. */ + if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) { + ntfs_error(vol->sb, "Failed to punch hole into " + "attribute runlist in error code " + "path. Run chkdsk to recover the " + "lost cluster."); + NVolSetErrors(vol); + } else /* if (success) */ { + status.runlist_merged = 0; + /* + * Deallocate the on-disk cluster we allocated but only + * if we succeeded in punching its vcn out of the + * runlist. + */ + down_write(&vol->lcnbmp_lock); + if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { + ntfs_error(vol->sb, "Failed to release " + "allocated cluster in error " + "code path. Run chkdsk to " + "recover the lost cluster."); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + } + } + /* + * Resize the attribute record to its old size and rebuild the mapping + * pairs array. Note, we only can do this if the runlist has been + * restored to its old state which also implies that the mapped + * attribute extent is not switched. + */ + if (status.mp_rebuilt && !status.runlist_merged) { + if (ntfs_attr_record_resize(m, a, attr_rec_len)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record in error code path. Run " + "chkdsk to recover."); + NVolSetErrors(vol); + } else /* if (success) */ { + if (ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), attr_rec_len - + le16_to_cpu(a->data.non_resident. + mapping_pairs_offset), ni->runlist.rl, + vcn, highest_vcn, NULL)) { + ntfs_error(vol->sb, "Failed to restore " + "mapping pairs array in error " + "code path. Run chkdsk to " + "recover."); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + } + /* Release the mft record and the attribute. */ + if (status.mft_attr_mapped) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + } + /* Release the runlist lock. */ + if (rl_write_locked) + up_write(&ni->runlist.lock); + else if (rl) + up_read(&ni->runlist.lock); + /* + * Zero out any newly allocated blocks to avoid exposing stale data. + * If BH_New is set, we know that the block was newly allocated above + * and that it has not been fully zeroed and marked dirty yet. + */ + nr_pages = u; + u = 0; + end = bh_cpos << vol->cluster_size_bits; + do { + page = pages[u]; + bh = head = page_buffers(page); + do { + if (u == nr_pages && + ((s64)page->index << PAGE_SHIFT) + + bh_offset(bh) >= end) + break; + if (!buffer_new(bh)) + continue; + clear_buffer_new(bh); + if (!buffer_uptodate(bh)) { + if (PageUptodate(page)) + set_buffer_uptodate(bh); + else { + zero_user(page, bh_offset(bh), + blocksize); + set_buffer_uptodate(bh); + } + } + mark_buffer_dirty(bh); + } while ((bh = bh->b_this_page) != head); + } while (++u <= nr_pages); + ntfs_error(vol->sb, "Failed. Returning error code %i.", err); + return err; +} + +static inline void ntfs_flush_dcache_pages(struct page **pages, + unsigned nr_pages) +{ + BUG_ON(!nr_pages); + /* + * Warning: Do not do the decrement at the same time as the call to + * flush_dcache_page() because it is a NULL macro on i386 and hence the + * decrement never happens so the loop never terminates. + */ + do { + --nr_pages; + flush_dcache_page(pages[nr_pages]); + } while (nr_pages > 0); +} + +/** + * ntfs_commit_pages_after_non_resident_write - commit the received data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * See description of ntfs_commit_pages_after_write(), below. + */ +static inline int ntfs_commit_pages_after_non_resident_write( + struct page **pages, const unsigned nr_pages, + s64 pos, size_t bytes) +{ + s64 end, initialized_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + struct buffer_head *bh, *head; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + unsigned long flags; + unsigned blocksize, u; + int err; + + vi = pages[0]->mapping->host; + ni = NTFS_I(vi); + blocksize = vi->i_sb->s_blocksize; + end = pos + bytes; + u = 0; + do { + s64 bh_pos; + struct page *page; + bool partial; + + page = pages[u]; + bh_pos = (s64)page->index << PAGE_SHIFT; + bh = head = page_buffers(page); + partial = false; + do { + s64 bh_end; + + bh_end = bh_pos + blocksize; + if (bh_end <= pos || bh_pos >= end) { + if (!buffer_uptodate(bh)) + partial = true; + } else { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } + } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); + /* + * If all buffers are now uptodate but the page is not, set the + * page uptodate. + */ + if (!partial && !PageUptodate(page)) + SetPageUptodate(page); + } while (++u < nr_pages); + /* + * Finally, if we do not need to update initialized_size or i_size we + * are finished. + */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (end <= initialized_size) { + ntfs_debug("Done."); + return 0; + } + /* + * Update initialized_size/i_size as appropriate, both in the inode and + * the mft record. + */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + BUG_ON(!NInoNonResident(ni)); + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + a = ctx->attr; + BUG_ON(!a->non_resident); + write_lock_irqsave(&ni->size_lock, flags); + BUG_ON(end > ni->allocated_size); + ni->initialized_size = end; + a->data.non_resident.initialized_size = cpu_to_sle64(end); + if (end > i_size_read(vi)) { + i_size_write(vi, end); + a->data.non_resident.data_size = + a->data.non_resident.initialized_size; + } + write_unlock_irqrestore(&ni->size_lock, flags); + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + ntfs_debug("Done."); + return 0; +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error " + "code %i).", err); + if (err != -ENOMEM) + NVolSetErrors(ni->vol); + return err; +} + +/** + * ntfs_commit_pages_after_write - commit the received data + * @pages: array of destination pages + * @nr_pages: number of pages in @pages + * @pos: byte position in file at which the write begins + * @bytes: number of bytes to be written + * + * This is called from ntfs_file_buffered_write() with i_mutex held on the inode + * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are + * locked but not kmap()ped. The source data has already been copied into the + * @page. ntfs_prepare_pages_for_non_resident_write() has been called before + * the data was copied (for non-resident attributes only) and it returned + * success. + * + * Need to set uptodate and mark dirty all buffers within the boundary of the + * write. If all buffers in a page are uptodate we set the page uptodate, too. + * + * Setting the buffers dirty ensures that they get written out later when + * ntfs_writepage() is invoked by the VM. + * + * Finally, we need to update i_size and initialized_size as appropriate both + * in the inode and the mft record. + * + * This is modelled after fs/buffer.c::generic_commit_write(), which marks + * buffers uptodate and dirty, sets the page uptodate if all buffers in the + * page are uptodate, and updates i_size if the end of io is beyond i_size. In + * that case, it also marks the inode dirty. + * + * If things have gone as outlined in + * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page + * content modifications here for non-resident attributes. For resident + * attributes we need to do the uptodate bringing here which we combine with + * the copying into the mft record which means we save one atomic kmap. + * + * Return 0 on success or -errno on error. + */ +static int ntfs_commit_pages_after_write(struct page **pages, + const unsigned nr_pages, s64 pos, size_t bytes) +{ + s64 end, initialized_size; + loff_t i_size; + struct inode *vi; + ntfs_inode *ni, *base_ni; + struct page *page; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + char *kattr, *kaddr; + unsigned long flags; + u32 attr_len; + int err; + + BUG_ON(!nr_pages); + BUG_ON(!pages); + page = pages[0]; + BUG_ON(!page); + vi = page->mapping->host; + ni = NTFS_I(vi); + ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page " + "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", + vi->i_ino, ni->type, page->index, nr_pages, + (long long)pos, bytes); + if (NInoNonResident(ni)) + return ntfs_commit_pages_after_non_resident_write(pages, + nr_pages, pos, bytes); + BUG_ON(nr_pages > 1); + /* + * Attribute is resident, implying it is not compressed, encrypted, or + * sparse. + */ + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + BUG_ON(NInoNonResident(ni)); + /* Map, pin, and lock the mft record. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + a = ctx->attr; + BUG_ON(a->non_resident); + /* The total length of the attribute value. */ + attr_len = le32_to_cpu(a->data.resident.value_length); + i_size = i_size_read(vi); + BUG_ON(attr_len != i_size); + BUG_ON(pos > attr_len); + end = pos + bytes; + BUG_ON(end > le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset)); + kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset); + kaddr = kmap_atomic(page); + /* Copy the received data from the page to the mft record. */ + memcpy(kattr + pos, kaddr + pos, bytes); + /* Update the attribute length if necessary. */ + if (end > attr_len) { + attr_len = end; + a->data.resident.value_length = cpu_to_le32(attr_len); + } + /* + * If the page is not uptodate, bring the out of bounds area(s) + * uptodate by copying data from the mft record to the page. + */ + if (!PageUptodate(page)) { + if (pos > 0) + memcpy(kaddr, kattr, pos); + if (end < attr_len) + memcpy(kaddr + end, kattr + end, attr_len - end); + /* Zero the region outside the end of the attribute value. */ + memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len); + flush_dcache_page(page); + SetPageUptodate(page); + } + kunmap_atomic(kaddr); + /* Update initialized_size/i_size if necessary. */ + read_lock_irqsave(&ni->size_lock, flags); + initialized_size = ni->initialized_size; + BUG_ON(end > ni->allocated_size); + read_unlock_irqrestore(&ni->size_lock, flags); + BUG_ON(initialized_size != i_size); + if (end > initialized_size) { + write_lock_irqsave(&ni->size_lock, flags); + ni->initialized_size = end; + i_size_write(vi, end); + write_unlock_irqrestore(&ni->size_lock, flags); + } + /* Mark the mft record dirty, so it gets written back. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + ntfs_debug("Done."); + return 0; +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Error allocating memory required to " + "commit the write."); + if (PageUptodate(page)) { + ntfs_warning(vi->i_sb, "Page is uptodate, setting " + "dirty so the write will be retried " + "later on by the VM."); + /* + * Put the page on mapping->dirty_pages, but leave its + * buffers' dirty state as-is. + */ + __set_page_dirty_nobuffers(page); + err = 0; + } else + ntfs_error(vi->i_sb, "Page is not uptodate. Written " + "data has been lost."); + } else { + ntfs_error(vi->i_sb, "Resident attribute commit write failed " + "with error %i.", err); + NVolSetErrors(ni->vol); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + return err; +} + +/* + * Copy as much as we can into the pages and return the number of bytes which + * were successfully copied. If a fault is encountered then clear the pages + * out to (ofs + bytes) and return the number of bytes which were copied. + */ +static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, + unsigned ofs, struct iov_iter *i, size_t bytes) +{ + struct page **last_page = pages + nr_pages; + size_t total = 0; + unsigned len, copied; + + do { + len = PAGE_SIZE - ofs; + if (len > bytes) + len = bytes; + copied = copy_page_from_iter_atomic(*pages, ofs, len, i); + total += copied; + bytes -= copied; + if (!bytes) + break; + if (copied < len) + goto err; + ofs = 0; + } while (++pages < last_page); +out: + return total; +err: + /* Zero the rest of the target like __copy_from_user(). */ + len = PAGE_SIZE - copied; + do { + if (len > bytes) + len = bytes; + zero_user(*pages, copied, len); + bytes -= len; + copied = 0; + len = PAGE_SIZE; + } while (++pages < last_page); + goto out; +} + +/** + * ntfs_perform_write - perform buffered write to a file + * @file: file to write to + * @i: iov_iter with data to write + * @pos: byte offset in file at which to begin writing to + */ +static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, + loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + struct inode *vi = mapping->host; + ntfs_inode *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; + struct page *cached_page = NULL; + VCN last_vcn; + LCN lcn; + size_t bytes; + ssize_t status, written = 0; + unsigned nr_pages; + + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " + "0x%llx, count 0x%lx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)pos, + (unsigned long)iov_iter_count(i)); + /* + * If a previous ntfs_truncate() failed, repeat it and abort if it + * fails again. + */ + if (unlikely(NInoTruncateFailed(ni))) { + int err; + + inode_dio_wait(vi); + err = ntfs_truncate(vi); + if (err || NInoTruncateFailed(ni)) { + if (!err) + err = -EIO; + ntfs_error(vol->sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "ntfs_truncate() failed (error code " + "%i).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + return err; + } + } + /* + * Determine the number of pages per cluster for non-resident + * attributes. + */ + nr_pages = 1; + if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni)) + nr_pages = vol->cluster_size >> PAGE_SHIFT; + last_vcn = -1; + do { + VCN vcn; + pgoff_t start_idx; + unsigned ofs, do_pages, u; + size_t copied; + + start_idx = pos >> PAGE_SHIFT; + ofs = pos & ~PAGE_MASK; + bytes = PAGE_SIZE - ofs; + do_pages = 1; + if (nr_pages > 1) { + vcn = pos >> vol->cluster_size_bits; + if (vcn != last_vcn) { + last_vcn = vcn; + /* + * Get the lcn of the vcn the write is in. If + * it is a hole, need to lock down all pages in + * the cluster. + */ + down_read(&ni->runlist.lock); + lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >> + vol->cluster_size_bits, false); + up_read(&ni->runlist.lock); + if (unlikely(lcn < LCN_HOLE)) { + if (lcn == LCN_ENOMEM) + status = -ENOMEM; + else { + status = -EIO; + ntfs_error(vol->sb, "Cannot " + "perform write to " + "inode 0x%lx, " + "attribute type 0x%x, " + "because the attribute " + "is corrupt.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + } + break; + } + if (lcn == LCN_HOLE) { + start_idx = (pos & ~(s64) + vol->cluster_size_mask) + >> PAGE_SHIFT; + bytes = vol->cluster_size - (pos & + vol->cluster_size_mask); + do_pages = nr_pages; + } + } + } + if (bytes > iov_iter_count(i)) + bytes = iov_iter_count(i); +again: + /* + * Bring in the user page(s) that we will copy from _first_. + * Otherwise there is a nasty deadlock on copying from the same + * page(s) as we are writing to, without it/them being marked + * up-to-date. Note, at present there is nothing to stop the + * pages being swapped out between us bringing them into memory + * and doing the actual copying. + */ + if (unlikely(fault_in_iov_iter_readable(i, bytes))) { + status = -EFAULT; + break; + } + /* Get and lock @do_pages starting at index @start_idx. */ + status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, + pages, &cached_page); + if (unlikely(status)) + break; + /* + * For non-resident attributes, we need to fill any holes with + * actual clusters and ensure all bufferes are mapped. We also + * need to bring uptodate any buffers that are only partially + * being written to. + */ + if (NInoNonResident(ni)) { + status = ntfs_prepare_pages_for_non_resident_write( + pages, do_pages, pos, bytes); + if (unlikely(status)) { + do { + unlock_page(pages[--do_pages]); + put_page(pages[do_pages]); + } while (do_pages); + break; + } + } + u = (pos >> PAGE_SHIFT) - pages[0]->index; + copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs, + i, bytes); + ntfs_flush_dcache_pages(pages + u, do_pages - u); + status = 0; + if (likely(copied == bytes)) { + status = ntfs_commit_pages_after_write(pages, do_pages, + pos, bytes); + } + do { + unlock_page(pages[--do_pages]); + put_page(pages[do_pages]); + } while (do_pages); + if (unlikely(status < 0)) { + iov_iter_revert(i, copied); + break; + } + cond_resched(); + if (unlikely(copied < bytes)) { + iov_iter_revert(i, copied); + if (copied) + bytes = copied; + else if (bytes > PAGE_SIZE - ofs) + bytes = PAGE_SIZE - ofs; + goto again; + } + pos += copied; + written += copied; + balance_dirty_pages_ratelimited(mapping); + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } + } while (iov_iter_count(i)); + if (cached_page) + put_page(cached_page); + ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", + written ? "written" : "status", (unsigned long)written, + (long)status); + return written ? written : status; +} + +/** + * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock() + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Basically the same as generic_file_write_iter() except that it ends up + * up calling ntfs_perform_write() instead of generic_perform_write() and that + * O_DIRECT is not implemented. + */ +static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *vi = file_inode(file); + ssize_t written = 0; + ssize_t err; + + inode_lock(vi); + /* We can write back this queue in page reclaim. */ + current->backing_dev_info = inode_to_bdi(vi); + err = ntfs_prepare_file_for_write(iocb, from); + if (iov_iter_count(from) && !err) + written = ntfs_perform_write(file, from, iocb->ki_pos); + current->backing_dev_info = NULL; + inode_unlock(vi); + iocb->ki_pos += written; + if (likely(written > 0)) + written = generic_write_sync(iocb, written); + return written ? written : err; +} + +/** + * ntfs_file_fsync - sync a file to disk + * @filp: file to be synced + * @datasync: if non-zero only flush user data and not metadata + * + * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync + * system calls. This function is inspired by fs/buffer.c::file_fsync(). + * + * If @datasync is false, write the mft record and all associated extent mft + * records as well as the $DATA attribute and then sync the block device. + * + * If @datasync is true and the attribute is non-resident, we skip the writing + * of the mft record and all associated extent mft records (this might still + * happen due to the write_inode_now() call). + * + * Also, if @datasync is true, we do not wait on the inode to be written out + * but we always wait on the page cache pages to be written out. + * + * Locking: Caller must hold i_mutex on the inode. + * + * TODO: We should probably also write all attribute/index inodes associated + * with this inode but since we have no simple way of getting to them we ignore + * this problem for now. + */ +static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct inode *vi = filp->f_mapping->host; + int err, ret = 0; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + + err = file_write_and_wait_range(filp, start, end); + if (err) + return err; + inode_lock(vi); + + BUG_ON(S_ISDIR(vi->i_mode)); + if (!datasync || !NInoNonResident(NTFS_I(vi))) + ret = __ntfs_write_inode(vi, 1); + write_inode_now(vi, !datasync); + /* + * NOTE: If we were to use mapping->private_list (see ext2 and + * fs/buffer.c) for dirty blocks then we could optimize the below to be + * sync_mapping_buffers(vi->i_mapping). + */ + err = sync_blockdev(vi->i_sb->s_bdev); + if (unlikely(err && !ret)) + ret = err; + if (likely(!ret)) + ntfs_debug("Done."); + else + ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " + "%u.", datasync ? "data" : "", vi->i_ino, -ret); + inode_unlock(vi); + return ret; +} + +#endif /* NTFS_RW */ + +const struct file_operations ntfs_file_ops = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, +#ifdef NTFS_RW + .write_iter = ntfs_file_write_iter, + .fsync = ntfs_file_fsync, +#endif /* NTFS_RW */ + .mmap = generic_file_mmap, + .open = ntfs_file_open, + .splice_read = generic_file_splice_read, +}; + +const struct inode_operations ntfs_file_inode_ops = { +#ifdef NTFS_RW + .setattr = ntfs_setattr, +#endif /* NTFS_RW */ +}; + +const struct file_operations ntfs_empty_file_ops = {}; + +const struct inode_operations ntfs_empty_inode_ops = {}; diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c new file mode 100644 index 000000000..d46c2c03a --- /dev/null +++ b/fs/ntfs/index.c @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * index.c - NTFS kernel index handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + */ + +#include <linux/slab.h> + +#include "aops.h" +#include "collate.h" +#include "debug.h" +#include "index.h" +#include "ntfs.h" + +/** + * ntfs_index_ctx_get - allocate and initialize a new index context + * @idx_ni: ntfs index inode with which to initialize the context + * + * Allocate a new index context, initialize it with @idx_ni and return it. + * Return NULL if allocation failed. + * + * Locking: Caller must hold i_mutex on the index inode. + */ +ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) +{ + ntfs_index_context *ictx; + + ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); + if (ictx) + *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; + return ictx; +} + +/** + * ntfs_index_ctx_put - release an index context + * @ictx: index context to free + * + * Release the index context @ictx, releasing all associated resources. + * + * Locking: Caller must hold i_mutex on the index inode. + */ +void ntfs_index_ctx_put(ntfs_index_context *ictx) +{ + if (ictx->entry) { + if (ictx->is_in_root) { + if (ictx->actx) + ntfs_attr_put_search_ctx(ictx->actx); + if (ictx->base_ni) + unmap_mft_record(ictx->base_ni); + } else { + struct page *page = ictx->page; + if (page) { + BUG_ON(!PageLocked(page)); + unlock_page(page); + ntfs_unmap_page(page); + } + } + } + kmem_cache_free(ntfs_index_ctx_cache, ictx); + return; +} + +/** + * ntfs_index_lookup - find a key in an index and return its index entry + * @key: [IN] key for which to search in the index + * @key_len: [IN] length of @key in bytes + * @ictx: [IN/OUT] context describing the index and the returned entry + * + * Before calling ntfs_index_lookup(), @ictx must have been obtained from a + * call to ntfs_index_ctx_get(). + * + * Look for the @key in the index specified by the index lookup context @ictx. + * ntfs_index_lookup() walks the contents of the index looking for the @key. + * + * If the @key is found in the index, 0 is returned and @ictx is setup to + * describe the index entry containing the matching @key. @ictx->entry is the + * index entry and @ictx->data and @ictx->data_len are the index entry data and + * its length in bytes, respectively. + * + * If the @key is not found in the index, -ENOENT is returned and @ictx is + * setup to describe the index entry whose key collates immediately after the + * search @key, i.e. this is the position in the index at which an index entry + * with a key of @key would need to be inserted. + * + * If an error occurs return the negative error code and @ictx is left + * untouched. + * + * When finished with the entry and its data, call ntfs_index_ctx_put() to free + * the context and other associated resources. + * + * If the index entry was modified, call flush_dcache_index_entry_page() + * immediately after the modification and either ntfs_index_entry_mark_dirty() + * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to + * ensure that the changes are written to disk. + * + * Locking: - Caller must hold i_mutex on the index inode. + * - Each page cache page in the index allocation mapping must be + * locked whilst being accessed otherwise we may find a corrupt + * page due to it being under ->writepage at the moment which + * applies the mst protection fixups before writing out and then + * removes them again after the write is complete after which it + * unlocks the page. + */ +int ntfs_index_lookup(const void *key, const int key_len, + ntfs_index_context *ictx) +{ + VCN vcn, old_vcn; + ntfs_inode *idx_ni = ictx->idx_ni; + ntfs_volume *vol = idx_ni->vol; + struct super_block *sb = vol->sb; + ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino; + MFT_RECORD *m; + INDEX_ROOT *ir; + INDEX_ENTRY *ie; + INDEX_ALLOCATION *ia; + u8 *index_end, *kaddr; + ntfs_attr_search_ctx *actx; + struct address_space *ia_mapping; + struct page *page; + int rc, err = 0; + + ntfs_debug("Entering."); + BUG_ON(!NInoAttr(idx_ni)); + BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION); + BUG_ON(idx_ni->nr_extents != -1); + BUG_ON(!base_ni); + BUG_ON(!key); + BUG_ON(key_len <= 0); + if (!ntfs_is_collation_rule_supported( + idx_ni->itype.index.collation_rule)) { + ntfs_error(sb, "Index uses unsupported collation rule 0x%x. " + "Aborting lookup.", le32_to_cpu( + idx_ni->itype.index.collation_rule)); + return -EOPNOTSUPP; + } + /* Get hold of the mft record for the index inode. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + ntfs_error(sb, "map_mft_record() failed with error code %ld.", + -PTR_ERR(m)); + return PTR_ERR(m); + } + actx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!actx)) { + err = -ENOMEM; + goto err_out; + } + /* Find the index root attribute in the mft record. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, actx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(sb, "Index root attribute missing in inode " + "0x%lx.", idx_ni->mft_no); + err = -EIO; + } + goto err_out; + } + /* Get to the index root value (it has been verified in read_inode). */ + ir = (INDEX_ROOT*)((u8*)actx->attr + + le16_to_cpu(actx->attr->data.resident.value_offset)); + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ir->index + + le32_to_cpu(ir->index.entries_offset)); + /* + * Loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)actx->mrec || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->length) > index_end) + goto idx_err_out; + /* + * The last entry cannot contain a key. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Further bounds checks. */ + if ((u32)sizeof(INDEX_ENTRY_HEADER) + + le16_to_cpu(ie->key_length) > + le16_to_cpu(ie->data.vi.data_offset) || + (u32)le16_to_cpu(ie->data.vi.data_offset) + + le16_to_cpu(ie->data.vi.data_length) > + le16_to_cpu(ie->length)) + goto idx_err_out; + /* If the keys match perfectly, we setup @ictx and return 0. */ + if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, + &ie->key, key_len)) { +ir_done: + ictx->is_in_root = true; + ictx->ir = ir; + ictx->actx = actx; + ictx->base_ni = base_ni; + ictx->ia = NULL; + ictx->page = NULL; +done: + ictx->entry = ie; + ictx->data = (u8*)ie + + le16_to_cpu(ie->data.vi.data_offset); + ictx->data_len = le16_to_cpu(ie->data.vi.data_length); + ntfs_debug("Done."); + return err; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, + key_len, &ie->key, le16_to_cpu(ie->key_length)); + /* + * If @key collates before the key of the current entry, there + * is definitely no such key in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* + * A match should never happen as the memcmp() call should have + * cought it, but we still treat it correctly. + */ + if (!rc) + goto ir_done; + /* The keys are not equal, continue the search. */ + } + /* + * We have finished with this index without success. Check for the + * presence of a child node and if not present setup @ictx and return + * -ENOENT. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + ntfs_debug("Entry not found."); + err = -ENOENT; + goto ir_done; + } /* Child node present, descend into it. */ + /* Consistency check: Verify that an index allocation exists. */ + if (!NInoIndexAllocPresent(idx_ni)) { + ntfs_error(sb, "No index allocation attribute but index entry " + "requires one. Inode 0x%lx is corrupt or " + "driver bug.", idx_ni->mft_no); + goto err_out; + } + /* Get the starting vcn of the index_block holding the child node. */ + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + ia_mapping = VFS_I(idx_ni)->i_mapping; + /* + * We are done with the index root and the mft record. Release them, + * otherwise we deadlock with ntfs_map_page(). + */ + ntfs_attr_put_search_ctx(actx); + unmap_mft_record(base_ni); + m = NULL; + actx = NULL; +descend_into_child_node: + /* + * Convert vcn to index into the index allocation attribute in units + * of PAGE_SIZE and map the page cache page, reading it from + * disk if necessary. + */ + page = ntfs_map_page(ia_mapping, vcn << + idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(sb, "Failed to map index page, error %ld.", + -PTR_ERR(page)); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + kaddr = (u8*)page_address(page); +fast_descend_into_child_node: + /* Get to the index allocation block. */ + ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << + idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); + /* Bounds checks. */ + if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Out of bounds check failed. Corrupt inode " + "0x%lx or driver bug.", idx_ni->mft_no); + goto unm_err_out; + } + /* Catch multi sector transfer fixup errors. */ + if (unlikely(!ntfs_is_indx_record(ia->magic))) { + ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. " + "Corrupt inode 0x%lx. Run chkdsk.", + (long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + if (sle64_to_cpu(ia->index_block_vcn) != vcn) { + ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " + "different from expected VCN (0x%llx). Inode " + "0x%lx is corrupt or driver bug.", + (unsigned long long) + sle64_to_cpu(ia->index_block_vcn), + (unsigned long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + if (le32_to_cpu(ia->index.allocated_size) + 0x18 != + idx_ni->itype.index.block_size) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has " + "a size (%u) differing from the index " + "specified size (%u). Inode is corrupt or " + "driver bug.", (unsigned long long)vcn, + idx_ni->mft_no, + le32_to_cpu(ia->index.allocated_size) + 0x18, + idx_ni->itype.index.block_size); + goto unm_err_out; + } + index_end = (u8*)ia + idx_ni->itype.index.block_size; + if (index_end > kaddr + PAGE_SIZE) { + ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " + "crosses page boundary. Impossible! Cannot " + "access! This is probably a bug in the " + "driver.", (unsigned long long)vcn, + idx_ni->mft_no); + goto unm_err_out; + } + index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); + if (index_end > (u8*)ia + idx_ni->itype.index.block_size) { + ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode " + "0x%lx exceeds maximum size.", + (unsigned long long)vcn, idx_ni->mft_no); + goto unm_err_out; + } + /* The first index entry. */ + ie = (INDEX_ENTRY*)((u8*)&ia->index + + le32_to_cpu(ia->index.entries_offset)); + /* + * Iterate similar to above big loop but applied to index buffer, thus + * loop until we exceed valid memory (corruption case) or until we + * reach the last entry. + */ + for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { + /* Bounds checks. */ + if ((u8*)ie < (u8*)ia || (u8*)ie + + sizeof(INDEX_ENTRY_HEADER) > index_end || + (u8*)ie + le16_to_cpu(ie->length) > index_end) { + ntfs_error(sb, "Index entry out of bounds in inode " + "0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* + * The last entry cannot contain a key. It can however contain + * a pointer to a child node in the B+tree so we just break out. + */ + if (ie->flags & INDEX_ENTRY_END) + break; + /* Further bounds checks. */ + if ((u32)sizeof(INDEX_ENTRY_HEADER) + + le16_to_cpu(ie->key_length) > + le16_to_cpu(ie->data.vi.data_offset) || + (u32)le16_to_cpu(ie->data.vi.data_offset) + + le16_to_cpu(ie->data.vi.data_length) > + le16_to_cpu(ie->length)) { + ntfs_error(sb, "Index entry out of bounds in inode " + "0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* If the keys match perfectly, we setup @ictx and return 0. */ + if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, + &ie->key, key_len)) { +ia_done: + ictx->is_in_root = false; + ictx->actx = NULL; + ictx->base_ni = NULL; + ictx->ia = ia; + ictx->page = page; + goto done; + } + /* + * Not a perfect match, need to do full blown collation so we + * know which way in the B+tree we have to go. + */ + rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, + key_len, &ie->key, le16_to_cpu(ie->key_length)); + /* + * If @key collates before the key of the current entry, there + * is definitely no such key in this index but we might need to + * descend into the B+tree so we just break out of the loop. + */ + if (rc == -1) + break; + /* + * A match should never happen as the memcmp() call should have + * cought it, but we still treat it correctly. + */ + if (!rc) + goto ia_done; + /* The keys are not equal, continue the search. */ + } + /* + * We have finished with this index buffer without success. Check for + * the presence of a child node and if not present return -ENOENT. + */ + if (!(ie->flags & INDEX_ENTRY_NODE)) { + ntfs_debug("Entry not found."); + err = -ENOENT; + goto ia_done; + } + if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { + ntfs_error(sb, "Index entry with child node found in a leaf " + "node in inode 0x%lx.", idx_ni->mft_no); + goto unm_err_out; + } + /* Child node present, descend into it. */ + old_vcn = vcn; + vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); + if (vcn >= 0) { + /* + * If vcn is in the same page cache page as old_vcn we recycle + * the mapped page. + */ + if (old_vcn << vol->cluster_size_bits >> + PAGE_SHIFT == vcn << + vol->cluster_size_bits >> + PAGE_SHIFT) + goto fast_descend_into_child_node; + unlock_page(page); + ntfs_unmap_page(page); + goto descend_into_child_node; + } + ntfs_error(sb, "Negative child node vcn in inode 0x%lx.", + idx_ni->mft_no); +unm_err_out: + unlock_page(page); + ntfs_unmap_page(page); +err_out: + if (!err) + err = -EIO; + if (actx) + ntfs_attr_put_search_ctx(actx); + if (m) + unmap_mft_record(base_ni); + return err; +idx_err_out: + ntfs_error(sb, "Corrupt index. Aborting lookup."); + goto err_out; +} diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h new file mode 100644 index 000000000..bb3c3ae55 --- /dev/null +++ b/fs/ntfs/index.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_INDEX_H +#define _LINUX_NTFS_INDEX_H + +#include <linux/fs.h> + +#include "types.h" +#include "layout.h" +#include "inode.h" +#include "attrib.h" +#include "mft.h" +#include "aops.h" + +/** + * @idx_ni: index inode containing the @entry described by this context + * @entry: index entry (points into @ir or @ia) + * @data: index entry data (points into @entry) + * @data_len: length in bytes of @data + * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia + * @ir: index root if @is_in_root and NULL otherwise + * @actx: attribute search context if @is_in_root and NULL otherwise + * @base_ni: base inode if @is_in_root and NULL otherwise + * @ia: index block if @is_in_root is 'false' and NULL otherwise + * @page: page if @is_in_root is 'false' and NULL otherwise + * + * @idx_ni is the index inode this context belongs to. + * + * @entry is the index entry described by this context. @data and @data_len + * are the index entry data and its length in bytes, respectively. @data + * simply points into @entry. This is probably what the user is interested in. + * + * If @is_in_root is 'true', @entry is in the index root attribute @ir described + * by the attribute search context @actx and the base inode @base_ni. @ia and + * @page are NULL in this case. + * + * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia + * and @page point to the index allocation block and the mapped, locked page it + * is in, respectively. @ir, @actx and @base_ni are NULL in this case. + * + * To obtain a context call ntfs_index_ctx_get(). + * + * We use this context to allow ntfs_index_lookup() to return the found index + * @entry and its @data without having to allocate a buffer and copy the @entry + * and/or its @data into it. + * + * When finished with the @entry and its @data, call ntfs_index_ctx_put() to + * free the context and other associated resources. + * + * If the index entry was modified, call flush_dcache_index_entry_page() + * immediately after the modification and either ntfs_index_entry_mark_dirty() + * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to + * ensure that the changes are written to disk. + */ +typedef struct { + ntfs_inode *idx_ni; + INDEX_ENTRY *entry; + void *data; + u16 data_len; + bool is_in_root; + INDEX_ROOT *ir; + ntfs_attr_search_ctx *actx; + ntfs_inode *base_ni; + INDEX_ALLOCATION *ia; + struct page *page; +} ntfs_index_context; + +extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni); +extern void ntfs_index_ctx_put(ntfs_index_context *ictx); + +extern int ntfs_index_lookup(const void *key, const int key_len, + ntfs_index_context *ictx); + +#ifdef NTFS_RW + +/** + * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries + * @ictx: ntfs index context describing the index entry + * + * Call flush_dcache_page() for the page in which an index entry resides. + * + * This must be called every time an index entry is modified, just after the + * modification. + * + * If the index entry is in the index root attribute, simply flush the page + * containing the mft record containing the index root attribute. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, simply flush the page cache page containing the index block. + */ +static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + flush_dcache_mft_record_page(ictx->actx->ntfs_ino); + else + flush_dcache_page(ictx->page); +} + +/** + * ntfs_index_entry_mark_dirty - mark an index entry dirty + * @ictx: ntfs index context describing the index entry + * + * Mark the index entry described by the index entry context @ictx dirty. + * + * If the index entry is in the index root attribute, simply mark the mft + * record containing the index root attribute dirty. This ensures the mft + * record, and hence the index root attribute, will be written out to disk + * later. + * + * If the index entry is in an index block belonging to the index allocation + * attribute, mark the buffers belonging to the index record as well as the + * page cache page the index block is in dirty. This automatically marks the + * VFS inode of the ntfs index inode to which the index entry belongs dirty, + * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the + * dirty index block, will be written out to disk later. + */ +static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx) +{ + if (ictx->is_in_root) + mark_mft_record_dirty(ictx->actx->ntfs_ino); + else + mark_ntfs_record_dirty(ictx->page, + (u8*)ictx->ia - (u8*)page_address(ictx->page)); +} + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_INDEX_H */ diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c new file mode 100644 index 000000000..08c659332 --- /dev/null +++ b/fs/ntfs/inode.c @@ -0,0 +1,3100 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * inode.c - NTFS kernel inode handling. + * + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. + */ + +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/mount.h> +#include <linux/mutex.h> +#include <linux/pagemap.h> +#include <linux/quotaops.h> +#include <linux/slab.h> +#include <linux/log2.h> + +#include "aops.h" +#include "attrib.h" +#include "bitmap.h" +#include "dir.h" +#include "debug.h" +#include "inode.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "time.h" +#include "ntfs.h" + +/** + * ntfs_test_inode - compare two (possibly fake) inodes for equality + * @vi: vfs inode which to test + * @data: data which is being tested with + * + * Compare the ntfs attribute embedded in the ntfs specific part of the vfs + * inode @vi for equality with the ntfs attribute @data. + * + * If searching for the normal file/directory inode, set @na->type to AT_UNUSED. + * @na->name and @na->name_len are then ignored. + * + * Return 1 if the attributes match and 0 if not. + * + * NOTE: This function runs with the inode_hash_lock spin lock held so it is not + * allowed to sleep. + */ +int ntfs_test_inode(struct inode *vi, void *data) +{ + ntfs_attr *na = (ntfs_attr *)data; + ntfs_inode *ni; + + if (vi->i_ino != na->mft_no) + return 0; + ni = NTFS_I(vi); + /* If !NInoAttr(ni), @vi is a normal file or directory inode. */ + if (likely(!NInoAttr(ni))) { + /* If not looking for a normal inode this is a mismatch. */ + if (unlikely(na->type != AT_UNUSED)) + return 0; + } else { + /* A fake inode describing an attribute. */ + if (ni->type != na->type) + return 0; + if (ni->name_len != na->name_len) + return 0; + if (na->name_len && memcmp(ni->name, na->name, + na->name_len * sizeof(ntfschar))) + return 0; + } + /* Match! */ + return 1; +} + +/** + * ntfs_init_locked_inode - initialize an inode + * @vi: vfs inode to initialize + * @data: data which to initialize @vi to + * + * Initialize the vfs inode @vi with the values from the ntfs attribute @data in + * order to enable ntfs_test_inode() to do its work. + * + * If initializing the normal file/directory inode, set @na->type to AT_UNUSED. + * In that case, @na->name and @na->name_len should be set to NULL and 0, + * respectively. Although that is not strictly necessary as + * ntfs_read_locked_inode() will fill them in later. + * + * Return 0 on success and -errno on error. + * + * NOTE: This function runs with the inode->i_lock spin lock held so it is not + * allowed to sleep. (Hence the GFP_ATOMIC allocation.) + */ +static int ntfs_init_locked_inode(struct inode *vi, void *data) +{ + ntfs_attr *na = (ntfs_attr *)data; + ntfs_inode *ni = NTFS_I(vi); + + vi->i_ino = na->mft_no; + + ni->type = na->type; + if (na->type == AT_INDEX_ALLOCATION) + NInoSetMstProtected(ni); + + ni->name = na->name; + ni->name_len = na->name_len; + + /* If initializing a normal inode, we are done. */ + if (likely(na->type == AT_UNUSED)) { + BUG_ON(na->name); + BUG_ON(na->name_len); + return 0; + } + + /* It is a fake inode. */ + NInoSetAttr(ni); + + /* + * We have I30 global constant as an optimization as it is the name + * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC + * allocation but that is ok. And most attributes are unnamed anyway, + * thus the fraction of named attributes with name != I30 is actually + * absolutely tiny. + */ + if (na->name_len && na->name != I30) { + unsigned int i; + + BUG_ON(!na->name); + i = na->name_len * sizeof(ntfschar); + ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC); + if (!ni->name) + return -ENOMEM; + memcpy(ni->name, na->name, i); + ni->name[na->name_len] = 0; + } + return 0; +} + +static int ntfs_read_locked_inode(struct inode *vi); +static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi); +static int ntfs_read_locked_index_inode(struct inode *base_vi, + struct inode *vi); + +/** + * ntfs_iget - obtain a struct inode corresponding to a specific normal inode + * @sb: super block of mounted volume + * @mft_no: mft record number / inode number to obtain + * + * Obtain the struct inode corresponding to a specific normal inode (i.e. a + * file or directory). + * + * If the inode is in the cache, it is just returned with an increased + * reference count. Otherwise, a new struct inode is allocated and initialized, + * and finally ntfs_read_locked_inode() is called to read in the inode and + * fill in the remainder of the inode structure. + * + * Return the struct inode on success. Check the return value with IS_ERR() and + * if true, the function failed and the error code is obtained from PTR_ERR(). + */ +struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no) +{ + struct inode *vi; + int err; + ntfs_attr na; + + na.mft_no = mft_no; + na.type = AT_UNUSED; + na.name = NULL; + na.name_len = 0; + + vi = iget5_locked(sb, mft_no, ntfs_test_inode, + ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_inode(vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad inodes around if the failure was + * due to ENOMEM. We want to be able to retry again later. + */ + if (unlikely(err == -ENOMEM)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +/** + * ntfs_attr_iget - obtain a struct inode corresponding to an attribute + * @base_vi: vfs base inode containing the attribute + * @type: attribute type + * @name: Unicode name of the attribute (NULL if unnamed) + * @name_len: length of @name in Unicode characters (0 if unnamed) + * + * Obtain the (fake) struct inode corresponding to the attribute specified by + * @type, @name, and @name_len, which is present in the base mft record + * specified by the vfs inode @base_vi. + * + * If the attribute inode is in the cache, it is just returned with an + * increased reference count. Otherwise, a new struct inode is allocated and + * initialized, and finally ntfs_read_locked_attr_inode() is called to read the + * attribute and fill in the inode structure. + * + * Note, for index allocation attributes, you need to use ntfs_index_iget() + * instead of ntfs_attr_iget() as working with indices is a lot more complex. + * + * Return the struct inode of the attribute inode on success. Check the return + * value with IS_ERR() and if true, the function failed and the error code is + * obtained from PTR_ERR(). + */ +struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, + ntfschar *name, u32 name_len) +{ + struct inode *vi; + int err; + ntfs_attr na; + + /* Make sure no one calls ntfs_attr_iget() for indices. */ + BUG_ON(type == AT_INDEX_ALLOCATION); + + na.mft_no = base_vi->i_ino; + na.type = type; + na.name = name; + na.name_len = name_len; + + vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, + ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_attr_inode(base_vi, vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad attribute inodes around. This also + * simplifies things in that we never need to check for bad attribute + * inodes elsewhere. + */ + if (unlikely(err)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +/** + * ntfs_index_iget - obtain a struct inode corresponding to an index + * @base_vi: vfs base inode containing the index related attributes + * @name: Unicode name of the index + * @name_len: length of @name in Unicode characters + * + * Obtain the (fake) struct inode corresponding to the index specified by @name + * and @name_len, which is present in the base mft record specified by the vfs + * inode @base_vi. + * + * If the index inode is in the cache, it is just returned with an increased + * reference count. Otherwise, a new struct inode is allocated and + * initialized, and finally ntfs_read_locked_index_inode() is called to read + * the index related attributes and fill in the inode structure. + * + * Return the struct inode of the index inode on success. Check the return + * value with IS_ERR() and if true, the function failed and the error code is + * obtained from PTR_ERR(). + */ +struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, + u32 name_len) +{ + struct inode *vi; + int err; + ntfs_attr na; + + na.mft_no = base_vi->i_ino; + na.type = AT_INDEX_ALLOCATION; + na.name = name; + na.name_len = name_len; + + vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode, + ntfs_init_locked_inode, &na); + if (unlikely(!vi)) + return ERR_PTR(-ENOMEM); + + err = 0; + + /* If this is a freshly allocated inode, need to read it now. */ + if (vi->i_state & I_NEW) { + err = ntfs_read_locked_index_inode(base_vi, vi); + unlock_new_inode(vi); + } + /* + * There is no point in keeping bad index inodes around. This also + * simplifies things in that we never need to check for bad index + * inodes elsewhere. + */ + if (unlikely(err)) { + iput(vi); + vi = ERR_PTR(err); + } + return vi; +} + +struct inode *ntfs_alloc_big_inode(struct super_block *sb) +{ + ntfs_inode *ni; + + ntfs_debug("Entering."); + ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS); + if (likely(ni != NULL)) { + ni->state = 0; + return VFS_I(ni); + } + ntfs_error(sb, "Allocation of NTFS big inode structure failed."); + return NULL; +} + +void ntfs_free_big_inode(struct inode *inode) +{ + kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode)); +} + +static inline ntfs_inode *ntfs_alloc_extent_inode(void) +{ + ntfs_inode *ni; + + ntfs_debug("Entering."); + ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS); + if (likely(ni != NULL)) { + ni->state = 0; + return ni; + } + ntfs_error(NULL, "Allocation of NTFS inode structure failed."); + return NULL; +} + +static void ntfs_destroy_extent_inode(ntfs_inode *ni) +{ + ntfs_debug("Entering."); + BUG_ON(ni->page); + if (!atomic_dec_and_test(&ni->count)) + BUG(); + kmem_cache_free(ntfs_inode_cache, ni); +} + +/* + * The attribute runlist lock has separate locking rules from the + * normal runlist lock, so split the two lock-classes: + */ +static struct lock_class_key attr_list_rl_lock_class; + +/** + * __ntfs_init_inode - initialize ntfs specific part of an inode + * @sb: super block of mounted volume + * @ni: freshly allocated ntfs inode which to initialize + * + * Initialize an ntfs inode to defaults. + * + * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left + * untouched. Make sure to initialize them elsewhere. + * + * Return zero on success and -ENOMEM on error. + */ +void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni) +{ + ntfs_debug("Entering."); + rwlock_init(&ni->size_lock); + ni->initialized_size = ni->allocated_size = 0; + ni->seq_no = 0; + atomic_set(&ni->count, 1); + ni->vol = NTFS_SB(sb); + ntfs_init_runlist(&ni->runlist); + mutex_init(&ni->mrec_lock); + ni->page = NULL; + ni->page_ofs = 0; + ni->attr_list_size = 0; + ni->attr_list = NULL; + ntfs_init_runlist(&ni->attr_list_rl); + lockdep_set_class(&ni->attr_list_rl.lock, + &attr_list_rl_lock_class); + ni->itype.index.block_size = 0; + ni->itype.index.vcn_size = 0; + ni->itype.index.collation_rule = 0; + ni->itype.index.block_size_bits = 0; + ni->itype.index.vcn_size_bits = 0; + mutex_init(&ni->extent_lock); + ni->nr_extents = 0; + ni->ext.base_ntfs_ino = NULL; +} + +/* + * Extent inodes get MFT-mapped in a nested way, while the base inode + * is still mapped. Teach this nesting to the lock validator by creating + * a separate class for nested inode's mrec_lock's: + */ +static struct lock_class_key extent_inode_mrec_lock_key; + +inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, + unsigned long mft_no) +{ + ntfs_inode *ni = ntfs_alloc_extent_inode(); + + ntfs_debug("Entering."); + if (likely(ni != NULL)) { + __ntfs_init_inode(sb, ni); + lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key); + ni->mft_no = mft_no; + ni->type = AT_UNUSED; + ni->name = NULL; + ni->name_len = 0; + } + return ni; +} + +/** + * ntfs_is_extended_system_file - check if a file is in the $Extend directory + * @ctx: initialized attribute search context + * + * Search all file name attributes in the inode described by the attribute + * search context @ctx and check if any of the names are in the $Extend system + * directory. + * + * Return values: + * 1: file is in $Extend directory + * 0: file is not in $Extend directory + * -errno: failed to determine if the file is in the $Extend directory + */ +static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx) +{ + int nr_links, err; + + /* Restart search. */ + ntfs_attr_reinit_search_ctx(ctx); + + /* Get number of hard links. */ + nr_links = le16_to_cpu(ctx->mrec->link_count); + + /* Loop through all hard links. */ + while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0, + ctx))) { + FILE_NAME_ATTR *file_name_attr; + ATTR_RECORD *attr = ctx->attr; + u8 *p, *p2; + + nr_links--; + /* + * Maximum sanity checking as we are called on an inode that + * we suspect might be corrupt. + */ + p = (u8*)attr + le32_to_cpu(attr->length); + if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec + + le32_to_cpu(ctx->mrec->bytes_in_use)) { +err_corrupt_attr: + ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name " + "attribute. You should run chkdsk."); + return -EIO; + } + if (attr->non_resident) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file " + "name. You should run chkdsk."); + return -EIO; + } + if (attr->flags) { + ntfs_error(ctx->ntfs_ino->vol->sb, "File name with " + "invalid flags. You should run " + "chkdsk."); + return -EIO; + } + if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file " + "name. You should run chkdsk."); + return -EIO; + } + file_name_attr = (FILE_NAME_ATTR*)((u8*)attr + + le16_to_cpu(attr->data.resident.value_offset)); + p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length); + if (p2 < (u8*)attr || p2 > p) + goto err_corrupt_attr; + /* This attribute is ok, but is it in the $Extend directory? */ + if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend) + return 1; /* YES, it's an extended system file. */ + } + if (unlikely(err != -ENOENT)) + return err; + if (unlikely(nr_links)) { + ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count " + "doesn't match number of name attributes. You " + "should run chkdsk."); + return -EIO; + } + return 0; /* NO, it is not an extended system file. */ +} + +/** + * ntfs_read_locked_inode - read an inode from its device + * @vi: inode to read + * + * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode + * described by @vi into memory from the device. + * + * The only fields in @vi that we need to/can look at when the function is + * called are i_sb, pointing to the mounted device's super block, and i_ino, + * the number of the inode to load. + * + * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino + * for reading and sets up the necessary @vi fields as well as initializing + * the ntfs inode. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * i_flags is set to 0 and we have no business touching it. Only an ioctl() + * is allowed to write to them. We should of course be honouring them but + * we need to do that using the IS_* macros defined in include/linux/fs.h. + * In any case ntfs_read_locked_inode() has nothing to do with i_flags. + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + */ +static int ntfs_read_locked_inode(struct inode *vi) +{ + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni; + struct inode *bvi; + MFT_RECORD *m; + ATTR_RECORD *a; + STANDARD_INFORMATION *si; + ntfs_attr_search_ctx *ctx; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + + /* Setup the generic vfs inode parts now. */ + vi->i_uid = vol->uid; + vi->i_gid = vol->gid; + vi->i_mode = 0; + + /* + * Initialize the ntfs specific part of @vi special casing + * FILE_MFT which we need to do at mount time. + */ + if (vi->i_ino != FILE_MFT) + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + + if (!(m->flags & MFT_RECORD_IN_USE)) { + ntfs_error(vi->i_sb, "Inode is not in use!"); + goto unm_err_out; + } + if (m->base_mft_record) { + ntfs_error(vi->i_sb, "Inode is an extent inode!"); + goto unm_err_out; + } + + /* Transfer information from mft record into vfs and ntfs inodes. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + + /* + * FIXME: Keep in mind that link_count is two for files which have both + * a long file name and a short file name as separate entries, so if + * we are hiding short file names this will be too high. Either we need + * to account for the short file names by subtracting them or we need + * to make sure we delete files even though i_nlink is not zero which + * might be tricky due to vfs interactions. Need to think about this + * some more when implementing the unlink command. + */ + set_nlink(vi, le16_to_cpu(m->link_count)); + /* + * FIXME: Reparse points can have the directory bit set even though + * they would be S_IFLNK. Need to deal with this further below when we + * implement reparse points / symbolic links but it will do for now. + * Also if not a directory, it could be something else, rather than + * a regular file. But again, will do for now. + */ + /* Everyone gets all permissions. */ + vi->i_mode |= S_IRWXUGO; + /* If read-only, no one gets write permissions. */ + if (IS_RDONLY(vi)) + vi->i_mode &= ~S_IWUGO; + if (m->flags & MFT_RECORD_IS_DIRECTORY) { + vi->i_mode |= S_IFDIR; + /* + * Apply the directory permissions mask set in the mount + * options. + */ + vi->i_mode &= ~vol->dmask; + /* Things break without this kludge! */ + if (vi->i_nlink > 1) + set_nlink(vi, 1); + } else { + vi->i_mode |= S_IFREG; + /* Apply the file permissions mask set in the mount options. */ + vi->i_mode &= ~vol->fmask; + } + /* + * Find the standard information attribute in the mft record. At this + * stage we haven't setup the attribute list stuff yet, so this could + * in fact fail if the standard information is in an extent record, but + * I don't think this actually ever happens. + */ + err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + /* + * TODO: We should be performing a hot fix here (if the + * recover mount option is set) by creating a new + * attribute. + */ + ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute " + "is missing."); + } + goto unm_err_out; + } + a = ctx->attr; + /* Get the standard information attribute value. */ + if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu(a->data.resident.value_length) > + (u8 *)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode."); + goto unm_err_out; + } + si = (STANDARD_INFORMATION*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + + /* Transfer information from the standard information into vi. */ + /* + * Note: The i_?times do not quite map perfectly onto the NTFS times, + * but they are close enough, and in the end it doesn't really matter + * that much... + */ + /* + * mtime is the last change of the data within the file. Not changed + * when only metadata is changed, e.g. a rename doesn't affect mtime. + */ + vi->i_mtime = ntfs2utc(si->last_data_change_time); + /* + * ctime is the last change of the metadata of the file. This obviously + * always changes, when mtime is changed. ctime can be changed on its + * own, mtime is then not changed, e.g. when a file is renamed. + */ + vi->i_ctime = ntfs2utc(si->last_mft_change_time); + /* + * Last access to the data within the file. Not changed during a rename + * for example but changed whenever the file is written to. + */ + vi->i_atime = ntfs2utc(si->last_access_time); + + /* Find the attribute list attribute if present. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); + if (err) { + if (unlikely(err != -ENOENT)) { + ntfs_error(vi->i_sb, "Failed to lookup attribute list " + "attribute."); + goto unm_err_out; + } + } else /* if (!err) */ { + if (vi->i_ino == FILE_MFT) + goto skip_attr_list_load; + ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); + NInoSetAttrList(ni); + a = ctx->attr; + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Attribute list attribute is " + "compressed."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(vi->i_sb, "Non-resident attribute " + "list attribute is encrypted/" + "sparse."); + goto unm_err_out; + } + ntfs_warning(vi->i_sb, "Resident attribute list " + "attribute in inode 0x%lx is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set.", + vi->i_ino); + } + /* Now allocate memory for the attribute list. */ + ni->attr_list_size = (u32)ntfs_attr_size(a); + ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); + if (!ni->attr_list) { + ntfs_error(vi->i_sb, "Not enough memory to allocate " + "buffer for attribute list."); + err = -ENOMEM; + goto unm_err_out; + } + if (a->non_resident) { + NInoSetAttrListNonResident(ni); + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "Attribute list has non " + "zero lowest_vcn."); + goto unm_err_out; + } + /* + * Setup the runlist. No need for locking as we have + * exclusive access to the inode at this time. + */ + ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, + a, NULL); + if (IS_ERR(ni->attr_list_rl.rl)) { + err = PTR_ERR(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + ntfs_error(vi->i_sb, "Mapping pairs " + "decompression failed."); + goto unm_err_out; + } + /* Now load the attribute list. */ + if ((err = load_attribute_list(vol, &ni->attr_list_rl, + ni->attr_list, ni->attr_list_size, + sle64_to_cpu(a->data.non_resident. + initialized_size)))) { + ntfs_error(vi->i_sb, "Failed to load " + "attribute list attribute."); + goto unm_err_out; + } + } else /* if (!a->non_resident) */ { + if ((u8*)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > + (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "Corrupt attribute list " + "in inode."); + goto unm_err_out; + } + /* Now copy the attribute list. */ + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + le32_to_cpu( + a->data.resident.value_length)); + } + } +skip_attr_list_load: + /* + * If an attribute list is present we now have the attribute list value + * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes. + */ + if (S_ISDIR(vi->i_mode)) { + loff_t bvi_size; + ntfs_inode *bni; + INDEX_ROOT *ir; + u8 *ir_end, *index_end; + + /* It is a directory, find index root attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, + 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + // FIXME: File is corrupt! Hot-fix with empty + // index root attribute if recovery option is + // set. + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute " + "is missing."); + } + goto unm_err_out; + } + a = ctx->attr; + /* Set up the state. */ + if (unlikely(a->non_resident)) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute is not " + "resident."); + goto unm_err_out; + } + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute name is " + "placed after the attribute value."); + goto unm_err_out; + } + /* + * Compressed/encrypted index root just means that the newly + * created files in that directory should be created compressed/ + * encrypted. However index root cannot be both compressed and + * encrypted. + */ + if (a->flags & ATTR_COMPRESSION_MASK) + NInoSetCompressed(ni); + if (a->flags & ATTR_IS_ENCRYPTED) { + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed attribute."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + ir = (INDEX_ROOT*)((u8*)a + + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " + "corrupt."); + goto unm_err_out; + } + index_end = (u8*)&ir->index + + le32_to_cpu(ir->index.index_length); + if (index_end > ir_end) { + ntfs_error(vi->i_sb, "Directory index is corrupt."); + goto unm_err_out; + } + if (ir->type != AT_FILE_NAME) { + ntfs_error(vi->i_sb, "Indexed attribute is not " + "$FILE_NAME."); + goto unm_err_out; + } + if (ir->collation_rule != COLLATION_FILE_NAME) { + ntfs_error(vi->i_sb, "Index collation rule is not " + "COLLATION_FILE_NAME."); + goto unm_err_out; + } + ni->itype.index.collation_rule = ir->collation_rule; + ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); + if (ni->itype.index.block_size & + (ni->itype.index.block_size - 1)) { + ntfs_error(vi->i_sb, "Index block size (%u) is not a " + "power of two.", + ni->itype.index.block_size); + goto unm_err_out; + } + if (ni->itype.index.block_size > PAGE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > " + "PAGE_SIZE (%ld) is not " + "supported. Sorry.", + ni->itype.index.block_size, + PAGE_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) < " + "NTFS_BLOCK_SIZE (%i) is not " + "supported. Sorry.", + ni->itype.index.block_size, + NTFS_BLOCK_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + ni->itype.index.block_size_bits = + ffs(ni->itype.index.block_size) - 1; + /* Determine the size of a vcn in the directory index. */ + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = vol->sector_size_bits; + } + + /* Setup the index allocation attribute, even if not present. */ + NInoSetMstProtected(ni); + ni->type = AT_INDEX_ALLOCATION; + ni->name = I30; + ni->name_len = 4; + + if (!(ir->index.flags & LARGE_INDEX)) { + /* No index allocation. */ + vi->i_size = ni->initialized_size = + ni->allocated_size = 0; + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + goto skip_large_dir_stuff; + } /* LARGE_INDEX: Index allocation present. Setup state. */ + NInoSetIndexAllocPresent(ni); + /* Find index allocation attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION " + "attribute is not present but " + "$INDEX_ROOT indicated it is."); + else + ntfs_error(vi->i_sb, "Failed to lookup " + "$INDEX_ALLOCATION " + "attribute."); + goto unm_err_out; + } + a = ctx->attr; + if (!a->non_resident) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is resident."); + goto unm_err_out; + } + /* + * Ensure the attribute name is placed before the mapping pairs + * array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name " + "is placed after the mapping pairs " + "array."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is encrypted."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is sparse."); + goto unm_err_out; + } + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute " + "is compressed."); + goto unm_err_out; + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of " + "$INDEX_ALLOCATION attribute has non " + "zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + /* + * We are done with the mft record, so we release it. Otherwise + * we would deadlock in ntfs_attr_iget(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + /* Get the index bitmap attribute inode. */ + bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4); + if (IS_ERR(bvi)) { + ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bvi); + goto unm_err_out; + } + bni = NTFS_I(bvi); + if (NInoCompressed(bni) || NInoEncrypted(bni) || + NInoSparse(bni)) { + ntfs_error(vi->i_sb, "$BITMAP attribute is compressed " + "and/or encrypted and/or sparse."); + goto iput_unm_err_out; + } + /* Consistency check bitmap size vs. index allocation size. */ + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> + ni->itype.index.block_size_bits)) { + ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) " + "for index allocation (0x%llx).", + bvi_size << 3, vi->i_size); + goto iput_unm_err_out; + } + /* No longer need the bitmap attribute inode. */ + iput(bvi); +skip_large_dir_stuff: + /* Setup the operations for this inode. */ + vi->i_op = &ntfs_dir_inode_ops; + vi->i_fop = &ntfs_dir_ops; + vi->i_mapping->a_ops = &ntfs_mst_aops; + } else { + /* It is a file. */ + ntfs_attr_reinit_search_ctx(ctx); + + /* Setup the data attribute, even if not present. */ + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + + /* Find first extent of the unnamed data attribute. */ + err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx); + if (unlikely(err)) { + vi->i_size = ni->initialized_size = + ni->allocated_size = 0; + if (err != -ENOENT) { + ntfs_error(vi->i_sb, "Failed to lookup $DATA " + "attribute."); + goto unm_err_out; + } + /* + * FILE_Secure does not have an unnamed $DATA + * attribute, so we special case it here. + */ + if (vi->i_ino == FILE_Secure) + goto no_data_attr_special_case; + /* + * Most if not all the system files in the $Extend + * system directory do not have unnamed data + * attributes so we need to check if the parent + * directory of the file is FILE_Extend and if it is + * ignore this error. To do this we need to get the + * name of this inode from the mft record as the name + * contains the back reference to the parent directory. + */ + if (ntfs_is_extended_system_file(ctx) > 0) + goto no_data_attr_special_case; + // FIXME: File is corrupt! Hot-fix with empty data + // attribute if recovery option is set. + ntfs_error(vi->i_sb, "$DATA attribute is missing."); + goto unm_err_out; + } + a = ctx->attr; + /* Setup the state. */ + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found " + "compressed data but " + "compression is " + "disabled due to " + "cluster size (%i) > " + "4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) + != ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method " + "or corrupt file."); + goto unm_err_out; + } + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + } + if (a->flags & ATTR_IS_ENCRYPTED) { + if (NInoCompressed(ni)) { + ntfs_error(vi->i_sb, "Found encrypted and " + "compressed data."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (a->non_resident) { + NInoSetNonResident(ni); + if (NInoCompressed(ni) || NInoSparse(ni)) { + if (NInoCompressed(ni) && a->data.non_resident. + compression_unit != 4) { + ntfs_error(vi->i_sb, "Found " + "non-standard " + "compression unit (%u " + "instead of 4). " + "Cannot handle this.", + a->data.non_resident. + compression_unit); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << + (a->data.non_resident. + compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype. + compressed. + block_size) - 1; + ni->itype.compressed.block_clusters = + 1U << a->data. + non_resident. + compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = + 0; + ni->itype.compressed.block_clusters = + 0; + } + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident. + compressed_size); + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of $DATA " + "attribute has non zero " + "lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu( + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + } else { /* Resident attribute. */ + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu( + a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident data attribute " + "is corrupt (size exceeds " + "allocation)."); + goto unm_err_out; + } + } +no_data_attr_special_case: + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + m = NULL; + ctx = NULL; + /* Setup the operations for this inode. */ + vi->i_op = &ntfs_file_inode_ops; + vi->i_fop = &ntfs_file_ops; + vi->i_mapping->a_ops = &ntfs_normal_aops; + if (NInoMstProtected(ni)) + vi->i_mapping->a_ops = &ntfs_mst_aops; + else if (NInoCompressed(ni)) + vi->i_mapping->a_ops = &ntfs_compressed_aops; + } + /* + * The number of 512-byte blocks used on disk (for stat). This is in so + * far inaccurate as it doesn't account for any named streams or other + * special non-resident attributes, but that is how Windows works, too, + * so we are at least consistent with Windows, if not entirely + * consistent with the Linux Way. Doing it the Linux Way would cause a + * significant slowdown as it would involve iterating over all + * attributes in the mft record and adding the allocated/compressed + * sizes of all non-resident attributes present to give us the Linux + * correct size that should go into i_blocks (after division by 512). + */ + if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni))) + vi->i_blocks = ni->itype.compressed.size >> 9; + else + vi->i_blocks = ni->allocated_size >> 9; + ntfs_debug("Done."); + return 0; +iput_unm_err_out: + iput(bvi); +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt " + "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino); + make_bad_inode(vi); + if (err != -EOPNOTSUPP && err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_read_locked_attr_inode - read an attribute inode from its base inode + * @base_vi: base inode + * @vi: attribute inode to read + * + * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the + * attribute inode described by @vi into memory from the base mft record + * described by @base_ni. + * + * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for + * reading and looks up the attribute described by @vi before setting up the + * necessary fields in @vi as well as initializing the ntfs inode. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + * + * Note this cannot be called for AT_INDEX_ALLOCATION. + */ +static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) +{ + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni, *base_ni; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + + ntfs_init_big_inode(vi); + + ni = NTFS_I(vi); + base_ni = NTFS_I(base_vi); + + /* Just mirror the values from the base inode. */ + vi->i_uid = base_vi->i_uid; + vi->i_gid = base_vi->i_gid; + set_nlink(vi, base_vi->i_nlink); + vi->i_mtime = base_vi->i_mtime; + vi->i_ctime = base_vi->i_ctime; + vi->i_atime = base_vi->i_atime; + vi->i_generation = ni->seq_no = base_ni->seq_no; + + /* Set inode type to zero but preserve permissions. */ + vi->i_mode = base_vi->i_mode & ~S_IFMT; + + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + /* Find the attribute. */ + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) + goto unm_err_out; + a = ctx->attr; + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) { + if (a->flags & ATTR_COMPRESSION_MASK) { + NInoSetCompressed(ni); + if ((ni->type != AT_DATA) || (ni->type == AT_DATA && + ni->name_len)) { + ntfs_error(vi->i_sb, "Found compressed " + "non-data or named data " + "attribute. Please report " + "you saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + goto unm_err_out; + } + if (vol->cluster_size > 4096) { + ntfs_error(vi->i_sb, "Found compressed " + "attribute but compression is " + "disabled due to cluster size " + "(%i) > 4kiB.", + vol->cluster_size); + goto unm_err_out; + } + if ((a->flags & ATTR_COMPRESSION_MASK) != + ATTR_IS_COMPRESSED) { + ntfs_error(vi->i_sb, "Found unknown " + "compression method."); + goto unm_err_out; + } + } + /* + * The compressed/sparse flag set in an index root just means + * to compress all files. + */ + if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is %s. Please " + "report you saw this message to " + "linux-ntfs-dev@lists.sourceforge.net", + NInoCompressed(ni) ? "compressed" : + "sparse"); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) + NInoSetSparse(ni); + } + if (a->flags & ATTR_IS_ENCRYPTED) { + if (NInoCompressed(ni)) { + ntfs_error(vi->i_sb, "Found encrypted and compressed " + "data."); + goto unm_err_out; + } + /* + * The encryption flag set in an index root just means to + * encrypt all files. + */ + if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is encrypted. " + "Please report you saw this message " + "to linux-ntfs-dev@lists.sourceforge." + "net"); + goto unm_err_out; + } + if (ni->type != AT_DATA) { + ntfs_error(vi->i_sb, "Found encrypted non-data " + "attribute."); + goto unm_err_out; + } + NInoSetEncrypted(ni); + } + if (!a->non_resident) { + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "Attribute name is placed after " + "the attribute value."); + goto unm_err_out; + } + if (NInoMstProtected(ni)) { + ntfs_error(vi->i_sb, "Found mst protected attribute " + "but the attribute is resident. " + "Please report you saw this message to " + "linux-ntfs-dev@lists.sourceforge.net"); + goto unm_err_out; + } + vi->i_size = ni->initialized_size = le32_to_cpu( + a->data.resident.value_length); + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + if (vi->i_size > ni->allocated_size) { + ntfs_error(vi->i_sb, "Resident attribute is corrupt " + "(size exceeds allocation)."); + goto unm_err_out; + } + } else { + NInoSetNonResident(ni); + /* + * Ensure the attribute name is placed before the mapping pairs + * array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "Attribute name is placed after " + "the mapping pairs array."); + goto unm_err_out; + } + if (NInoCompressed(ni) || NInoSparse(ni)) { + if (NInoCompressed(ni) && a->data.non_resident. + compression_unit != 4) { + ntfs_error(vi->i_sb, "Found non-standard " + "compression unit (%u instead " + "of 4). Cannot handle this.", + a->data.non_resident. + compression_unit); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (a->data.non_resident.compression_unit) { + ni->itype.compressed.block_size = 1U << + (a->data.non_resident. + compression_unit + + vol->cluster_size_bits); + ni->itype.compressed.block_size_bits = + ffs(ni->itype.compressed. + block_size) - 1; + ni->itype.compressed.block_clusters = 1U << + a->data.non_resident. + compression_unit; + } else { + ni->itype.compressed.block_size = 0; + ni->itype.compressed.block_size_bits = 0; + ni->itype.compressed.block_clusters = 0; + } + ni->itype.compressed.size = sle64_to_cpu( + a->data.non_resident.compressed_size); + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of attribute has " + "non-zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + } + vi->i_mapping->a_ops = &ntfs_normal_aops; + if (NInoMstProtected(ni)) + vi->i_mapping->a_ops = &ntfs_mst_aops; + else if (NInoCompressed(ni)) + vi->i_mapping->a_ops = &ntfs_compressed_aops; + if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT) + vi->i_blocks = ni->itype.compressed.size >> 9; + else + vi->i_blocks = ni->allocated_size >> 9; + /* + * Make sure the base inode does not go away and attach it to the + * attribute inode. + */ + igrab(base_vi); + ni->ext.base_ntfs_ino = base_ni; + ni->nr_extents = -1; + + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + + ntfs_debug("Done."); + return 0; + +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i while reading attribute " + "inode (mft_no 0x%lx, type 0x%x, name_len %i). " + "Marking corrupt inode and base inode 0x%lx as bad. " + "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len, + base_vi->i_ino); + make_bad_inode(vi); + if (err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_read_locked_index_inode - read an index inode from its base inode + * @base_vi: base inode + * @vi: index inode to read + * + * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the + * index inode described by @vi into memory from the base mft record described + * by @base_ni. + * + * ntfs_read_locked_index_inode() maps, pins and locks the base inode for + * reading and looks up the attributes relating to the index described by @vi + * before setting up the necessary fields in @vi as well as initializing the + * ntfs inode. + * + * Note, index inodes are essentially attribute inodes (NInoAttr() is true) + * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they + * are setup like directory inodes since directories are a special case of + * indices ao they need to be treated in much the same way. Most importantly, + * for small indices the index allocation attribute might not actually exist. + * However, the index root attribute always exists but this does not need to + * have an inode associated with it and this is why we define a new inode type + * index. Also, like for directories, we need to have an attribute inode for + * the bitmap attribute corresponding to the index allocation attribute and we + * can store this in the appropriate field of the inode, just like we do for + * normal directory inodes. + * + * Q: What locks are held when the function is called? + * A: i_state has I_NEW set, hence the inode is locked, also + * i_count is set to 1, so it is not going to go away + * + * Return 0 on success and -errno on error. In the error case, the inode will + * have had make_bad_inode() executed on it. + */ +static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) +{ + loff_t bvi_size; + ntfs_volume *vol = NTFS_SB(vi->i_sb); + ntfs_inode *ni, *base_ni, *bni; + struct inode *bvi; + MFT_RECORD *m; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + INDEX_ROOT *ir; + u8 *ir_end, *index_end; + int err = 0; + + ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino); + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + base_ni = NTFS_I(base_vi); + /* Just mirror the values from the base inode. */ + vi->i_uid = base_vi->i_uid; + vi->i_gid = base_vi->i_gid; + set_nlink(vi, base_vi->i_nlink); + vi->i_mtime = base_vi->i_mtime; + vi->i_ctime = base_vi->i_ctime; + vi->i_atime = base_vi->i_atime; + vi->i_generation = ni->seq_no = base_ni->seq_no; + /* Set inode type to zero but preserve permissions. */ + vi->i_mode = base_vi->i_mode & ~S_IFMT; + /* Map the mft record for the base inode. */ + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (!ctx) { + err = -ENOMEM; + goto unm_err_out; + } + /* Find the index root attribute. */ + err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is " + "missing."); + goto unm_err_out; + } + a = ctx->attr; + /* Set up the state. */ + if (unlikely(a->non_resident)) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident."); + goto unm_err_out; + } + /* Ensure the attribute name is placed before the value. */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu(a->data.resident.value_offset)))) { + ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed " + "after the attribute value."); + goto unm_err_out; + } + /* + * Compressed/encrypted/sparse index root is not allowed, except for + * directories of course but those are not dealt with here. + */ + if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED | + ATTR_IS_SPARSE)) { + ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index " + "root attribute."); + goto unm_err_out; + } + ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); + ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length); + if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt."); + goto unm_err_out; + } + index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); + if (index_end > ir_end) { + ntfs_error(vi->i_sb, "Index is corrupt."); + goto unm_err_out; + } + if (ir->type) { + ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).", + le32_to_cpu(ir->type)); + goto unm_err_out; + } + ni->itype.index.collation_rule = ir->collation_rule; + ntfs_debug("Index collation rule is 0x%x.", + le32_to_cpu(ir->collation_rule)); + ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); + if (!is_power_of_2(ni->itype.index.block_size)) { + ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " + "two.", ni->itype.index.block_size); + goto unm_err_out; + } + if (ni->itype.index.block_size > PAGE_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE " + "(%ld) is not supported. Sorry.", + ni->itype.index.block_size, PAGE_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) { + ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE " + "(%i) is not supported. Sorry.", + ni->itype.index.block_size, NTFS_BLOCK_SIZE); + err = -EOPNOTSUPP; + goto unm_err_out; + } + ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1; + /* Determine the size of a vcn in the index. */ + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = vol->sector_size_bits; + } + /* Check for presence of index allocation attribute. */ + if (!(ir->index.flags & LARGE_INDEX)) { + /* No index allocation. */ + vi->i_size = ni->initialized_size = ni->allocated_size = 0; + /* We are done with the mft record, so we release it. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + m = NULL; + ctx = NULL; + goto skip_large_index_stuff; + } /* LARGE_INDEX: Index allocation present. Setup state. */ + NInoSetIndexAllocPresent(ni); + /* Find index allocation attribute. */ + ntfs_attr_reinit_search_ctx(ctx); + err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "not present but $INDEX_ROOT " + "indicated it is."); + else + ntfs_error(vi->i_sb, "Failed to lookup " + "$INDEX_ALLOCATION attribute."); + goto unm_err_out; + } + a = ctx->attr; + if (!a->non_resident) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "resident."); + goto unm_err_out; + } + /* + * Ensure the attribute name is placed before the mapping pairs array. + */ + if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >= + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset)))) { + ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is " + "placed after the mapping pairs array."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "encrypted."); + goto unm_err_out; + } + if (a->flags & ATTR_IS_SPARSE) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse."); + goto unm_err_out; + } + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is " + "compressed."); + goto unm_err_out; + } + if (a->data.non_resident.lowest_vcn) { + ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION " + "attribute has non zero lowest_vcn."); + goto unm_err_out; + } + vi->i_size = sle64_to_cpu(a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size); + /* + * We are done with the mft record, so we release it. Otherwise + * we would deadlock in ntfs_attr_iget(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + m = NULL; + ctx = NULL; + /* Get the index bitmap attribute inode. */ + bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len); + if (IS_ERR(bvi)) { + ntfs_error(vi->i_sb, "Failed to get bitmap attribute."); + err = PTR_ERR(bvi); + goto unm_err_out; + } + bni = NTFS_I(bvi); + if (NInoCompressed(bni) || NInoEncrypted(bni) || + NInoSparse(bni)) { + ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or " + "encrypted and/or sparse."); + goto iput_unm_err_out; + } + /* Consistency check bitmap size vs. index allocation size. */ + bvi_size = i_size_read(bvi); + if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) { + ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for " + "index allocation (0x%llx).", bvi_size << 3, + vi->i_size); + goto iput_unm_err_out; + } + iput(bvi); +skip_large_index_stuff: + /* Setup the operations for this index inode. */ + vi->i_mapping->a_ops = &ntfs_mst_aops; + vi->i_blocks = ni->allocated_size >> 9; + /* + * Make sure the base inode doesn't go away and attach it to the + * index inode. + */ + igrab(base_vi); + ni->ext.base_ntfs_ino = base_ni; + ni->nr_extents = -1; + + ntfs_debug("Done."); + return 0; +iput_unm_err_out: + iput(bvi); +unm_err_out: + if (!err) + err = -EIO; + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); +err_out: + ntfs_error(vi->i_sb, "Failed with error code %i while reading index " + "inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino, + ni->name_len); + make_bad_inode(vi); + if (err != -EOPNOTSUPP && err != -ENOMEM) + NVolSetErrors(vol); + return err; +} + +/* + * The MFT inode has special locking, so teach the lock validator + * about this by splitting off the locking rules of the MFT from + * the locking rules of other inodes. The MFT inode can never be + * accessed from the VFS side (or even internally), only by the + * map_mft functions. + */ +static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key; + +/** + * ntfs_read_inode_mount - special read_inode for mount time use only + * @vi: inode to read + * + * Read inode FILE_MFT at mount time, only called with super_block lock + * held from within the read_super() code path. + * + * This function exists because when it is called the page cache for $MFT/$DATA + * is not initialized and hence we cannot get at the contents of mft records + * by calling map_mft_record*(). + * + * Further it needs to cope with the circular references problem, i.e. cannot + * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because + * we do not know where the other extent mft records are yet and again, because + * we cannot call map_mft_record*() yet. Obviously this applies only when an + * attribute list is actually present in $MFT inode. + * + * We solve these problems by starting with the $DATA attribute before anything + * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each + * extent is found, we ntfs_mapping_pairs_decompress() including the implied + * ntfs_runlists_merge(). Each step of the iteration necessarily provides + * sufficient information for the next step to complete. + * + * This should work but there are two possible pit falls (see inline comments + * below), but only time will tell if they are real pits or just smoke... + */ +int ntfs_read_inode_mount(struct inode *vi) +{ + VCN next_vcn, last_vcn, highest_vcn; + s64 block; + struct super_block *sb = vi->i_sb; + ntfs_volume *vol = NTFS_SB(sb); + struct buffer_head *bh; + ntfs_inode *ni; + MFT_RECORD *m = NULL; + ATTR_RECORD *a; + ntfs_attr_search_ctx *ctx; + unsigned int i, nr_blocks; + int err; + + ntfs_debug("Entering."); + + /* Initialize the ntfs specific part of @vi. */ + ntfs_init_big_inode(vi); + + ni = NTFS_I(vi); + + /* Setup the data attribute. It is special as it is mst protected. */ + NInoSetNonResident(ni); + NInoSetMstProtected(ni); + NInoSetSparseDisabled(ni); + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + /* + * This sets up our little cheat allowing us to reuse the async read io + * completion handler for directories. + */ + ni->itype.index.block_size = vol->mft_record_size; + ni->itype.index.block_size_bits = vol->mft_record_size_bits; + + /* Very important! Needed to be able to call map_mft_record*(). */ + vol->mft_ino = vi; + + /* Allocate enough memory to read the first mft record. */ + if (vol->mft_record_size > 64 * 1024) { + ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).", + vol->mft_record_size); + goto err_out; + } + i = vol->mft_record_size; + if (i < sb->s_blocksize) + i = sb->s_blocksize; + m = (MFT_RECORD*)ntfs_malloc_nofs(i); + if (!m) { + ntfs_error(sb, "Failed to allocate buffer for $MFT record 0."); + goto err_out; + } + + /* Determine the first block of the $MFT/$DATA attribute. */ + block = vol->mft_lcn << vol->cluster_size_bits >> + sb->s_blocksize_bits; + nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits; + if (!nr_blocks) + nr_blocks = 1; + + /* Load $MFT/$DATA's first mft record. */ + for (i = 0; i < nr_blocks; i++) { + bh = sb_bread(sb, block++); + if (!bh) { + ntfs_error(sb, "Device read failed."); + goto err_out; + } + memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data, + sb->s_blocksize); + brelse(bh); + } + + if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) { + ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.", + le32_to_cpu(m->bytes_allocated), vol->mft_record_size); + goto err_out; + } + + /* Apply the mst fixups. */ + if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) { + /* FIXME: Try to use the $MFTMirr now. */ + ntfs_error(sb, "MST fixup failed. $MFT is corrupt."); + goto err_out; + } + + /* Sanity check offset to the first attribute */ + if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) { + ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.", + le16_to_cpu(m->attrs_offset)); + goto err_out; + } + + /* Need this to sanity check attribute list references to $MFT. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + + /* Provides read_folio() for map_mft_record(). */ + vi->i_mapping->a_ops = &ntfs_mst_aops; + + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto err_out; + } + + /* Find the attribute list attribute if present. */ + err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx); + if (err) { + if (unlikely(err != -ENOENT)) { + ntfs_error(sb, "Failed to lookup attribute list " + "attribute. You should run chkdsk."); + goto put_err_out; + } + } else /* if (!err) */ { + ATTR_LIST_ENTRY *al_entry, *next_al_entry; + u8 *al_end; + static const char *es = " Not allowed. $MFT is corrupt. " + "You should run chkdsk."; + + ntfs_debug("Attribute list attribute found in $MFT."); + NInoSetAttrList(ni); + a = ctx->attr; + if (a->flags & ATTR_COMPRESSION_MASK) { + ntfs_error(sb, "Attribute list attribute is " + "compressed.%s", es); + goto put_err_out; + } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(sb, "Non-resident attribute list " + "attribute is encrypted/" + "sparse.%s", es); + goto put_err_out; + } + ntfs_warning(sb, "Resident attribute list attribute " + "in $MFT system file is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set."); + } + /* Now allocate memory for the attribute list. */ + ni->attr_list_size = (u32)ntfs_attr_size(a); + if (!ni->attr_list_size) { + ntfs_error(sb, "Attr_list_size is zero"); + goto put_err_out; + } + ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); + if (!ni->attr_list) { + ntfs_error(sb, "Not enough memory to allocate buffer " + "for attribute list."); + goto put_err_out; + } + if (a->non_resident) { + NInoSetAttrListNonResident(ni); + if (a->data.non_resident.lowest_vcn) { + ntfs_error(sb, "Attribute list has non zero " + "lowest_vcn. $MFT is corrupt. " + "You should run chkdsk."); + goto put_err_out; + } + /* Setup the runlist. */ + ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol, + a, NULL); + if (IS_ERR(ni->attr_list_rl.rl)) { + err = PTR_ERR(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + ntfs_error(sb, "Mapping pairs decompression " + "failed with error code %i.", + -err); + goto put_err_out; + } + /* Now load the attribute list. */ + if ((err = load_attribute_list(vol, &ni->attr_list_rl, + ni->attr_list, ni->attr_list_size, + sle64_to_cpu(a->data. + non_resident.initialized_size)))) { + ntfs_error(sb, "Failed to load attribute list " + "attribute with error code %i.", + -err); + goto put_err_out; + } + } else /* if (!ctx.attr->non_resident) */ { + if ((u8*)a + le16_to_cpu( + a->data.resident.value_offset) + + le32_to_cpu( + a->data.resident.value_length) > + (u8*)ctx->mrec + vol->mft_record_size) { + ntfs_error(sb, "Corrupt attribute list " + "attribute."); + goto put_err_out; + } + /* Now copy the attribute list. */ + memcpy(ni->attr_list, (u8*)a + le16_to_cpu( + a->data.resident.value_offset), + le32_to_cpu( + a->data.resident.value_length)); + } + /* The attribute list is now setup in memory. */ + /* + * FIXME: I don't know if this case is actually possible. + * According to logic it is not possible but I have seen too + * many weird things in MS software to rely on logic... Thus we + * perform a manual search and make sure the first $MFT/$DATA + * extent is in the base inode. If it is not we abort with an + * error and if we ever see a report of this error we will need + * to do some magic in order to have the necessary mft record + * loaded and in the right place in the page cache. But + * hopefully logic will prevail and this never happens... + */ + al_entry = (ATTR_LIST_ENTRY*)ni->attr_list; + al_end = (u8*)al_entry + ni->attr_list_size; + for (;; al_entry = next_al_entry) { + /* Out of bounds check. */ + if ((u8*)al_entry < ni->attr_list || + (u8*)al_entry > al_end) + goto em_put_err_out; + /* Catch the end of the attribute list. */ + if ((u8*)al_entry == al_end) + goto em_put_err_out; + if (!al_entry->length) + goto em_put_err_out; + if ((u8*)al_entry + 6 > al_end || (u8*)al_entry + + le16_to_cpu(al_entry->length) > al_end) + goto em_put_err_out; + next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry + + le16_to_cpu(al_entry->length)); + if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA)) + goto em_put_err_out; + if (AT_DATA != al_entry->type) + continue; + /* We want an unnamed attribute. */ + if (al_entry->name_length) + goto em_put_err_out; + /* Want the first entry, i.e. lowest_vcn == 0. */ + if (al_entry->lowest_vcn) + goto em_put_err_out; + /* First entry has to be in the base mft record. */ + if (MREF_LE(al_entry->mft_reference) != vi->i_ino) { + /* MFT references do not match, logic fails. */ + ntfs_error(sb, "BUG: The first $DATA extent " + "of $MFT is not in the base " + "mft record. Please report " + "you saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + goto put_err_out; + } else { + /* Sequence numbers must match. */ + if (MSEQNO_LE(al_entry->mft_reference) != + ni->seq_no) + goto em_put_err_out; + /* Got it. All is ok. We can stop now. */ + break; + } + } + } + + ntfs_attr_reinit_search_ctx(ctx); + + /* Now load all attribute extents. */ + a = NULL; + next_vcn = last_vcn = highest_vcn = 0; + while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0, + ctx))) { + runlist_element *nrl; + + /* Cache the current attribute. */ + a = ctx->attr; + /* $MFT must be non-resident. */ + if (!a->non_resident) { + ntfs_error(sb, "$MFT must be non-resident but a " + "resident extent was found. $MFT is " + "corrupt. Run chkdsk."); + goto put_err_out; + } + /* $MFT must be uncompressed and unencrypted. */ + if (a->flags & ATTR_COMPRESSION_MASK || + a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + ntfs_error(sb, "$MFT must be uncompressed, " + "non-sparse, and unencrypted but a " + "compressed/sparse/encrypted extent " + "was found. $MFT is corrupt. Run " + "chkdsk."); + goto put_err_out; + } + /* + * Decompress the mapping pairs array of this extent and merge + * the result into the existing runlist. No need for locking + * as we have exclusive access to the inode at this time and we + * are a mount in progress task, too. + */ + nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl); + if (IS_ERR(nrl)) { + ntfs_error(sb, "ntfs_mapping_pairs_decompress() " + "failed with error code %ld. $MFT is " + "corrupt.", PTR_ERR(nrl)); + goto put_err_out; + } + ni->runlist.rl = nrl; + + /* Are we in the first extent? */ + if (!next_vcn) { + if (a->data.non_resident.lowest_vcn) { + ntfs_error(sb, "First extent of $DATA " + "attribute has non zero " + "lowest_vcn. $MFT is corrupt. " + "You should run chkdsk."); + goto put_err_out; + } + /* Get the last vcn in the $DATA attribute. */ + last_vcn = sle64_to_cpu( + a->data.non_resident.allocated_size) + >> vol->cluster_size_bits; + /* Fill in the inode size. */ + vi->i_size = sle64_to_cpu( + a->data.non_resident.data_size); + ni->initialized_size = sle64_to_cpu( + a->data.non_resident.initialized_size); + ni->allocated_size = sle64_to_cpu( + a->data.non_resident.allocated_size); + /* + * Verify the number of mft records does not exceed + * 2^32 - 1. + */ + if ((vi->i_size >> vol->mft_record_size_bits) >= + (1ULL << 32)) { + ntfs_error(sb, "$MFT is too big! Aborting."); + goto put_err_out; + } + /* + * We have got the first extent of the runlist for + * $MFT which means it is now relatively safe to call + * the normal ntfs_read_inode() function. + * Complete reading the inode, this will actually + * re-read the mft record for $MFT, this time entering + * it into the page cache with which we complete the + * kick start of the volume. It should be safe to do + * this now as the first extent of $MFT/$DATA is + * already known and we would hope that we don't need + * further extents in order to find the other + * attributes belonging to $MFT. Only time will tell if + * this is really the case. If not we will have to play + * magic at this point, possibly duplicating a lot of + * ntfs_read_inode() at this point. We will need to + * ensure we do enough of its work to be able to call + * ntfs_read_inode() on extents of $MFT/$DATA. But lets + * hope this never happens... + */ + ntfs_read_locked_inode(vi); + if (is_bad_inode(vi)) { + ntfs_error(sb, "ntfs_read_inode() of $MFT " + "failed. BUG or corrupt $MFT. " + "Run chkdsk and if no errors " + "are found, please report you " + "saw this message to " + "linux-ntfs-dev@lists." + "sourceforge.net"); + ntfs_attr_put_search_ctx(ctx); + /* Revert to the safe super operations. */ + ntfs_free(m); + return -1; + } + /* + * Re-initialize some specifics about $MFT's inode as + * ntfs_read_inode() will have set up the default ones. + */ + /* Set uid and gid to root. */ + vi->i_uid = GLOBAL_ROOT_UID; + vi->i_gid = GLOBAL_ROOT_GID; + /* Regular file. No access for anyone. */ + vi->i_mode = S_IFREG; + /* No VFS initiated operations allowed for $MFT. */ + vi->i_op = &ntfs_empty_inode_ops; + vi->i_fop = &ntfs_empty_file_ops; + } + + /* Get the lowest vcn for the next extent. */ + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + next_vcn = highest_vcn + 1; + + /* Only one extent or error, which we catch below. */ + if (next_vcn <= 0) + break; + + /* Avoid endless loops due to corruption. */ + if (next_vcn < sle64_to_cpu( + a->data.non_resident.lowest_vcn)) { + ntfs_error(sb, "$MFT has corrupt attribute list " + "attribute. Run chkdsk."); + goto put_err_out; + } + } + if (err != -ENOENT) { + ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. " + "$MFT is corrupt. Run chkdsk."); + goto put_err_out; + } + if (!a) { + ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is " + "corrupt. Run chkdsk."); + goto put_err_out; + } + if (highest_vcn && highest_vcn != last_vcn - 1) { + ntfs_error(sb, "Failed to load the complete runlist for " + "$MFT/$DATA. Driver bug or corrupt $MFT. " + "Run chkdsk."); + ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx", + (unsigned long long)highest_vcn, + (unsigned long long)last_vcn - 1); + goto put_err_out; + } + ntfs_attr_put_search_ctx(ctx); + ntfs_debug("Done."); + ntfs_free(m); + + /* + * Split the locking rules of the MFT inode from the + * locking rules of other inodes: + */ + lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key); + lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key); + + return 0; + +em_put_err_out: + ntfs_error(sb, "Couldn't find first extent of $DATA attribute in " + "attribute list. $MFT is corrupt. Run chkdsk."); +put_err_out: + ntfs_attr_put_search_ctx(ctx); +err_out: + ntfs_error(sb, "Failed. Marking inode as bad."); + make_bad_inode(vi); + ntfs_free(m); + return -1; +} + +static void __ntfs_clear_inode(ntfs_inode *ni) +{ + /* Free all alocated memory. */ + down_write(&ni->runlist.lock); + if (ni->runlist.rl) { + ntfs_free(ni->runlist.rl); + ni->runlist.rl = NULL; + } + up_write(&ni->runlist.lock); + + if (ni->attr_list) { + ntfs_free(ni->attr_list); + ni->attr_list = NULL; + } + + down_write(&ni->attr_list_rl.lock); + if (ni->attr_list_rl.rl) { + ntfs_free(ni->attr_list_rl.rl); + ni->attr_list_rl.rl = NULL; + } + up_write(&ni->attr_list_rl.lock); + + if (ni->name_len && ni->name != I30) { + /* Catch bugs... */ + BUG_ON(!ni->name); + kfree(ni->name); + } +} + +void ntfs_clear_extent_inode(ntfs_inode *ni) +{ + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + + BUG_ON(NInoAttr(ni)); + BUG_ON(ni->nr_extents != -1); + +#ifdef NTFS_RW + if (NInoDirty(ni)) { + if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino))) + ntfs_error(ni->vol->sb, "Clearing dirty extent inode! " + "Losing data! This is a BUG!!!"); + // FIXME: Do something!!! + } +#endif /* NTFS_RW */ + + __ntfs_clear_inode(ni); + + /* Bye, bye... */ + ntfs_destroy_extent_inode(ni); +} + +/** + * ntfs_evict_big_inode - clean up the ntfs specific part of an inode + * @vi: vfs inode pending annihilation + * + * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode() + * is called, which deallocates all memory belonging to the NTFS specific part + * of the inode and returns. + * + * If the MFT record is dirty, we commit it before doing anything else. + */ +void ntfs_evict_big_inode(struct inode *vi) +{ + ntfs_inode *ni = NTFS_I(vi); + + truncate_inode_pages_final(&vi->i_data); + clear_inode(vi); + +#ifdef NTFS_RW + if (NInoDirty(ni)) { + bool was_bad = (is_bad_inode(vi)); + + /* Committing the inode also commits all extent inodes. */ + ntfs_commit_inode(vi); + + if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) { + ntfs_error(vi->i_sb, "Failed to commit dirty inode " + "0x%lx. Losing data!", vi->i_ino); + // FIXME: Do something!!! + } + } +#endif /* NTFS_RW */ + + /* No need to lock at this stage as no one else has a reference. */ + if (ni->nr_extents > 0) { + int i; + + for (i = 0; i < ni->nr_extents; i++) + ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]); + kfree(ni->ext.extent_ntfs_inos); + } + + __ntfs_clear_inode(ni); + + if (NInoAttr(ni)) { + /* Release the base inode if we are holding it. */ + if (ni->nr_extents == -1) { + iput(VFS_I(ni->ext.base_ntfs_ino)); + ni->nr_extents = 0; + ni->ext.base_ntfs_ino = NULL; + } + } + BUG_ON(ni->page); + if (!atomic_dec_and_test(&ni->count)) + BUG(); + return; +} + +/** + * ntfs_show_options - show mount options in /proc/mounts + * @sf: seq_file in which to write our mount options + * @root: root of the mounted tree whose mount options to display + * + * Called by the VFS once for each mounted ntfs volume when someone reads + * /proc/mounts in order to display the NTFS specific mount options of each + * mount. The mount options of fs specified by @root are written to the seq file + * @sf and success is returned. + */ +int ntfs_show_options(struct seq_file *sf, struct dentry *root) +{ + ntfs_volume *vol = NTFS_SB(root->d_sb); + int i; + + seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid)); + seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid)); + if (vol->fmask == vol->dmask) + seq_printf(sf, ",umask=0%o", vol->fmask); + else { + seq_printf(sf, ",fmask=0%o", vol->fmask); + seq_printf(sf, ",dmask=0%o", vol->dmask); + } + seq_printf(sf, ",nls=%s", vol->nls_map->charset); + if (NVolCaseSensitive(vol)) + seq_printf(sf, ",case_sensitive"); + if (NVolShowSystemFiles(vol)) + seq_printf(sf, ",show_sys_files"); + if (!NVolSparseEnabled(vol)) + seq_printf(sf, ",disable_sparse"); + for (i = 0; on_errors_arr[i].val; i++) { + if (on_errors_arr[i].val & vol->on_errors) + seq_printf(sf, ",errors=%s", on_errors_arr[i].str); + } + seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier); + return 0; +} + +#ifdef NTFS_RW + +static const char *es = " Leaving inconsistent metadata. Unmount and run " + "chkdsk."; + +/** + * ntfs_truncate - called when the i_size of an ntfs inode is changed + * @vi: inode for which the i_size was changed + * + * We only support i_size changes for normal files at present, i.e. not + * compressed and not encrypted. This is enforced in ntfs_setattr(), see + * below. + * + * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and + * that the change is allowed. + * + * This implies for us that @vi is a file inode rather than a directory, index, + * or attribute inode as well as that @vi is a base inode. + * + * Returns 0 on success or -errno on error. + * + * Called with ->i_mutex held. + */ +int ntfs_truncate(struct inode *vi) +{ + s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size; + VCN highest_vcn; + unsigned long flags; + ntfs_inode *base_ni, *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + const char *te = " Leaving file length out of sync with i_size."; + int err, mp_size, size_change, alloc_change; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + BUG_ON(NInoAttr(ni)); + BUG_ON(S_ISDIR(vi->i_mode)); + BUG_ON(NInoMstProtected(ni)); + BUG_ON(ni->nr_extents < 0); +retry_truncate: + /* + * Lock the runlist for writing and map the mft record to ensure it is + * safe to mess with the attribute runlist and sizes. + */ + down_write(&ni->runlist.lock); + if (!NInoAttr(ni)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + m = map_mft_record(base_ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx " + "(error code %d).%s", vi->i_ino, err, te); + ctx = NULL; + m = NULL; + goto old_bad_out; + } + ctx = ntfs_attr_get_search_ctx(base_ni, m); + if (unlikely(!ctx)) { + ntfs_error(vi->i_sb, "Failed to allocate a search context for " + "inode 0x%lx (not enough memory).%s", + vi->i_ino, te); + err = -ENOMEM; + goto old_bad_out; + } + err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + if (err == -ENOENT) { + ntfs_error(vi->i_sb, "Open attribute is missing from " + "mft record. Inode 0x%lx is corrupt. " + "Run chkdsk.%s", vi->i_ino, te); + err = -EIO; + } else + ntfs_error(vi->i_sb, "Failed to lookup attribute in " + "inode 0x%lx (error code %d).%s", + vi->i_ino, err, te); + goto old_bad_out; + } + m = ctx->mrec; + a = ctx->attr; + /* + * The i_size of the vfs inode is the new size for the attribute value. + */ + new_size = i_size_read(vi); + /* The current size of the attribute value is the old size. */ + old_size = ntfs_attr_size(a); + /* Calculate the new allocated size. */ + if (NInoNonResident(ni)) + new_alloc_size = (new_size + vol->cluster_size - 1) & + ~(s64)vol->cluster_size_mask; + else + new_alloc_size = (new_size + 7) & ~7; + /* The current allocated size is the old allocated size. */ + read_lock_irqsave(&ni->size_lock, flags); + old_alloc_size = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * The change in the file size. This will be 0 if no change, >0 if the + * size is growing, and <0 if the size is shrinking. + */ + size_change = -1; + if (new_size - old_size >= 0) { + size_change = 1; + if (new_size == old_size) + size_change = 0; + } + /* As above for the allocated size. */ + alloc_change = -1; + if (new_alloc_size - old_alloc_size >= 0) { + alloc_change = 1; + if (new_alloc_size == old_alloc_size) + alloc_change = 0; + } + /* + * If neither the size nor the allocation are being changed there is + * nothing to do. + */ + if (!size_change && !alloc_change) + goto unm_done; + /* If the size is changing, check if new size is allowed in $AttrDef. */ + if (size_change) { + err = ntfs_attr_size_bounds_check(vol, ni->type, new_size); + if (unlikely(err)) { + if (err == -ERANGE) { + ntfs_error(vol->sb, "Truncate would cause the " + "inode 0x%lx to %simum size " + "for its attribute type " + "(0x%x). Aborting truncate.", + vi->i_ino, + new_size > old_size ? "exceed " + "the max" : "go under the min", + le32_to_cpu(ni->type)); + err = -EFBIG; + } else { + ntfs_error(vol->sb, "Inode 0x%lx has unknown " + "attribute type 0x%x. " + "Aborting truncate.", + vi->i_ino, + le32_to_cpu(ni->type)); + err = -EIO; + } + /* Reset the vfs inode size to the old size. */ + i_size_write(vi, old_size); + goto err_out; + } + } + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, "Changes in inode size are not " + "supported yet for %s files, ignoring.", + NInoCompressed(ni) ? "compressed" : + "encrypted"); + err = -EOPNOTSUPP; + goto bad_out; + } + if (a->non_resident) + goto do_non_resident_truncate; + BUG_ON(NInoNonResident(ni)); + /* Resize the attribute record to best fit the new attribute size. */ + if (new_size < vol->mft_record_size && + !ntfs_resident_attr_value_resize(m, a, new_size)) { + /* The resize succeeded! */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + write_lock_irqsave(&ni->size_lock, flags); + /* Update the sizes in the ntfs inode and all is done. */ + ni->allocated_size = le32_to_cpu(a->length) - + le16_to_cpu(a->data.resident.value_offset); + /* + * Note ntfs_resident_attr_value_resize() has already done any + * necessary data clearing in the attribute record. When the + * file is being shrunk vmtruncate() will already have cleared + * the top part of the last partial page, i.e. since this is + * the resident case this is the page with index 0. However, + * when the file is being expanded, the page cache page data + * between the old data_size, i.e. old_size, and the new_size + * has not been zeroed. Fortunately, we do not need to zero it + * either since on one hand it will either already be zero due + * to both read_folio and writepage clearing partial page data + * beyond i_size in which case there is nothing to do or in the + * case of the file being mmap()ped at the same time, POSIX + * specifies that the behaviour is unspecified thus we do not + * have to do anything. This means that in our implementation + * in the rare case that the file is mmap()ped and a write + * occurred into the mmap()ped region just beyond the file size + * and writepage has not yet been called to write out the page + * (which would clear the area beyond the file size) and we now + * extend the file size to incorporate this dirty region + * outside the file size, a write of the page would result in + * this data being written to disk instead of being cleared. + * Given both POSIX and the Linux mmap(2) man page specify that + * this corner case is undefined, we choose to leave it like + * that as this is much simpler for us as we cannot lock the + * relevant page now since we are holding too many ntfs locks + * which would result in a lock reversal deadlock. + */ + ni->initialized_size = new_size; + write_unlock_irqrestore(&ni->size_lock, flags); + goto unm_done; + } + /* If the above resize failed, this must be an attribute extension. */ + BUG_ON(size_change < 0); + /* + * We have to drop all the locks so we can call + * ntfs_attr_make_non_resident(). This could be optimised by try- + * locking the first page cache page and only if that fails dropping + * the locks, locking the page, and redoing all the locking and + * lookups. While this would be a huge optimisation, it is not worth + * it as this is definitely a slow code path as it only ever can happen + * once for any given file. + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + /* + * Not enough space in the mft record, try to make the attribute + * non-resident and if successful restart the truncation process. + */ + err = ntfs_attr_make_non_resident(ni, old_size); + if (likely(!err)) + goto retry_truncate; + /* + * Could not make non-resident. If this is due to this not being + * permitted for this attribute type or there not being enough space, + * try to make other attributes non-resident. Otherwise fail. + */ + if (unlikely(err != -EPERM && err != -ENOSPC)) { + ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute " + "type 0x%x, because the conversion from " + "resident to non-resident attribute failed " + "with error code %i.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), err); + if (err != -ENOMEM) + err = -EIO; + goto conv_err_out; + } + /* TODO: Not implemented from here, abort. */ + if (err == -ENOSPC) + ntfs_error(vol->sb, "Not enough space in the mft record/on " + "disk for the non-resident attribute value. " + "This case is not implemented yet."); + else /* if (err == -EPERM) */ + ntfs_error(vol->sb, "This attribute type may not be " + "non-resident. This case is not implemented " + "yet."); + err = -EOPNOTSUPP; + goto conv_err_out; +#if 0 + // TODO: Attempt to make other attributes non-resident. + if (!err) + goto do_resident_extend; + /* + * Both the attribute list attribute and the standard information + * attribute must remain in the base inode. Thus, if this is one of + * these attributes, we have to try to move other attributes out into + * extent mft records instead. + */ + if (ni->type == AT_ATTRIBUTE_LIST || + ni->type == AT_STANDARD_INFORMATION) { + // TODO: Attempt to move other attributes into extent mft + // records. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + goto err_out; + } + // TODO: Attempt to move this attribute to an extent mft record, but + // only if it is not already the only attribute in an mft record in + // which case there would be nothing to gain. + err = -EOPNOTSUPP; + if (!err) + goto do_resident_extend; + /* There is nothing we can do to make enough space. )-: */ + goto err_out; +#endif +do_non_resident_truncate: + BUG_ON(!NInoNonResident(ni)); + if (alloc_change < 0) { + highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn); + if (highest_vcn > 0 && + old_alloc_size >> vol->cluster_size_bits > + highest_vcn + 1) { + /* + * This attribute has multiple extents. Not yet + * supported. + */ + ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, " + "attribute type 0x%x, because the " + "attribute is highly fragmented (it " + "consists of multiple extents) and " + "this case is not implemented yet.", + vi->i_ino, + (unsigned)le32_to_cpu(ni->type)); + err = -EOPNOTSUPP; + goto bad_out; + } + } + /* + * If the size is shrinking, need to reduce the initialized_size and + * the data_size before reducing the allocation. + */ + if (size_change < 0) { + /* + * Make the valid size smaller (i_size is already up-to-date). + */ + write_lock_irqsave(&ni->size_lock, flags); + if (new_size < ni->initialized_size) { + ni->initialized_size = new_size; + a->data.non_resident.initialized_size = + cpu_to_sle64(new_size); + } + a->data.non_resident.data_size = cpu_to_sle64(new_size); + write_unlock_irqrestore(&ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + /* If the allocated size is not changing, we are done. */ + if (!alloc_change) + goto unm_done; + /* + * If the size is shrinking it makes no sense for the + * allocation to be growing. + */ + BUG_ON(alloc_change > 0); + } else /* if (size_change >= 0) */ { + /* + * The file size is growing or staying the same but the + * allocation can be shrinking, growing or staying the same. + */ + if (alloc_change > 0) { + /* + * We need to extend the allocation and possibly update + * the data size. If we are updating the data size, + * since we are not touching the initialized_size we do + * not need to worry about the actual data on disk. + * And as far as the page cache is concerned, there + * will be no pages beyond the old data size and any + * partial region in the last page between the old and + * new data size (or the end of the page if the new + * data size is outside the page) does not need to be + * modified as explained above for the resident + * attribute truncate case. To do this, we simply drop + * the locks we hold and leave all the work to our + * friendly helper ntfs_attr_extend_allocation(). + */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); + err = ntfs_attr_extend_allocation(ni, new_size, + size_change > 0 ? new_size : -1, -1); + /* + * ntfs_attr_extend_allocation() will have done error + * output already. + */ + goto done; + } + if (!alloc_change) + goto alloc_done; + } + /* alloc_change < 0 */ + /* Free the clusters. */ + nr_freed = ntfs_cluster_free(ni, new_alloc_size >> + vol->cluster_size_bits, -1, ctx); + m = ctx->mrec; + a = ctx->attr; + if (unlikely(nr_freed < 0)) { + ntfs_error(vol->sb, "Failed to release cluster(s) (error code " + "%lli). Unmount and run chkdsk to recover " + "the lost cluster(s).", (long long)nr_freed); + NVolSetErrors(vol); + nr_freed = 0; + } + /* Truncate the runlist. */ + err = ntfs_rl_truncate_nolock(vol, &ni->runlist, + new_alloc_size >> vol->cluster_size_bits); + /* + * If the runlist truncation failed and/or the search context is no + * longer valid, we cannot resize the attribute record or build the + * mapping pairs array thus we mark the inode bad so that no access to + * the freed clusters can happen. + */ + if (unlikely(err || IS_ERR(m))) { + ntfs_error(vol->sb, "Failed to %s (error code %li).%s", + IS_ERR(m) ? + "restore attribute search context" : + "truncate attribute runlist", + IS_ERR(m) ? PTR_ERR(m) : err, es); + err = -EIO; + goto bad_out; + } + /* Get the size for the shrunk mapping pairs array for the runlist. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " + "attribute type 0x%x, because determining the " + "size for the mapping pairs failed with error " + "code %i.%s", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), mp_size, es); + err = -EIO; + goto bad_out; + } + /* + * Shrink the attribute record for the new mapping pairs array. Note, + * this cannot fail since we are making the attribute smaller thus by + * definition there is enough space to do so. + */ + err = ntfs_attr_record_resize(m, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + BUG_ON(err); + /* + * Generate the mapping pairs array directly into the attribute record. + */ + err = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, ni->runlist.rl, 0, -1, NULL); + if (unlikely(err)) { + ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, " + "attribute type 0x%x, because building the " + "mapping pairs failed with error code %i.%s", + vi->i_ino, (unsigned)le32_to_cpu(ni->type), + err, es); + err = -EIO; + goto bad_out; + } + /* Update the allocated/compressed size as well as the highest vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >> + vol->cluster_size_bits) - 1); + write_lock_irqsave(&ni->size_lock, flags); + ni->allocated_size = new_alloc_size; + a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size); + if (NInoSparse(ni) || NInoCompressed(ni)) { + if (nr_freed) { + ni->itype.compressed.size -= nr_freed << + vol->cluster_size_bits; + BUG_ON(ni->itype.compressed.size < 0); + a->data.non_resident.compressed_size = cpu_to_sle64( + ni->itype.compressed.size); + vi->i_blocks = ni->itype.compressed.size >> 9; + } + } else + vi->i_blocks = new_alloc_size >> 9; + write_unlock_irqrestore(&ni->size_lock, flags); + /* + * We have shrunk the allocation. If this is a shrinking truncate we + * have already dealt with the initialized_size and the data_size above + * and we are done. If the truncate is only changing the allocation + * and not the data_size, we are also done. If this is an extending + * truncate, need to extend the data_size now which is ensured by the + * fact that @size_change is positive. + */ +alloc_done: + /* + * If the size is growing, need to update it now. If it is shrinking, + * we have already updated it above (before the allocation change). + */ + if (size_change > 0) + a->data.non_resident.data_size = cpu_to_sle64(new_size); + /* Ensure the modified mft record is written out. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); +unm_done: + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +done: + /* Update the mtime and ctime on the base inode. */ + /* normally ->truncate shouldn't update ctime or mtime, + * but ntfs did before so it got a copy & paste version + * of file_update_time. one day someone should fix this + * for real. + */ + if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { + struct timespec64 now = current_time(VFS_I(base_ni)); + int sync_it = 0; + + if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) || + !timespec64_equal(&VFS_I(base_ni)->i_ctime, &now)) + sync_it = 1; + VFS_I(base_ni)->i_mtime = now; + VFS_I(base_ni)->i_ctime = now; + + if (sync_it) + mark_inode_dirty_sync(VFS_I(base_ni)); + } + + if (likely(!err)) { + NInoClearTruncateFailed(ni); + ntfs_debug("Done."); + } + return err; +old_bad_out: + old_size = -1; +bad_out: + if (err != -ENOMEM && err != -EOPNOTSUPP) + NVolSetErrors(vol); + if (err != -EOPNOTSUPP) + NInoSetTruncateFailed(ni); + else if (old_size >= 0) + i_size_write(vi, old_size); +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(base_ni); + up_write(&ni->runlist.lock); +out: + ntfs_debug("Failed. Returning error code %i.", err); + return err; +conv_err_out: + if (err != -ENOMEM && err != -EOPNOTSUPP) + NVolSetErrors(vol); + if (err != -EOPNOTSUPP) + NInoSetTruncateFailed(ni); + else + i_size_write(vi, old_size); + goto out; +} + +/** + * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value + * @vi: inode for which the i_size was changed + * + * Wrapper for ntfs_truncate() that has no return value. + * + * See ntfs_truncate() description above for details. + */ +#ifdef NTFS_RW +void ntfs_truncate_vfs(struct inode *vi) { + ntfs_truncate(vi); +} +#endif + +/** + * ntfs_setattr - called from notify_change() when an attribute is being changed + * @mnt_userns: user namespace of the mount the inode was found from + * @dentry: dentry whose attributes to change + * @attr: structure describing the attributes and the changes + * + * We have to trap VFS attempts to truncate the file described by @dentry as + * soon as possible, because we do not implement changes in i_size yet. So we + * abort all i_size changes here. + * + * We also abort all changes of user, group, and mode as we do not implement + * the NTFS ACLs yet. + * + * Called with ->i_mutex held. + */ +int ntfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) +{ + struct inode *vi = d_inode(dentry); + int err; + unsigned int ia_valid = attr->ia_valid; + + err = setattr_prepare(&init_user_ns, dentry, attr); + if (err) + goto out; + /* We do not support NTFS ACLs yet. */ + if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) { + ntfs_warning(vi->i_sb, "Changes in user/group/mode are not " + "supported yet, ignoring."); + err = -EOPNOTSUPP; + goto out; + } + if (ia_valid & ATTR_SIZE) { + if (attr->ia_size != i_size_read(vi)) { + ntfs_inode *ni = NTFS_I(vi); + /* + * FIXME: For now we do not support resizing of + * compressed or encrypted files yet. + */ + if (NInoCompressed(ni) || NInoEncrypted(ni)) { + ntfs_warning(vi->i_sb, "Changes in inode size " + "are not supported yet for " + "%s files, ignoring.", + NInoCompressed(ni) ? + "compressed" : "encrypted"); + err = -EOPNOTSUPP; + } else { + truncate_setsize(vi, attr->ia_size); + ntfs_truncate_vfs(vi); + } + if (err || ia_valid == ATTR_SIZE) + goto out; + } else { + /* + * We skipped the truncate but must still update + * timestamps. + */ + ia_valid |= ATTR_MTIME | ATTR_CTIME; + } + } + if (ia_valid & ATTR_ATIME) + vi->i_atime = attr->ia_atime; + if (ia_valid & ATTR_MTIME) + vi->i_mtime = attr->ia_mtime; + if (ia_valid & ATTR_CTIME) + vi->i_ctime = attr->ia_ctime; + mark_inode_dirty(vi); +out: + return err; +} + +/** + * ntfs_write_inode - write out a dirty inode + * @vi: inode to write out + * @sync: if true, write out synchronously + * + * Write out a dirty inode to disk including any extent inodes if present. + * + * If @sync is true, commit the inode to disk and wait for io completion. This + * is done using write_mft_record(). + * + * If @sync is false, just schedule the write to happen but do not wait for i/o + * completion. In 2.6 kernels, scheduling usually happens just by virtue of + * marking the page (and in this case mft record) dirty but we do not implement + * this yet as write_mft_record() largely ignores the @sync parameter and + * always performs synchronous writes. + * + * Return 0 on success and -errno on error. + */ +int __ntfs_write_inode(struct inode *vi, int sync) +{ + sle64 nt; + ntfs_inode *ni = NTFS_I(vi); + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + STANDARD_INFORMATION *si; + int err = 0; + bool modified = false; + + ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "", + vi->i_ino); + /* + * Dirty attribute inodes are written via their real inodes so just + * clean them here. Access time updates are taken care off when the + * real inode is written. + */ + if (NInoAttr(ni)) { + NInoClearDirty(ni); + ntfs_debug("Done."); + return 0; + } + /* Map, pin, and lock the mft record belonging to the inode. */ + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + /* Update the access times in the standard information attribute. */ + ctx = ntfs_attr_get_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto unm_err_out; + } + err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + ntfs_attr_put_search_ctx(ctx); + goto unm_err_out; + } + si = (STANDARD_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + /* Update the access times if they have changed. */ + nt = utc2ntfs(vi->i_mtime); + if (si->last_data_change_time != nt) { + ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, (long long) + sle64_to_cpu(si->last_data_change_time), + (long long)sle64_to_cpu(nt)); + si->last_data_change_time = nt; + modified = true; + } + nt = utc2ntfs(vi->i_ctime); + if (si->last_mft_change_time != nt) { + ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, (long long) + sle64_to_cpu(si->last_mft_change_time), + (long long)sle64_to_cpu(nt)); + si->last_mft_change_time = nt; + modified = true; + } + nt = utc2ntfs(vi->i_atime); + if (si->last_access_time != nt) { + ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " + "new = 0x%llx", vi->i_ino, + (long long)sle64_to_cpu(si->last_access_time), + (long long)sle64_to_cpu(nt)); + si->last_access_time = nt; + modified = true; + } + /* + * If we just modified the standard information attribute we need to + * mark the mft record it is in dirty. We do this manually so that + * mark_inode_dirty() is not called which would redirty the inode and + * hence result in an infinite loop of trying to write the inode. + * There is no need to mark the base inode nor the base mft record + * dirty, since we are going to write this mft record below in any case + * and the base mft record may actually not have been modified so it + * might not need to be written out. + * NOTE: It is not a problem when the inode for $MFT itself is being + * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES + * on the $MFT inode and hence ntfs_write_inode() will not be + * re-invoked because of it which in turn is ok since the dirtied mft + * record will be cleaned and written out to disk below, i.e. before + * this function returns. + */ + if (modified) { + flush_dcache_mft_record_page(ctx->ntfs_ino); + if (!NInoTestSetDirty(ctx->ntfs_ino)) + mark_ntfs_record_dirty(ctx->ntfs_ino->page, + ctx->ntfs_ino->page_ofs); + } + ntfs_attr_put_search_ctx(ctx); + /* Now the access times are updated, write the base mft record. */ + if (NInoDirty(ni)) + err = write_mft_record(ni, m, sync); + /* Write all attached extent mft records. */ + mutex_lock(&ni->extent_lock); + if (ni->nr_extents > 0) { + ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos; + int i; + + ntfs_debug("Writing %i extent inodes.", ni->nr_extents); + for (i = 0; i < ni->nr_extents; i++) { + ntfs_inode *tni = extent_nis[i]; + + if (NInoDirty(tni)) { + MFT_RECORD *tm = map_mft_record(tni); + int ret; + + if (IS_ERR(tm)) { + if (!err || err == -ENOMEM) + err = PTR_ERR(tm); + continue; + } + ret = write_mft_record(tni, tm, sync); + unmap_mft_record(tni); + if (unlikely(ret)) { + if (!err || err == -ENOMEM) + err = ret; + } + } + } + } + mutex_unlock(&ni->extent_lock); + unmap_mft_record(ni); + if (unlikely(err)) + goto err_out; + ntfs_debug("Done."); + return 0; +unm_err_out: + unmap_mft_record(ni); +err_out: + if (err == -ENOMEM) { + ntfs_warning(vi->i_sb, "Not enough memory to write inode. " + "Marking the inode dirty again, so the VFS " + "retries later."); + mark_inode_dirty(vi); + } else { + ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err); + NVolSetErrors(ni->vol); + } + return err; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h new file mode 100644 index 000000000..6f78ee00f --- /dev/null +++ b/fs/ntfs/inode.h @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_INODE_H +#define _LINUX_NTFS_INODE_H + +#include <linux/atomic.h> + +#include <linux/fs.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/seq_file.h> + +#include "layout.h" +#include "volume.h" +#include "types.h" +#include "runlist.h" +#include "debug.h" + +typedef struct _ntfs_inode ntfs_inode; + +/* + * The NTFS in-memory inode structure. It is just used as an extension to the + * fields already provided in the VFS inode. + */ +struct _ntfs_inode { + rwlock_t size_lock; /* Lock serializing access to inode sizes. */ + s64 initialized_size; /* Copy from the attribute record. */ + s64 allocated_size; /* Copy from the attribute record. */ + unsigned long state; /* NTFS specific flags describing this inode. + See ntfs_inode_state_bits below. */ + unsigned long mft_no; /* Number of the mft record / inode. */ + u16 seq_no; /* Sequence number of the mft record. */ + atomic_t count; /* Inode reference count for book keeping. */ + ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */ + /* + * If NInoAttr() is true, the below fields describe the attribute which + * this fake inode belongs to. The actual inode of this attribute is + * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see + * below). For real inodes, we also set the type (AT_DATA for files and + * AT_INDEX_ALLOCATION for directories), with the name = NULL and + * name_len = 0 for files and name = I30 (global constant) and + * name_len = 4 for directories. + */ + ATTR_TYPE type; /* Attribute type of this fake inode. */ + ntfschar *name; /* Attribute name of this fake inode. */ + u32 name_len; /* Attribute name length of this fake inode. */ + runlist runlist; /* If state has the NI_NonResident bit set, + the runlist of the unnamed data attribute + (if a file) or of the index allocation + attribute (directory) or of the attribute + described by the fake inode (if NInoAttr()). + If runlist.rl is NULL, the runlist has not + been read in yet or has been unmapped. If + NI_NonResident is clear, the attribute is + resident (file and fake inode) or there is + no $I30 index allocation attribute + (small directory). In the latter case + runlist.rl is always NULL.*/ + /* + * The following fields are only valid for real inodes and extent + * inodes. + */ + struct mutex mrec_lock; /* Lock for serializing access to the + mft record belonging to this inode. */ + struct page *page; /* The page containing the mft record of the + inode. This should only be touched by the + (un)map_mft_record*() functions. */ + int page_ofs; /* Offset into the page at which the mft record + begins. This should only be touched by the + (un)map_mft_record*() functions. */ + /* + * Attribute list support (only for use by the attribute lookup + * functions). Setup during read_inode for all inodes with attribute + * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is + * further only valid if NI_AttrListNonResident is set. + */ + u32 attr_list_size; /* Length of attribute list value in bytes. */ + u8 *attr_list; /* Attribute list value itself. */ + runlist attr_list_rl; /* Run list for the attribute list value. */ + union { + struct { /* It is a directory, $MFT, or an index inode. */ + u32 block_size; /* Size of an index block. */ + u32 vcn_size; /* Size of a vcn in this + index. */ + COLLATION_RULE collation_rule; /* The collation rule + for the index. */ + u8 block_size_bits; /* Log2 of the above. */ + u8 vcn_size_bits; /* Log2 of the above. */ + } index; + struct { /* It is a compressed/sparse file/attribute inode. */ + s64 size; /* Copy of compressed_size from + $DATA. */ + u32 block_size; /* Size of a compression block + (cb). */ + u8 block_size_bits; /* Log2 of the size of a cb. */ + u8 block_clusters; /* Number of clusters per cb. */ + } compressed; + } itype; + struct mutex extent_lock; /* Lock for accessing/modifying the + below . */ + s32 nr_extents; /* For a base mft record, the number of attached extent + inodes (0 if none), for extent records and for fake + inodes describing an attribute this is -1. */ + union { /* This union is only used if nr_extents != 0. */ + ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of + the ntfs inodes of the extent + mft records belonging to + this base inode which have + been loaded. */ + ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the + ntfs inode of the base mft + record. For fake inodes, the + real (base) inode to which + the attribute belongs. */ + } ext; +}; + +/* + * Defined bits for the state field in the ntfs_inode structure. + * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only + */ +typedef enum { + NI_Dirty, /* 1: Mft record needs to be written to disk. */ + NI_AttrList, /* 1: Mft record contains an attribute list. */ + NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies + NI_AttrList is set. */ + + NI_Attr, /* 1: Fake inode for attribute i/o. + 0: Real inode or extent inode. */ + + NI_MstProtected, /* 1: Attribute is protected by MST fixups. + 0: Attribute is not protected by fixups. */ + NI_NonResident, /* 1: Unnamed data attr is non-resident (f). + 1: Attribute is non-resident (a). */ + NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is + present (d). */ + NI_Compressed, /* 1: Unnamed data attr is compressed (f). + 1: Create compressed files by default (d). + 1: Attribute is compressed (a). */ + NI_Encrypted, /* 1: Unnamed data attr is encrypted (f). + 1: Create encrypted files by default (d). + 1: Attribute is encrypted (a). */ + NI_Sparse, /* 1: Unnamed data attr is sparse (f). + 1: Create sparse files by default (d). + 1: Attribute is sparse (a). */ + NI_SparseDisabled, /* 1: May not create sparse regions. */ + NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */ +} ntfs_inode_state_bits; + +/* + * NOTE: We should be adding dirty mft records to a list somewhere and they + * should be independent of the (ntfs/vfs) inode structure so that an inode can + * be removed but the record can be left dirty for syncing later. + */ + +/* + * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo() + * functions. + */ +#define NINO_FNS(flag) \ +static inline int NIno##flag(ntfs_inode *ni) \ +{ \ + return test_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoSet##flag(ntfs_inode *ni) \ +{ \ + set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline void NInoClear##flag(ntfs_inode *ni) \ +{ \ + clear_bit(NI_##flag, &(ni)->state); \ +} + +/* + * As above for NInoTestSetFoo() and NInoTestClearFoo(). + */ +#define TAS_NINO_FNS(flag) \ +static inline int NInoTestSet##flag(ntfs_inode *ni) \ +{ \ + return test_and_set_bit(NI_##flag, &(ni)->state); \ +} \ +static inline int NInoTestClear##flag(ntfs_inode *ni) \ +{ \ + return test_and_clear_bit(NI_##flag, &(ni)->state); \ +} + +/* Emit the ntfs inode bitops functions. */ +NINO_FNS(Dirty) +TAS_NINO_FNS(Dirty) +NINO_FNS(AttrList) +NINO_FNS(AttrListNonResident) +NINO_FNS(Attr) +NINO_FNS(MstProtected) +NINO_FNS(NonResident) +NINO_FNS(IndexAllocPresent) +NINO_FNS(Compressed) +NINO_FNS(Encrypted) +NINO_FNS(Sparse) +NINO_FNS(SparseDisabled) +NINO_FNS(TruncateFailed) + +/* + * The full structure containing a ntfs_inode and a vfs struct inode. Used for + * all real and fake inodes but not for extent inodes which lack the vfs struct + * inode. + */ +typedef struct { + ntfs_inode ntfs_inode; + struct inode vfs_inode; /* The vfs inode structure. */ +} big_ntfs_inode; + +/** + * NTFS_I - return the ntfs inode given a vfs inode + * @inode: VFS inode + * + * NTFS_I() returns the ntfs inode associated with the VFS @inode. + */ +static inline ntfs_inode *NTFS_I(struct inode *inode) +{ + return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); +} + +static inline struct inode *VFS_I(ntfs_inode *ni) +{ + return &((big_ntfs_inode *)ni)->vfs_inode; +} + +/** + * ntfs_attr - ntfs in memory attribute structure + * @mft_no: mft record number of the base mft record of this attribute + * @name: Unicode name of the attribute (NULL if unnamed) + * @name_len: length of @name in Unicode characters (0 if unnamed) + * @type: attribute type (see layout.h) + * + * This structure exists only to provide a small structure for the + * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism. + * + * NOTE: Elements are ordered by size to make the structure as compact as + * possible on all architectures. + */ +typedef struct { + unsigned long mft_no; + ntfschar *name; + u32 name_len; + ATTR_TYPE type; +} ntfs_attr; + +extern int ntfs_test_inode(struct inode *vi, void *data); + +extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no); +extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type, + ntfschar *name, u32 name_len); +extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name, + u32 name_len); + +extern struct inode *ntfs_alloc_big_inode(struct super_block *sb); +extern void ntfs_free_big_inode(struct inode *inode); +extern void ntfs_evict_big_inode(struct inode *vi); + +extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni); + +static inline void ntfs_init_big_inode(struct inode *vi) +{ + ntfs_inode *ni = NTFS_I(vi); + + ntfs_debug("Entering."); + __ntfs_init_inode(vi->i_sb, ni); + ni->mft_no = vi->i_ino; +} + +extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb, + unsigned long mft_no); +extern void ntfs_clear_extent_inode(ntfs_inode *ni); + +extern int ntfs_read_inode_mount(struct inode *vi); + +extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); + +#ifdef NTFS_RW + +extern int ntfs_truncate(struct inode *vi); +extern void ntfs_truncate_vfs(struct inode *vi); + +extern int ntfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr); + +extern int __ntfs_write_inode(struct inode *vi, int sync); + +static inline void ntfs_commit_inode(struct inode *vi) +{ + if (!is_bad_inode(vi)) + __ntfs_write_inode(vi, 1); + return; +} + +#else + +static inline void ntfs_truncate_vfs(struct inode *vi) {} + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_INODE_H */ diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h new file mode 100644 index 000000000..5d4bf7a32 --- /dev/null +++ b/fs/ntfs/layout.h @@ -0,0 +1,2421 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_LAYOUT_H +#define _LINUX_NTFS_LAYOUT_H + +#include <linux/types.h> +#include <linux/bitops.h> +#include <linux/list.h> +#include <asm/byteorder.h> + +#include "types.h" + +/* The NTFS oem_id "NTFS " */ +#define magicNTFS cpu_to_le64(0x202020205346544eULL) + +/* + * Location of bootsector on partition: + * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition. + * On NT4 and above there is one backup copy of the boot sector to + * be found on the last sector of the partition (not normally accessible + * from within Windows as the bootsector contained number of sectors + * value is one less than the actual value!). + * On versions of NT 3.51 and earlier, the backup copy was located at + * number of sectors/2 (integer divide), i.e. in the middle of the volume. + */ + +/* + * BIOS parameter block (bpb) structure. + */ +typedef struct { + le16 bytes_per_sector; /* Size of a sector in bytes. */ + u8 sectors_per_cluster; /* Size of a cluster in sectors. */ + le16 reserved_sectors; /* zero */ + u8 fats; /* zero */ + le16 root_entries; /* zero */ + le16 sectors; /* zero */ + u8 media_type; /* 0xf8 = hard disk */ + le16 sectors_per_fat; /* zero */ + le16 sectors_per_track; /* irrelevant */ + le16 heads; /* irrelevant */ + le32 hidden_sectors; /* zero */ + le32 large_sectors; /* zero */ +} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK; + +/* + * NTFS boot sector structure. + */ +typedef struct { + u8 jump[3]; /* Irrelevant (jump to boot up code).*/ + le64 oem_id; /* Magic "NTFS ". */ + BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */ + u8 unused[4]; /* zero, NTFS diskedit.exe states that + this is actually: + __u8 physical_drive; // 0x80 + __u8 current_head; // zero + __u8 extended_boot_signature; + // 0x80 + __u8 unused; // zero + */ +/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives + maximum volume size of 2^63 sectors. + Assuming standard sector size of 512 + bytes, the maximum byte size is + approx. 4.7x10^21 bytes. (-; */ + sle64 mft_lcn; /* Cluster location of mft data. */ + sle64 mftmirr_lcn; /* Cluster location of copy of mft. */ + s8 clusters_per_mft_record; /* Mft record size in clusters. */ + u8 reserved0[3]; /* zero */ + s8 clusters_per_index_record; /* Index block size in clusters. */ + u8 reserved1[3]; /* zero */ + le64 volume_serial_number; /* Irrelevant (serial number). */ + le32 checksum; /* Boot sector checksum. */ +/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */ + le16 end_of_sector_marker; /* End of bootsector magic. Always is + 0xaa55 in little endian. */ +/* sizeof() = 512 (0x200) bytes */ +} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR; + +/* + * Magic identifiers present at the beginning of all ntfs record containing + * records (like mft records for example). + */ +enum { + /* Found in $MFT/$DATA. */ + magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ + magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ + magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ + + /* Found in $LogFile/$DATA. */ + magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ + magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ + + /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ + magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ + + /* Found in all ntfs record containing records. */ + magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector + transfer was detected. */ + /* + * Found in $LogFile/$DATA when a page is full of 0xff bytes and is + * thus not initialized. Page must be initialized before using it. + */ + magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ +}; + +typedef le32 NTFS_RECORD_TYPE; + +/* + * Generic magic comparison macros. Finally found a use for the ## preprocessor + * operator! (-8 + */ + +static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r) +{ + return (x == r); +} +#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m) + +static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) +{ + return (*p == r); +} +#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m) + +/* + * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above. + */ +#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) ) +#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) ) +#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) ) +#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) ) +#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) ) +#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) ) +#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) ) +#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) ) + +#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) ) +#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) ) +#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) ) +#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) ) + +#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) ) +#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) ) + +#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) ) +#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) ) + +#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) ) +#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) ) + +/* + * The Update Sequence Array (usa) is an array of the le16 values which belong + * to the end of each sector protected by the update sequence record in which + * this array is contained. Note that the first entry is the Update Sequence + * Number (usn), a cyclic counter of how many times the protected record has + * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All + * last le16's of each sector have to be equal to the usn (during reading) or + * are set to it (during writing). If they are not, an incomplete multi sector + * transfer has occurred when the data was written. + * The maximum size for the update sequence array is fixed to: + * maximum size = usa_ofs + (usa_count * 2) = 510 bytes + * The 510 bytes comes from the fact that the last le16 in the array has to + * (obviously) finish before the last le16 of the first 512-byte sector. + * This formula can be used as a consistency check in that usa_ofs + + * (usa_count * 2) has to be less than or equal to 510. + */ +typedef struct { + NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record + type and/or status. */ + le16 usa_ofs; /* Offset to the Update Sequence Array (usa) + from the start of the ntfs record. */ + le16 usa_count; /* Number of le16 sized entries in the usa + including the Update Sequence Number (usn), + thus the number of fixups is the usa_count + minus 1. */ +} __attribute__ ((__packed__)) NTFS_RECORD; + +/* + * System files mft record numbers. All these files are always marked as used + * in the bitmap attribute of the mft; presumably in order to avoid accidental + * allocation for random other mft records. Also, the sequence number for each + * of the system files is always equal to their mft record number and it is + * never modified. + */ +typedef enum { + FILE_MFT = 0, /* Master file table (mft). Data attribute + contains the entries and bitmap attribute + records which ones are in use (bit==1). */ + FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records + in data attribute. If cluster size > 4kiB, + copy of first N mft records, with + N = cluster_size / mft_record_size. */ + FILE_LogFile = 2, /* Journalling log in data attribute. */ + FILE_Volume = 3, /* Volume name attribute and volume information + attribute (flags and ntfs version). Windows + refers to this file as volume DASD (Direct + Access Storage Device). */ + FILE_AttrDef = 4, /* Array of attribute definitions in data + attribute. */ + FILE_root = 5, /* Root directory. */ + FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in + data attribute. */ + FILE_Boot = 7, /* Boot sector (always at cluster 0) in data + attribute. */ + FILE_BadClus = 8, /* Contains all bad clusters in the non-resident + data attribute. */ + FILE_Secure = 9, /* Shared security descriptors in data attribute + and two indexes into the descriptors. + Appeared in Windows 2000. Before that, this + file was named $Quota but was unused. */ + FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode + characters in data attribute. */ + FILE_Extend = 11, /* Directory containing other system files (eg. + $ObjId, $Quota, $Reparse and $UsnJrnl). This + is new to NTFS3.0. */ + FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */ + FILE_reserved13 = 13, + FILE_reserved14 = 14, + FILE_reserved15 = 15, + FILE_first_user = 16, /* First user file, used as test limit for + whether to allow opening a file or not. */ +} NTFS_SYSTEM_FILES; + +/* + * These are the so far known MFT_RECORD_* flags (16-bit) which contain + * information about the mft record in which they are present. + */ +enum { + MFT_RECORD_IN_USE = cpu_to_le16(0x0001), + MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), +} __attribute__ ((__packed__)); + +typedef le16 MFT_RECORD_FLAGS; + +/* + * mft references (aka file references or file record segment references) are + * used whenever a structure needs to refer to a record in the mft. + * + * A reference consists of a 48-bit index into the mft and a 16-bit sequence + * number used to detect stale references. + * + * For error reporting purposes we treat the 48-bit index as a signed quantity. + * + * The sequence number is a circular counter (skipping 0) describing how many + * times the referenced mft record has been (re)used. This has to match the + * sequence number of the mft record being referenced, otherwise the reference + * is considered stale and removed (FIXME: only ntfsck or the driver itself?). + * + * If the sequence number is zero it is assumed that no sequence number + * consistency checking should be performed. + * + * FIXME: Since inodes are 32-bit as of now, the driver needs to always check + * for high_part being 0 and if not either BUG(), cause a panic() or handle + * the situation in some other way. This shouldn't be a problem as a volume has + * to become HUGE in order to need more than 32-bits worth of mft records. + * Assuming the standard mft record size of 1kb only the records (never mind + * the non-resident attributes, etc.) would require 4Tb of space on their own + * for the first 32 bits worth of records. This is only if some strange person + * doesn't decide to foul play and make the mft sparse which would be a really + * horrible thing to do as it would trash our current driver implementation. )-: + * Do I hear screams "we want 64-bit inodes!" ?!? (-; + * + * FIXME: The mft zone is defined as the first 12% of the volume. This space is + * reserved so that the mft can grow contiguously and hence doesn't become + * fragmented. Volume free space includes the empty part of the mft zone and + * when the volume's free 88% are used up, the mft zone is shrunk by a factor + * of 2, thus making more space available for more files/data. This process is + * repeated every time there is no more free space except for the mft zone until + * there really is no more free space. + */ + +/* + * Typedef the MFT_REF as a 64-bit value for easier handling. + * Also define two unpacking macros to get to the reference (MREF) and + * sequence number (MSEQNO) respectively. + * The _LE versions are to be applied on little endian MFT_REFs. + * Note: The _LE versions will return a CPU endian formatted value! + */ +#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL +#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) + +typedef u64 MFT_REF; +typedef le64 leMFT_REF; + +#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \ + ((MFT_REF)(m) & MFT_REF_MASK_CPU))) +#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s)) + +#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU)) +#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff)) +#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU)) +#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff)) + +#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false) +#define ERR_MREF(x) ((u64)((s64)(x))) +#define MREF_ERR(x) ((int)((s64)(x))) + +/* + * The mft record header present at the beginning of every record in the mft. + * This is followed by a sequence of variable length attribute records which + * is terminated by an attribute of type AT_END which is a truncated attribute + * in that it only consists of the attribute type code AT_END and none of the + * other members of the attribute structure are present. + */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ + le16 usa_ofs; /* See NTFS_RECORD definition above. */ + le16 usa_count; /* See NTFS_RECORD definition above. */ + +/* 8*/ le64 lsn; /* $LogFile sequence number for this record. + Changed every time the record is modified. */ +/* 16*/ le16 sequence_number; /* Number of times this mft record has been + reused. (See description for MFT_REF + above.) NOTE: The increment (skipping zero) + is done when the file is deleted. NOTE: If + this is zero it is left zero. */ +/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of + directory entries referencing this record. + NOTE: Only used in mft base records. + NOTE: When deleting a directory entry we + check the link_count and if it is 1 we + delete the file. Otherwise we delete the + FILE_NAME_ATTR being referenced by the + directory entry from the mft record and + decrement the link_count. + FIXME: Careful with Win32 + DOS names! */ +/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this + mft record from the start of the mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file + is deleted, the MFT_RECORD_IN_USE flag is + set to zero. */ +/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft + record. This should be equal to the mft + record size. */ +/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. + When it is not zero it is a mft reference + pointing to the base mft record to which + this record belongs (this is then used to + locate the attribute list attribute present + in the base record which describes this + extension record and hence might need + modification when the extension record + itself is modified, also locating the + attribute list also means finding the other + potential extents, belonging to the non-base + mft record). */ +/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to + the next attribute added to this mft record. + NOTE: Incremented each time after it is used. + NOTE: Every time the mft record is reused + this number is set to zero. NOTE: The first + instance number is always 0. */ +/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */ +/* 42*/ le16 reserved; /* Reserved/alignment. */ +/* 44*/ le32 mft_record_number; /* Number of this mft record. */ +/* sizeof() = 48 bytes */ +/* + * When (re)using the mft record, we place the update sequence array at this + * offset, i.e. before we start with the attributes. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading we obviously use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) MFT_RECORD; + +/* This is the version without the NTFS 3.1+ specific fields. */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ + le16 usa_ofs; /* See NTFS_RECORD definition above. */ + le16 usa_count; /* See NTFS_RECORD definition above. */ + +/* 8*/ le64 lsn; /* $LogFile sequence number for this record. + Changed every time the record is modified. */ +/* 16*/ le16 sequence_number; /* Number of times this mft record has been + reused. (See description for MFT_REF + above.) NOTE: The increment (skipping zero) + is done when the file is deleted. NOTE: If + this is zero it is left zero. */ +/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of + directory entries referencing this record. + NOTE: Only used in mft base records. + NOTE: When deleting a directory entry we + check the link_count and if it is 1 we + delete the file. Otherwise we delete the + FILE_NAME_ATTR being referenced by the + directory entry from the mft record and + decrement the link_count. + FIXME: Careful with Win32 + DOS names! */ +/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this + mft record from the start of the mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file + is deleted, the MFT_RECORD_IN_USE flag is + set to zero. */ +/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. + NOTE: Must be aligned to 8-byte boundary. */ +/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft + record. This should be equal to the mft + record size. */ +/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. + When it is not zero it is a mft reference + pointing to the base mft record to which + this record belongs (this is then used to + locate the attribute list attribute present + in the base record which describes this + extension record and hence might need + modification when the extension record + itself is modified, also locating the + attribute list also means finding the other + potential extents, belonging to the non-base + mft record). */ +/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to + the next attribute added to this mft record. + NOTE: Incremented each time after it is used. + NOTE: Every time the mft record is reused + this number is set to zero. NOTE: The first + instance number is always 0. */ +/* sizeof() = 42 bytes */ +/* + * When (re)using the mft record, we place the update sequence array at this + * offset, i.e. before we start with the attributes. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading we obviously use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) MFT_RECORD_OLD; + +/* + * System defined attributes (32-bit). Each attribute type has a corresponding + * attribute name (Unicode string of maximum 64 character length) as described + * by the attribute definitions present in the data attribute of the $AttrDef + * system file. On NTFS 3.0 volumes the names are just as the types are named + * in the below defines exchanging AT_ for the dollar sign ($). If that is not + * a revealing choice of symbol I do not know what is... (-; + */ +enum { + AT_UNUSED = cpu_to_le32( 0), + AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), + AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), + AT_FILE_NAME = cpu_to_le32( 0x30), + AT_OBJECT_ID = cpu_to_le32( 0x40), + AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), + AT_VOLUME_NAME = cpu_to_le32( 0x60), + AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), + AT_DATA = cpu_to_le32( 0x80), + AT_INDEX_ROOT = cpu_to_le32( 0x90), + AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), + AT_BITMAP = cpu_to_le32( 0xb0), + AT_REPARSE_POINT = cpu_to_le32( 0xc0), + AT_EA_INFORMATION = cpu_to_le32( 0xd0), + AT_EA = cpu_to_le32( 0xe0), + AT_PROPERTY_SET = cpu_to_le32( 0xf0), + AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), + AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), + AT_END = cpu_to_le32(0xffffffff) +}; + +typedef le32 ATTR_TYPE; + +/* + * The collation rules for sorting views/indexes/etc (32-bit). + * + * COLLATION_BINARY - Collate by binary compare where the first byte is most + * significant. + * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary + * Unicode values, except that when a character can be uppercased, the + * upper case value collates before the lower case one. + * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation + * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea + * what the difference is. Perhaps the difference is that file names + * would treat some special characters in an odd way (see + * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[] + * for what I mean but COLLATION_UNICODE_STRING would not give any special + * treatment to any characters at all, but this is speculation. + * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key + * values. E.g. used for $SII index in FILE_Secure, which sorts by + * security_id (le32). + * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values. + * E.g. used for $O index in FILE_Extend/$Quota. + * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash + * values and second by ascending security_id values. E.g. used for $SDH + * index in FILE_Secure. + * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending + * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which + * sorts by object_id (16-byte), by splitting up the object_id in four + * le32 values and using them as individual keys. E.g. take the following + * two security_ids, stored as follows on disk: + * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59 + * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45 + * To compare them, they are split into four le32 values each, like so: + * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081 + * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179 + * Now, it is apparent why the 2nd object_id collates after the 1st: the + * first le32 value of the 1st object_id is less than the first le32 of + * the 2nd object_id. If the first le32 values of both object_ids were + * equal then the second le32 values would be compared, etc. + */ +enum { + COLLATION_BINARY = cpu_to_le32(0x00), + COLLATION_FILE_NAME = cpu_to_le32(0x01), + COLLATION_UNICODE_STRING = cpu_to_le32(0x02), + COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), + COLLATION_NTOFS_SID = cpu_to_le32(0x11), + COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), + COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), +}; + +typedef le32 COLLATION_RULE; + +/* + * The flags (32-bit) describing attribute properties in the attribute + * definition structure. FIXME: This information is based on Regis's + * information and, according to him, it is not certain and probably + * incomplete. The INDEXABLE flag is fairly certainly correct as only the file + * name attribute has this flag set and this is the only attribute indexed in + * NT4. + */ +enum { + ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be + indexed. */ + ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type + can be present multiple times in the + mft records of an inode. */ + ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value + must contain at least one non-zero + byte. */ + ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be + indexed and the attribute value must be + unique for the attribute type in all of + the mft records of an inode. */ + ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be + named and the name must be unique for + the attribute type in all of the mft + records of an inode. */ + ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be + resident. */ + ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log + modifications to this attribute, + regardless of whether it is resident or + non-resident. Without this, only log + modifications if the attribute is + resident. */ +}; + +typedef le32 ATTR_DEF_FLAGS; + +/* + * The data attribute of FILE_AttrDef contains a sequence of attribute + * definitions for the NTFS volume. With this, it is supposed to be safe for an + * older NTFS driver to mount a volume containing a newer NTFS version without + * damaging it (that's the theory. In practice it's: not damaging it too much). + * Entries are sorted by attribute type. The flags describe whether the + * attribute can be resident/non-resident and possibly other things, but the + * actual bits are unknown. + */ +typedef struct { +/*hex ofs*/ +/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero + terminated. */ +/* 80*/ ATTR_TYPE type; /* Type of the attribute. */ +/* 84*/ le32 display_rule; /* Default display rule. + FIXME: What does it mean? (AIA) */ +/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */ +/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */ +/* 90*/ sle64 min_size; /* Optional minimum attribute size. */ +/* 98*/ sle64 max_size; /* Maximum size of attribute. */ +/* sizeof() = 0xa0 or 160 bytes */ +} __attribute__ ((__packed__)) ATTR_DEF; + +/* + * Attribute flags (16-bit). + */ +enum { + ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), + ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method + mask. Also, first + illegal value. */ + ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), + ATTR_IS_SPARSE = cpu_to_le16(0x8000), +} __attribute__ ((__packed__)); + +typedef le16 ATTR_FLAGS; + +/* + * Attribute compression. + * + * Only the data attribute is ever compressed in the current ntfs driver in + * Windows. Further, compression is only applied when the data attribute is + * non-resident. Finally, to use compression, the maximum allowed cluster size + * on a volume is 4kib. + * + * The compression method is based on independently compressing blocks of X + * clusters, where X is determined from the compression_unit value found in the + * non-resident attribute record header (more precisely: X = 2^compression_unit + * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4). + * + * There are three different cases of how a compression block of X clusters + * can be stored: + * + * 1) The data in the block is all zero (a sparse block): + * This is stored as a sparse block in the runlist, i.e. the runlist + * entry has length = X and lcn = -1. The mapping pairs array actually + * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at + * all, which is then interpreted by the driver as lcn = -1. + * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then + * the same principles apply as above, except that the length is not + * restricted to being any particular value. + * + * 2) The data in the block is not compressed: + * This happens when compression doesn't reduce the size of the block + * in clusters. I.e. if compression has a small effect so that the + * compressed data still occupies X clusters, then the uncompressed data + * is stored in the block. + * This case is recognised by the fact that the runlist entry has + * length = X and lcn >= 0. The mapping pairs array stores this as + * normal with a run length of X and some specific delta_lcn, i.e. + * delta_lcn has to be present. + * + * 3) The data in the block is compressed: + * The common case. This case is recognised by the fact that the run + * list entry has length L < X and lcn >= 0. The mapping pairs array + * stores this as normal with a run length of X and some specific + * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is + * immediately followed by a sparse entry with length = X - L and + * lcn = -1. The latter entry is to make up the vcn counting to the + * full compression block size X. + * + * In fact, life is more complicated because adjacent entries of the same type + * can be coalesced. This means that one has to keep track of the number of + * clusters handled and work on a basis of X clusters at a time being one + * block. An example: if length L > X this means that this particular runlist + * entry contains a block of length X and part of one or more blocks of length + * L - X. Another example: if length L < X, this does not necessarily mean that + * the block is compressed as it might be that the lcn changes inside the block + * and hence the following runlist entry describes the continuation of the + * potentially compressed block. The block would be compressed if the + * following runlist entry describes at least X - L sparse clusters, thus + * making up the compression block length as described in point 3 above. (Of + * course, there can be several runlist entries with small lengths so that the + * sparse entry does not follow the first data containing entry with + * length < X.) + * + * NOTE: At the end of the compressed attribute value, there most likely is not + * just the right amount of data to make up a compression block, thus this data + * is not even attempted to be compressed. It is just stored as is, unless + * the number of clusters it occupies is reduced when compressed in which case + * it is stored as a compressed compression block, complete with sparse + * clusters at the end. + */ + +/* + * Flags of resident attributes (8-bit). + */ +enum { + RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index + (has implications for deleting and + modifying the attribute). */ +} __attribute__ ((__packed__)); + +typedef u8 RESIDENT_ATTR_FLAGS; + +/* + * Attribute record header. Always aligned to 8-byte boundary. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */ +/* 4*/ le32 length; /* Byte size of the resident part of the + attribute (aligned to 8-byte boundary). + Used to get to the next attribute. */ +/* 8*/ u8 non_resident; /* If 0, attribute is resident. + If 1, attribute is non-resident. */ +/* 9*/ u8 name_length; /* Unicode character size of name of attribute. + 0 if unnamed. */ +/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the + beginning of the name from the attribute + record. Note that the name is stored as a + Unicode string. When creating, place offset + just at the end of the record header. Then, + follow with attribute value or mapping pairs + array, resident and non-resident attributes + respectively, aligning to an 8-byte + boundary. */ +/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */ +/* 14*/ le16 instance; /* The instance of this attribute record. This + number is unique within this mft record (see + MFT_RECORD/next_attribute_instance notes in + mft.h for more details). */ +/* 16*/ union { + /* Resident attributes. */ + struct { +/* 16 */ le32 value_length;/* Byte size of attribute value. */ +/* 20 */ le16 value_offset;/* Byte offset of the attribute + value from the start of the + attribute record. When creating, + align to 8-byte boundary if we + have a name present as this might + not have a length of a multiple + of 8-bytes. */ +/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */ +/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte + boundary. */ + } __attribute__ ((__packed__)) resident; + /* Non-resident attributes. */ + struct { +/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number + for this portion of the attribute value or + 0 if this is the only extent (usually the + case). - Only when an attribute list is used + does lowest_vcn != 0 ever occur. */ +/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of + the attribute value. - Usually there is only one + portion, so this usually equals the attribute + value size in clusters minus 1. Can be -1 for + zero length files. Can be 0 for "single extent" + attributes. */ +/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the + beginning of the structure to the mapping pairs + array which contains the mappings between the + vcns and the logical cluster numbers (lcns). + When creating, place this at the end of this + record header aligned to 8-byte boundary. */ +/* 34*/ u8 compression_unit; /* The compression unit expressed + as the log to the base 2 of the number of + clusters in a compression unit. 0 means not + compressed. (This effectively limits the + compression unit size to be a power of two + clusters.) WinNT4 only uses a value of 4. + Sparse files have this set to 0 on XPSP2. */ +/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ +/* The sizes below are only used when lowest_vcn is zero, as otherwise it would + be difficult to keep them up-to-date.*/ +/* 40*/ sle64 allocated_size; /* Byte size of disk space + allocated to hold the attribute value. Always + is a multiple of the cluster size. When a file + is compressed, this field is a multiple of the + compression block size (2^compression_unit) and + it represents the logically allocated space + rather than the actual on disk usage. For this + use the compressed_size (see below). */ +/* 48*/ sle64 data_size; /* Byte size of the attribute + value. Can be larger than allocated_size if + attribute value is compressed or sparse. */ +/* 56*/ sle64 initialized_size; /* Byte size of initialized + portion of the attribute value. Usually equals + data_size. */ +/* sizeof(uncompressed attr) = 64*/ +/* 64*/ sle64 compressed_size; /* Byte size of the attribute + value after compression. Only present when + compressed or sparse. Always is a multiple of + the cluster size. Represents the actual amount + of disk space being used on the disk. */ +/* sizeof(compressed attr) = 72*/ + } __attribute__ ((__packed__)) non_resident; + } __attribute__ ((__packed__)) data; +} __attribute__ ((__packed__)) ATTR_RECORD; + +typedef ATTR_RECORD ATTR_REC; + +/* + * File attribute flags (32-bit) appearing in the file_attributes fields of the + * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR + * attributes of MFT_RECORDs and directory index entries. + * + * All of the below flags appear in the directory index entries but only some + * appear in the STANDARD_INFORMATION attribute whilst only some others appear + * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the + * flags appear in all of the above. + */ +enum { + FILE_ATTR_READONLY = cpu_to_le32(0x00000001), + FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), + FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), + /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ + + FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), + /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is + reserved for the DOS SUBDIRECTORY flag. */ + FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), + FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), + FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), + + FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), + FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), + FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), + FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), + + FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), + FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), + FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), + + FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), + /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the + FILE_ATTR_DEVICE and preserves everything else. This mask is used + to obtain all flags that are valid for reading. */ + FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), + /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the + F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, + F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask + is used to obtain all flags that are valid for setting. */ + /* + * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all + * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION + * attribute of an mft record. + */ + FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), + /* Note, this is a copy of the corresponding bit from the mft record, + telling us whether this is a directory or not, i.e. whether it has + an index root attribute or not. */ + FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), + /* Note, this is a copy of the corresponding bit from the mft record, + telling us whether this file has a view index present (eg. object id + index, quota index, one of the security indexes or the encrypting + filesystem related indexes). */ +}; + +typedef le32 FILE_ATTR_FLAGS; + +/* + * NOTE on times in NTFS: All times are in MS standard time format, i.e. they + * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00 + * universal coordinated time (UTC). (In Linux time starts 1st January 1970, + * 00:00:00 UTC and is stored as the number of 1-second intervals since then.) + */ + +/* + * Attribute: Standard information (0x10). + * + * NOTE: Always resident. + * NOTE: Present in all base file records on a volume. + * NOTE: There is conflicting information about the meaning of each of the time + * fields but the meaning as defined below has been verified to be + * correct by practical experimentation on Windows NT4 SP6a and is hence + * assumed to be the one and only correct interpretation. + */ +typedef struct { +/*Ofs*/ +/* 0*/ sle64 creation_time; /* Time file was created. Updated when + a filename is changed(?). */ +/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last + modified. */ +/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last + modified. */ +/* 24*/ sle64 last_access_time; /* Approximate time when the file was + last accessed (obviously this is not + updated on read-only volumes). In + Windows this is only updated when + accessed if some time delta has + passed since the last update. Also, + last access time updates can be + disabled altogether for speed. */ +/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ +/* 36*/ union { + /* NTFS 1.2 */ + struct { + /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte + boundary. */ + } __attribute__ ((__packed__)) v1; + /* sizeof() = 48 bytes */ + /* NTFS 3.x */ + struct { +/* + * If a volume has been upgraded from a previous NTFS version, then these + * fields are present only if the file has been accessed since the upgrade. + * Recognize the difference by comparing the length of the resident attribute + * value. If it is 48, then the following fields are missing. If it is 72 then + * the fields are present. Maybe just check like this: + * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) { + * Assume NTFS 1.2- format. + * If (volume version is 3.x) + * Upgrade attribute to NTFS 3.x format. + * else + * Use NTFS 1.2- format for access. + * } else + * Use NTFS 3.x format for access. + * Only problem is that it might be legal to set the length of the value to + * arbitrarily large values thus spoiling this check. - But chkdsk probably + * views that as a corruption, assuming that it behaves like this for all + * attributes. + */ + /* 36*/ le32 maximum_versions; /* Maximum allowed versions for + file. Zero if version numbering is disabled. */ + /* 40*/ le32 version_number; /* This file's version (if any). + Set to zero if maximum_versions is zero. */ + /* 44*/ le32 class_id; /* Class id from bidirectional + class id index (?). */ + /* 48*/ le32 owner_id; /* Owner_id of the user owning + the file. Translate via $Q index in FILE_Extend + /$Quota to the quota control entry for the user + owning the file. Zero if quotas are disabled. */ + /* 52*/ le32 security_id; /* Security_id for the file. + Translate via $SII index and $SDS data stream + in FILE_Secure to the security descriptor. */ + /* 56*/ le64 quota_charged; /* Byte size of the charge to + the quota for all streams of the file. Note: Is + zero if quotas are disabled. */ + /* 64*/ leUSN usn; /* Last update sequence number + of the file. This is a direct index into the + transaction log file ($UsnJrnl). It is zero if + the usn journal is disabled or this file has + not been subject to logging yet. See usnjrnl.h + for details. */ + } __attribute__ ((__packed__)) v3; + /* sizeof() = 72 bytes (NTFS 3.x) */ + } __attribute__ ((__packed__)) ver; +} __attribute__ ((__packed__)) STANDARD_INFORMATION; + +/* + * Attribute: Attribute list (0x20). + * + * - Can be either resident or non-resident. + * - Value consists of a sequence of variable length, 8-byte aligned, + * ATTR_LIST_ENTRY records. + * - The list is not terminated by anything at all! The only way to know when + * the end is reached is to keep track of the current offset and compare it to + * the attribute value size. + * - The attribute list attribute contains one entry for each attribute of + * the file in which the list is located, except for the list attribute + * itself. The list is sorted: first by attribute type, second by attribute + * name (if present), third by instance number. The extents of one + * non-resident attribute (if present) immediately follow after the initial + * extent. They are ordered by lowest_vcn and have their instace set to zero. + * It is not allowed to have two attributes with all sorting keys equal. + * - Further restrictions: + * - If not resident, the vcn to lcn mapping array has to fit inside the + * base mft record. + * - The attribute list attribute value has a maximum size of 256kb. This + * is imposed by the Windows cache manager. + * - Attribute lists are only used when the attributes of mft record do not + * fit inside the mft record despite all attributes (that can be made + * non-resident) having been made non-resident. This can happen e.g. when: + * - File has a large number of hard links (lots of file name + * attributes present). + * - The mapping pairs array of some non-resident attribute becomes so + * large due to fragmentation that it overflows the mft record. + * - The security descriptor is very complex (not applicable to + * NTFS 3.0 volumes). + * - There are many named streams. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */ +/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */ +/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the + attribute or 0 if unnamed. */ +/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name + (always set this to where the name would + start even if unnamed). */ +/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion + of the attribute value. This is usually 0. It + is non-zero for the case where one attribute + does not fit into one mft record and thus + several mft records are allocated to hold + this attribute. In the latter case, each mft + record holds one extent of the attribute and + there is one attribute list entry for each + extent. NOTE: This is DEFINITELY a signed + value! The windows driver uses cmp, followed + by jg when comparing this, thus it treats it + as signed. */ +/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding + the ATTR_RECORD for this portion of the + attribute value. */ +/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the + attribute being referenced; otherwise 0. */ +/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use + name_offset to determine the location of the + name. */ +/* sizeof() = 26 + (attribute_name_length * 2) bytes */ +} __attribute__ ((__packed__)) ATTR_LIST_ENTRY; + +/* + * The maximum allowed length for a file name. + */ +#define MAXIMUM_FILE_NAME_LENGTH 255 + +/* + * Possible namespaces for filenames in ntfs (8-bit). + */ +enum { + FILE_NAME_POSIX = 0x00, + /* This is the largest namespace. It is case sensitive and allows all + Unicode characters except for: '\0' and '/'. Beware that in + WinNT/2k/2003 by default files which eg have the same name except + for their case will not be distinguished by the standard utilities + and thus a "del filename" will delete both "filename" and "fileName" + without warning. However if for example Services For Unix (SFU) are + installed and the case sensitive option was enabled at installation + time, then you can create/access/delete such files. + Note that even SFU places restrictions on the filenames beyond the + '\0' and '/' and in particular the following set of characters is + not allowed: '"', '/', '<', '>', '\'. All other characters, + including the ones no allowed in WIN32 namespace are allowed. + Tested with SFU 3.5 (this is now free) running on Windows XP. */ + FILE_NAME_WIN32 = 0x01, + /* The standard WinNT/2k NTFS long filenames. Case insensitive. All + Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', + and '|'. Further, names cannot end with a '.' or a space. */ + FILE_NAME_DOS = 0x02, + /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit + characters greater space, except: '"', '*', '+', ',', '/', ':', ';', + '<', '=', '>', '?', and '\'. */ + FILE_NAME_WIN32_AND_DOS = 0x03, + /* 3 means that both the Win32 and the DOS filenames are identical and + hence have been saved in this single filename record. */ +} __attribute__ ((__packed__)); + +typedef u8 FILE_NAME_TYPE_FLAGS; + +/* + * Attribute: Filename (0x30). + * + * NOTE: Always resident. + * NOTE: All fields, except the parent_directory, are only updated when the + * filename is changed. Until then, they just become out of sync with + * reality and the more up to date values are present in the standard + * information attribute. + * NOTE: There is conflicting information about the meaning of each of the time + * fields but the meaning as defined below has been verified to be + * correct by practical experimentation on Windows NT4 SP6a and is hence + * assumed to be the one and only correct interpretation. + */ +typedef struct { +/*hex ofs*/ +/* 0*/ leMFT_REF parent_directory; /* Directory this filename is + referenced from. */ +/* 8*/ sle64 creation_time; /* Time file was created. */ +/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last + modified. */ +/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last + modified. */ +/* 20*/ sle64 last_access_time; /* Time this mft record was last + accessed. */ +/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space + for the unnamed data attribute. So + for normal $DATA, this is the + allocated_size from the unnamed + $DATA attribute and for compressed + and/or sparse $DATA, this is the + compressed_size from the unnamed + $DATA attribute. For a directory or + other inode without an unnamed $DATA + attribute, this is always 0. NOTE: + This is a multiple of the cluster + size. */ +/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed + data attribute. For a directory or + other inode without an unnamed $DATA + attribute, this is always 0. */ +/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ +/* 3c*/ union { + /* 3c*/ struct { + /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to + pack the extended attributes + (EAs), if such are present.*/ + /* 3e*/ le16 reserved; /* Reserved for alignment. */ + } __attribute__ ((__packed__)) ea; + /* 3c*/ struct { + /* 3c*/ le32 reparse_point_tag; /* Type of reparse point, + present only in reparse + points and only if there are + no EAs. */ + } __attribute__ ((__packed__)) rp; + } __attribute__ ((__packed__)) type; +/* 40*/ u8 file_name_length; /* Length of file name in + (Unicode) characters. */ +/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/ +/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */ +} __attribute__ ((__packed__)) FILE_NAME_ATTR; + +/* + * GUID structures store globally unique identifiers (GUID). A GUID is a + * 128-bit value consisting of one group of eight hexadecimal digits, followed + * by three groups of four hexadecimal digits each, followed by one group of + * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the + * distributed computing environment (DCE) universally unique identifier (UUID). + * Example of a GUID: + * 1F010768-5A73-BC91-0010A52216A7 + */ +typedef struct { + le32 data1; /* The first eight hexadecimal digits of the GUID. */ + le16 data2; /* The first group of four hexadecimal digits. */ + le16 data3; /* The second group of four hexadecimal digits. */ + u8 data4[8]; /* The first two bytes are the third group of four + hexadecimal digits. The remaining six bytes are the + final 12 hexadecimal digits. */ +} __attribute__ ((__packed__)) GUID; + +/* + * FILE_Extend/$ObjId contains an index named $O. This index contains all + * object_ids present on the volume as the index keys and the corresponding + * mft_record numbers as the index entry data parts. The data part (defined + * below) also contains three other object_ids: + * birth_volume_id - object_id of FILE_Volume on which the file was first + * created. Optional (i.e. can be zero). + * birth_object_id - object_id of file when it was first created. Usually + * equals the object_id. Optional (i.e. can be zero). + * domain_id - Reserved (always zero). + */ +typedef struct { + leMFT_REF mft_reference;/* Mft record containing the object_id in + the index entry key. */ + union { + struct { + GUID birth_volume_id; + GUID birth_object_id; + GUID domain_id; + } __attribute__ ((__packed__)) origin; + u8 extended_info[48]; + } __attribute__ ((__packed__)) opt; +} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA; + +/* + * Attribute: Object id (NTFS 3.0+) (0x40). + * + * NOTE: Always resident. + */ +typedef struct { + GUID object_id; /* Unique id assigned to the + file.*/ + /* The following fields are optional. The attribute value size is 16 + bytes, i.e. sizeof(GUID), if these are not present at all. Note, + the entries can be present but one or more (or all) can be zero + meaning that that particular value(s) is(are) not defined. */ + union { + struct { + GUID birth_volume_id; /* Unique id of volume on which + the file was first created.*/ + GUID birth_object_id; /* Unique id of file when it was + first created. */ + GUID domain_id; /* Reserved, zero. */ + } __attribute__ ((__packed__)) origin; + u8 extended_info[48]; + } __attribute__ ((__packed__)) opt; +} __attribute__ ((__packed__)) OBJECT_ID_ATTR; + +/* + * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in + * the SID structure (see below). + */ +//typedef enum { /* SID string prefix. */ +// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */ +// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */ +// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */ +// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */ +// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */ +// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */ +//} IDENTIFIER_AUTHORITIES; + +/* + * These relative identifiers (RIDs) are used with the above identifier + * authorities to make up universal well-known SIDs. + * + * Note: The relative identifier (RID) refers to the portion of a SID, which + * identifies a user or group in relation to the authority that issued the SID. + * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is + * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and + * the relative identifier SECURITY_CREATOR_OWNER_RID (0). + */ +typedef enum { /* Identifier authority. */ + SECURITY_NULL_RID = 0, /* S-1-0 */ + SECURITY_WORLD_RID = 0, /* S-1-1 */ + SECURITY_LOCAL_RID = 0, /* S-1-2 */ + + SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ + SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ + + SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ + SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ + + SECURITY_DIALUP_RID = 1, + SECURITY_NETWORK_RID = 2, + SECURITY_BATCH_RID = 3, + SECURITY_INTERACTIVE_RID = 4, + SECURITY_SERVICE_RID = 6, + SECURITY_ANONYMOUS_LOGON_RID = 7, + SECURITY_PROXY_RID = 8, + SECURITY_ENTERPRISE_CONTROLLERS_RID=9, + SECURITY_SERVER_LOGON_RID = 9, + SECURITY_PRINCIPAL_SELF_RID = 0xa, + SECURITY_AUTHENTICATED_USER_RID = 0xb, + SECURITY_RESTRICTED_CODE_RID = 0xc, + SECURITY_TERMINAL_SERVER_RID = 0xd, + + SECURITY_LOGON_IDS_RID = 5, + SECURITY_LOGON_IDS_RID_COUNT = 3, + + SECURITY_LOCAL_SYSTEM_RID = 0x12, + + SECURITY_NT_NON_UNIQUE = 0x15, + + SECURITY_BUILTIN_DOMAIN_RID = 0x20, + + /* + * Well-known domain relative sub-authority values (RIDs). + */ + + /* Users. */ + DOMAIN_USER_RID_ADMIN = 0x1f4, + DOMAIN_USER_RID_GUEST = 0x1f5, + DOMAIN_USER_RID_KRBTGT = 0x1f6, + + /* Groups. */ + DOMAIN_GROUP_RID_ADMINS = 0x200, + DOMAIN_GROUP_RID_USERS = 0x201, + DOMAIN_GROUP_RID_GUESTS = 0x202, + DOMAIN_GROUP_RID_COMPUTERS = 0x203, + DOMAIN_GROUP_RID_CONTROLLERS = 0x204, + DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, + DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, + DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207, + DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, + + /* Aliases. */ + DOMAIN_ALIAS_RID_ADMINS = 0x220, + DOMAIN_ALIAS_RID_USERS = 0x221, + DOMAIN_ALIAS_RID_GUESTS = 0x222, + DOMAIN_ALIAS_RID_POWER_USERS = 0x223, + + DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, + DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, + DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, + DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, + + DOMAIN_ALIAS_RID_REPLICATOR = 0x228, + DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, + DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, +} RELATIVE_IDENTIFIERS; + +/* + * The universal well-known SIDs: + * + * NULL_SID S-1-0-0 + * WORLD_SID S-1-1-0 + * LOCAL_SID S-1-2-0 + * CREATOR_OWNER_SID S-1-3-0 + * CREATOR_GROUP_SID S-1-3-1 + * CREATOR_OWNER_SERVER_SID S-1-3-2 + * CREATOR_GROUP_SERVER_SID S-1-3-3 + * + * (Non-unique IDs) S-1-4 + * + * NT well-known SIDs: + * + * NT_AUTHORITY_SID S-1-5 + * DIALUP_SID S-1-5-1 + * + * NETWORD_SID S-1-5-2 + * BATCH_SID S-1-5-3 + * INTERACTIVE_SID S-1-5-4 + * SERVICE_SID S-1-5-6 + * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session) + * PROXY_SID S-1-5-8 + * SERVER_LOGON_SID S-1-5-9 (aka domain controller account) + * SELF_SID S-1-5-10 (self RID) + * AUTHENTICATED_USER_SID S-1-5-11 + * RESTRICTED_CODE_SID S-1-5-12 (running restricted code) + * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server) + * + * (Logon IDs) S-1-5-5-X-Y + * + * (NT non-unique IDs) S-1-5-0x15-... + * + * (Built-in domain) S-1-5-0x20 + */ + +/* + * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure. + * + * NOTE: This is stored as a big endian number, hence the high_part comes + * before the low_part. + */ +typedef union { + struct { + u16 high_part; /* High 16-bits. */ + u32 low_part; /* Low 32-bits. */ + } __attribute__ ((__packed__)) parts; + u8 value[6]; /* Value as individual bytes. */ +} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY; + +/* + * The SID structure is a variable-length structure used to uniquely identify + * users or groups. SID stands for security identifier. + * + * The standard textual representation of the SID is of the form: + * S-R-I-S-S... + * Where: + * - The first "S" is the literal character 'S' identifying the following + * digits as a SID. + * - R is the revision level of the SID expressed as a sequence of digits + * either in decimal or hexadecimal (if the later, prefixed by "0x"). + * - I is the 48-bit identifier_authority, expressed as digits as R above. + * - S... is one or more sub_authority values, expressed as digits as above. + * + * Example SID; the domain-relative SID of the local Administrators group on + * Windows NT/2k: + * S-1-5-32-544 + * This translates to a SID with: + * revision = 1, + * sub_authority_count = 2, + * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY + * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID + * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS + */ +typedef struct { + u8 revision; + u8 sub_authority_count; + SID_IDENTIFIER_AUTHORITY identifier_authority; + le32 sub_authority[1]; /* At least one sub_authority. */ +} __attribute__ ((__packed__)) SID; + +/* + * Current constants for SIDs. + */ +typedef enum { + SID_REVISION = 1, /* Current revision level. */ + SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */ + SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in + a future revision. */ +} SID_CONSTANTS; + +/* + * The predefined ACE types (8-bit, see below). + */ +enum { + ACCESS_MIN_MS_ACE_TYPE = 0, + ACCESS_ALLOWED_ACE_TYPE = 0, + ACCESS_DENIED_ACE_TYPE = 1, + SYSTEM_AUDIT_ACE_TYPE = 2, + SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ + ACCESS_MAX_MS_V2_ACE_TYPE = 3, + + ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4, + ACCESS_MAX_MS_V3_ACE_TYPE = 4, + + /* The following are Win2k only. */ + ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, + ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, + ACCESS_DENIED_OBJECT_ACE_TYPE = 6, + SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, + SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, + ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, + + ACCESS_MAX_MS_V4_ACE_TYPE = 8, + + /* This one is for WinNT/2k. */ + ACCESS_MAX_MS_ACE_TYPE = 8, +} __attribute__ ((__packed__)); + +typedef u8 ACE_TYPES; + +/* + * The ACE flags (8-bit) for audit and inheritance (see below). + * + * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE + * types to indicate that a message is generated (in Windows!) for successful + * accesses. + * + * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types + * to indicate that a message is generated (in Windows!) for failed accesses. + */ +enum { + /* The inheritance flags. */ + OBJECT_INHERIT_ACE = 0x01, + CONTAINER_INHERIT_ACE = 0x02, + NO_PROPAGATE_INHERIT_ACE = 0x04, + INHERIT_ONLY_ACE = 0x08, + INHERITED_ACE = 0x10, /* Win2k only. */ + VALID_INHERIT_FLAGS = 0x1f, + + /* The audit flags. */ + SUCCESSFUL_ACCESS_ACE_FLAG = 0x40, + FAILED_ACCESS_ACE_FLAG = 0x80, +} __attribute__ ((__packed__)); + +typedef u8 ACE_FLAGS; + +/* + * An ACE is an access-control entry in an access-control list (ACL). + * An ACE defines access to an object for a specific user or group or defines + * the types of access that generate system-administration messages or alarms + * for a specific user or group. The user or group is identified by a security + * identifier (SID). + * + * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary), + * which specifies the type and size of the ACE. The format of the subsequent + * data depends on the ACE type. + */ +typedef struct { +/*Ofs*/ +/* 0*/ ACE_TYPES type; /* Type of the ACE. */ +/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. */ +/* 2*/ le16 size; /* Size in bytes of the ACE. */ +} __attribute__ ((__packed__)) ACE_HEADER; + +/* + * The access mask (32-bit). Defines the access rights. + * + * The specific rights (bits 0 to 15). These depend on the type of the object + * being secured by the ACE. + */ +enum { + /* Specific rights for files and directories are as follows: */ + + /* Right to read data from the file. (FILE) */ + FILE_READ_DATA = cpu_to_le32(0x00000001), + /* Right to list contents of a directory. (DIRECTORY) */ + FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), + + /* Right to write data to the file. (FILE) */ + FILE_WRITE_DATA = cpu_to_le32(0x00000002), + /* Right to create a file in the directory. (DIRECTORY) */ + FILE_ADD_FILE = cpu_to_le32(0x00000002), + + /* Right to append data to the file. (FILE) */ + FILE_APPEND_DATA = cpu_to_le32(0x00000004), + /* Right to create a subdirectory. (DIRECTORY) */ + FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), + + /* Right to read extended attributes. (FILE/DIRECTORY) */ + FILE_READ_EA = cpu_to_le32(0x00000008), + + /* Right to write extended attributes. (FILE/DIRECTORY) */ + FILE_WRITE_EA = cpu_to_le32(0x00000010), + + /* Right to execute a file. (FILE) */ + FILE_EXECUTE = cpu_to_le32(0x00000020), + /* Right to traverse the directory. (DIRECTORY) */ + FILE_TRAVERSE = cpu_to_le32(0x00000020), + + /* + * Right to delete a directory and all the files it contains (its + * children), even if the files are read-only. (DIRECTORY) + */ + FILE_DELETE_CHILD = cpu_to_le32(0x00000040), + + /* Right to read file attributes. (FILE/DIRECTORY) */ + FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), + + /* Right to change file attributes. (FILE/DIRECTORY) */ + FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), + + /* + * The standard rights (bits 16 to 23). These are independent of the + * type of object being secured. + */ + + /* Right to delete the object. */ + DELETE = cpu_to_le32(0x00010000), + + /* + * Right to read the information in the object's security descriptor, + * not including the information in the SACL, i.e. right to read the + * security descriptor and owner. + */ + READ_CONTROL = cpu_to_le32(0x00020000), + + /* Right to modify the DACL in the object's security descriptor. */ + WRITE_DAC = cpu_to_le32(0x00040000), + + /* Right to change the owner in the object's security descriptor. */ + WRITE_OWNER = cpu_to_le32(0x00080000), + + /* + * Right to use the object for synchronization. Enables a process to + * wait until the object is in the signalled state. Some object types + * do not support this access right. + */ + SYNCHRONIZE = cpu_to_le32(0x00100000), + + /* + * The following STANDARD_RIGHTS_* are combinations of the above for + * convenience and are defined by the Win32 API. + */ + + /* These are currently defined to READ_CONTROL. */ + STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), + STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), + + /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ + STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), + + /* + * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and + * SYNCHRONIZE access. + */ + STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), + + /* + * The access system ACL and maximum allowed access types (bits 24 to + * 25, bits 26 to 27 are reserved). + */ + ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), + MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), + + /* + * The generic rights (bits 28 to 31). These map onto the standard and + * specific rights. + */ + + /* Read, write, and execute access. */ + GENERIC_ALL = cpu_to_le32(0x10000000), + + /* Execute access. */ + GENERIC_EXECUTE = cpu_to_le32(0x20000000), + + /* + * Write access. For files, this maps onto: + * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA | + * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE + * For directories, the mapping has the same numerical value. See + * above for the descriptions of the rights granted. + */ + GENERIC_WRITE = cpu_to_le32(0x40000000), + + /* + * Read access. For files, this maps onto: + * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA | + * STANDARD_RIGHTS_READ | SYNCHRONIZE + * For directories, the mapping has the same numberical value. See + * above for the descriptions of the rights granted. + */ + GENERIC_READ = cpu_to_le32(0x80000000), +}; + +typedef le32 ACCESS_MASK; + +/* + * The generic mapping array. Used to denote the mapping of each generic + * access right to a specific access mask. + * + * FIXME: What exactly is this and what is it for? (AIA) + */ +typedef struct { + ACCESS_MASK generic_read; + ACCESS_MASK generic_write; + ACCESS_MASK generic_execute; + ACCESS_MASK generic_all; +} __attribute__ ((__packed__)) GENERIC_MAPPING; + +/* + * The predefined ACE type structures are as defined below. + */ + +/* + * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE + */ +typedef struct { +/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ + ACE_TYPES type; /* Type of the ACE. */ + ACE_FLAGS flags; /* Flags describing the ACE. */ + le16 size; /* Size in bytes of the ACE. */ +/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ + +/* 8*/ SID sid; /* The SID associated with the ACE. */ +} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, + SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE; + +/* + * The object ACE flags (32-bit). + */ +enum { + ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1), + ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), +}; + +typedef le32 OBJECT_ACE_FLAGS; + +typedef struct { +/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ + ACE_TYPES type; /* Type of the ACE. */ + ACE_FLAGS flags; /* Flags describing the ACE. */ + le16 size; /* Size in bytes of the ACE. */ +/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ + +/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */ +/* 12*/ GUID object_type; +/* 28*/ GUID inherited_object_type; + +/* 44*/ SID sid; /* The SID associated with the ACE. */ +} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE, + ACCESS_DENIED_OBJECT_ACE, + SYSTEM_AUDIT_OBJECT_ACE, + SYSTEM_ALARM_OBJECT_ACE; + +/* + * An ACL is an access-control list (ACL). + * An ACL starts with an ACL header structure, which specifies the size of + * the ACL and the number of ACEs it contains. The ACL header is followed by + * zero or more access control entries (ACEs). The ACL as well as each ACE + * are aligned on 4-byte boundaries. + */ +typedef struct { + u8 revision; /* Revision of this ACL. */ + u8 alignment1; + le16 size; /* Allocated space in bytes for ACL. Includes this + header, the ACEs and the remaining free space. */ + le16 ace_count; /* Number of ACEs in the ACL. */ + le16 alignment2; +/* sizeof() = 8 bytes */ +} __attribute__ ((__packed__)) ACL; + +/* + * Current constants for ACLs. + */ +typedef enum { + /* Current revision. */ + ACL_REVISION = 2, + ACL_REVISION_DS = 4, + + /* History of revisions. */ + ACL_REVISION1 = 1, + MIN_ACL_REVISION = 2, + ACL_REVISION2 = 2, + ACL_REVISION3 = 3, + ACL_REVISION4 = 4, + MAX_ACL_REVISION = 4, +} ACL_CONSTANTS; + +/* + * The security descriptor control flags (16-bit). + * + * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID + * pointed to by the Owner field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the SID with + * respect to inheritance of an owner. + * + * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in + * the Group field was provided by a defaulting mechanism rather than + * explicitly provided by the original provider of the security + * descriptor. This may affect the treatment of the SID with respect to + * inheritance of a primary group. + * + * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security + * descriptor contains a discretionary ACL. If this flag is set and the + * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is + * explicitly being specified. + * + * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL + * pointed to by the Dacl field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the ACL with + * respect to inheritance of an ACL. This flag is ignored if the + * DaclPresent flag is not set. + * + * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security + * descriptor contains a system ACL pointed to by the Sacl field. If this + * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then + * an empty (but present) ACL is being specified. + * + * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL + * pointed to by the Sacl field was provided by a defaulting mechanism + * rather than explicitly provided by the original provider of the + * security descriptor. This may affect the treatment of the ACL with + * respect to inheritance of an ACL. This flag is ignored if the + * SaclPresent flag is not set. + * + * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security + * descriptor is in self-relative form. In this form, all fields of the + * security descriptor are contiguous in memory and all pointer fields are + * expressed as offsets from the beginning of the security descriptor. + */ +enum { + SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), + SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), + SE_DACL_PRESENT = cpu_to_le16(0x0004), + SE_DACL_DEFAULTED = cpu_to_le16(0x0008), + + SE_SACL_PRESENT = cpu_to_le16(0x0010), + SE_SACL_DEFAULTED = cpu_to_le16(0x0020), + + SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), + SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), + SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), + SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), + + SE_DACL_PROTECTED = cpu_to_le16(0x1000), + SE_SACL_PROTECTED = cpu_to_le16(0x2000), + SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), + SE_SELF_RELATIVE = cpu_to_le16(0x8000) +} __attribute__ ((__packed__)); + +typedef le16 SECURITY_DESCRIPTOR_CONTROL; + +/* + * Self-relative security descriptor. Contains the owner and group SIDs as well + * as the sacl and dacl ACLs inside the security descriptor itself. + */ +typedef struct { + u8 revision; /* Revision level of the security descriptor. */ + u8 alignment; + SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of + the descriptor as well as the following fields. */ + le32 owner; /* Byte offset to a SID representing an object's + owner. If this is NULL, no owner SID is present in + the descriptor. */ + le32 group; /* Byte offset to a SID representing an object's + primary group. If this is NULL, no primary group + SID is present in the descriptor. */ + le32 sacl; /* Byte offset to a system ACL. Only valid, if + SE_SACL_PRESENT is set in the control field. If + SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + is specified. */ + le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if + SE_DACL_PRESENT is set in the control field. If + SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + (unconditionally granting access) is specified. */ +/* sizeof() = 0x14 bytes */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE; + +/* + * Absolute security descriptor. Does not contain the owner and group SIDs, nor + * the sacl and dacl ACLs inside the security descriptor. Instead, it contains + * pointers to these structures in memory. Obviously, absolute security + * descriptors are only useful for in memory representations of security + * descriptors. On disk, a self-relative security descriptor is used. + */ +typedef struct { + u8 revision; /* Revision level of the security descriptor. */ + u8 alignment; + SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of + the descriptor as well as the following fields. */ + SID *owner; /* Points to a SID representing an object's owner. If + this is NULL, no owner SID is present in the + descriptor. */ + SID *group; /* Points to a SID representing an object's primary + group. If this is NULL, no primary group SID is + present in the descriptor. */ + ACL *sacl; /* Points to a system ACL. Only valid, if + SE_SACL_PRESENT is set in the control field. If + SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL + is specified. */ + ACL *dacl; /* Points to a discretionary ACL. Only valid, if + SE_DACL_PRESENT is set in the control field. If + SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL + (unconditionally granting access) is specified. */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR; + +/* + * Current constants for security descriptors. + */ +typedef enum { + /* Current revision. */ + SECURITY_DESCRIPTOR_REVISION = 1, + SECURITY_DESCRIPTOR_REVISION1 = 1, + + /* The sizes of both the absolute and relative security descriptors is + the same as pointers, at least on ia32 architecture are 32-bit. */ + SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR), +} SECURITY_DESCRIPTOR_CONSTANTS; + +/* + * Attribute: Security descriptor (0x50). A standard self-relative security + * descriptor. + * + * NOTE: Can be resident or non-resident. + * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally + * in FILE_Secure and the correct descriptor is found using the security_id + * from the standard information attribute. + */ +typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; + +/* + * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one + * referenced instance of each unique security descriptor is stored. + * + * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It + * does, however, contain two indexes ($SDH and $SII) as well as a named data + * stream ($SDS). + * + * Every unique security descriptor is assigned a unique security identifier + * (security_id, not to be confused with a SID). The security_id is unique for + * the NTFS volume and is used as an index into the $SII index, which maps + * security_ids to the security descriptor's storage location within the $SDS + * data attribute. The $SII index is sorted by ascending security_id. + * + * A simple hash is computed from each security descriptor. This hash is used + * as an index into the $SDH index, which maps security descriptor hashes to + * the security descriptor's storage location within the $SDS data attribute. + * The $SDH index is sorted by security descriptor hash and is stored in a B+ + * tree. When searching $SDH (with the intent of determining whether or not a + * new security descriptor is already present in the $SDS data stream), if a + * matching hash is found, but the security descriptors do not match, the + * search in the $SDH index is continued, searching for a next matching hash. + * + * When a precise match is found, the security_id coresponding to the security + * descriptor in the $SDS attribute is read from the found $SDH index entry and + * is stored in the $STANDARD_INFORMATION attribute of the file/directory to + * which the security descriptor is being applied. The $STANDARD_INFORMATION + * attribute is present in all base mft records (i.e. in all files and + * directories). + * + * If a match is not found, the security descriptor is assigned a new unique + * security_id and is added to the $SDS data attribute. Then, entries + * referencing the this security descriptor in the $SDS data attribute are + * added to the $SDH and $SII indexes. + * + * Note: Entries are never deleted from FILE_Secure, even if nothing + * references an entry any more. + */ + +/* + * This header precedes each security descriptor in the $SDS data stream. + * This is also the index entry data part of both the $SII and $SDH indexes. + */ +typedef struct { + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ + le64 offset; /* Byte offset of this entry in the $SDS stream. */ + le32 length; /* Size in bytes of this entry in $SDS stream. */ +} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; + +/* + * The $SDS data stream contains the security descriptors, aligned on 16-byte + * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot + * cross 256kib boundaries (this restriction is imposed by the Windows cache + * manager). Each security descriptor is contained in a SDS_ENTRY structure. + * Also, each security descriptor is stored twice in the $SDS stream with a + * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) + * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the + * first copy of the security descriptor will be at offset 0x51d0 in the + * $SDS data stream and the second copy will be at offset 0x451d0. + */ +typedef struct { +/*Ofs*/ +/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like + unnamed structs. */ + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ + le64 offset; /* Byte offset of this entry in the $SDS stream. */ + le32 length; /* Size in bytes of this entry in $SDS stream. */ +/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security + descriptor. */ +} __attribute__ ((__packed__)) SDS_ENTRY; + +/* + * The index entry key used in the $SII index. The collation type is + * COLLATION_NTOFS_ULONG. + */ +typedef struct { + le32 security_id; /* The security_id assigned to the descriptor. */ +} __attribute__ ((__packed__)) SII_INDEX_KEY; + +/* + * The index entry key used in the $SDH index. The keys are sorted first by + * hash and then by security_id. The collation rule is + * COLLATION_NTOFS_SECURITY_HASH. + */ +typedef struct { + le32 hash; /* Hash of the security descriptor. */ + le32 security_id; /* The security_id assigned to the descriptor. */ +} __attribute__ ((__packed__)) SDH_INDEX_KEY; + +/* + * Attribute: Volume name (0x60). + * + * NOTE: Always resident. + * NOTE: Present only in FILE_Volume. + */ +typedef struct { + ntfschar name[0]; /* The name of the volume in Unicode. */ +} __attribute__ ((__packed__)) VOLUME_NAME; + +/* + * Possible flags for the volume (16-bit). + */ +enum { + VOLUME_IS_DIRTY = cpu_to_le16(0x0001), + VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), + VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), + VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), + + VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), + VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), + + VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), + VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), + + VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), + + /* To make our life easier when checking if we must mount read-only. */ + VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), +} __attribute__ ((__packed__)); + +typedef le16 VOLUME_FLAGS; + +/* + * Attribute: Volume information (0x70). + * + * NOTE: Always resident. + * NOTE: Present only in FILE_Volume. + * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses + * NTFS 1.2. I haven't personally seen other values yet. + */ +typedef struct { + le64 reserved; /* Not used (yet?). */ + u8 major_ver; /* Major version of the ntfs format. */ + u8 minor_ver; /* Minor version of the ntfs format. */ + VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */ +} __attribute__ ((__packed__)) VOLUME_INFORMATION; + +/* + * Attribute: Data attribute (0x80). + * + * NOTE: Can be resident or non-resident. + * + * Data contents of a file (i.e. the unnamed stream) or of a named stream. + */ +typedef struct { + u8 data[0]; /* The file's data contents. */ +} __attribute__ ((__packed__)) DATA_ATTR; + +/* + * Index header flags (8-bit). + */ +enum { + /* + * When index header is in an index root attribute: + */ + SMALL_INDEX = 0, /* The index is small enough to fit inside the index + root attribute and there is no index allocation + attribute present. */ + LARGE_INDEX = 1, /* The index is too large to fit in the index root + attribute and/or an index allocation attribute is + present. */ + /* + * When index header is in an index block, i.e. is part of index + * allocation attribute: + */ + LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes + branching off it. */ + INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf + node. */ + NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */ +} __attribute__ ((__packed__)); + +typedef u8 INDEX_HEADER_FLAGS; + +/* + * This is the header for indexes, describing the INDEX_ENTRY records, which + * follow the INDEX_HEADER. Together the index header and the index entries + * make up a complete index. + * + * IMPORTANT NOTE: The offset, length and size structure members are counted + * relative to the start of the index header structure and not relative to the + * start of the index root or index allocation structures themselves. + */ +typedef struct { + le32 entries_offset; /* Byte offset to first INDEX_ENTRY + aligned to 8-byte boundary. */ + le32 index_length; /* Data size of the index in bytes, + i.e. bytes used from allocated + size, aligned to 8-byte boundary. */ + le32 allocated_size; /* Byte size of this index (block), + multiple of 8 bytes. */ + /* NOTE: For the index root attribute, the above two numbers are always + equal, as the attribute is resident and it is resized as needed. In + the case of the index allocation attribute the attribute is not + resident and hence the allocated_size is a fixed value and must + equal the index_block_size specified by the INDEX_ROOT attribute + corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK + belongs to. */ + INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */ + u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ +} __attribute__ ((__packed__)) INDEX_HEADER; + +/* + * Attribute: Index root (0x90). + * + * NOTE: Always resident. + * + * This is followed by a sequence of index entries (INDEX_ENTRY structures) + * as described by the index header. + * + * When a directory is small enough to fit inside the index root then this + * is the only attribute describing the directory. When the directory is too + * large to fit in the index root, on the other hand, two additional attributes + * are present: an index allocation attribute, containing sub-nodes of the B+ + * directory tree (see below), and a bitmap attribute, describing which virtual + * cluster numbers (vcns) in the index allocation attribute are in use by an + * index block. + * + * NOTE: The root directory (FILE_root) contains an entry for itself. Other + * directories do not contain entries for themselves, though. + */ +typedef struct { + ATTR_TYPE type; /* Type of the indexed attribute. Is + $FILE_NAME for directories, zero + for view indexes. No other values + allowed. */ + COLLATION_RULE collation_rule; /* Collation rule used to sort the + index entries. If type is $FILE_NAME, + this must be COLLATION_FILE_NAME. */ + le32 index_block_size; /* Size of each index block in bytes (in + the index allocation attribute). */ + u8 clusters_per_index_block; /* Cluster size of each index block (in + the index allocation attribute), when + an index block is >= than a cluster, + otherwise this will be the log of + the size (like how the encoding of + the mft record size and the index + record size found in the boot sector + work). Has to be a power of 2. */ + u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ + INDEX_HEADER index; /* Index header describing the + following index entries. */ +} __attribute__ ((__packed__)) INDEX_ROOT; + +/* + * Attribute: Index allocation (0xa0). + * + * NOTE: Always non-resident (doesn't make sense to be resident anyway!). + * + * This is an array of index blocks. Each index block starts with an + * INDEX_BLOCK structure containing an index header, followed by a sequence of + * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER. + */ +typedef struct { +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ + NTFS_RECORD_TYPE magic; /* Magic is "INDX". */ + le16 usa_ofs; /* See NTFS_RECORD definition. */ + le16 usa_count; /* See NTFS_RECORD definition. */ + +/* 8*/ sle64 lsn; /* $LogFile sequence number of the last + modification of this index block. */ +/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block. + If the cluster_size on the volume is <= the + index_block_size of the directory, + index_block_vcn counts in units of clusters, + and in units of sectors otherwise. */ +/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */ +/* sizeof()= 40 (0x28) bytes */ +/* + * When creating the index block, we place the update sequence array at this + * offset, i.e. before we start with the index entries. This also makes sense, + * otherwise we could run into problems with the update sequence array + * containing in itself the last two bytes of a sector which would mean that + * multi sector transfer protection wouldn't work. As you can't protect data + * by overwriting it since you then can't get it back... + * When reading use the data from the ntfs record header. + */ +} __attribute__ ((__packed__)) INDEX_BLOCK; + +typedef INDEX_BLOCK INDEX_ALLOCATION; + +/* + * The system file FILE_Extend/$Reparse contains an index named $R listing + * all reparse points on the volume. The index entry keys are as defined + * below. Note, that there is no index data associated with the index entries. + * + * The index entries are sorted by the index key file_id. The collation rule is + * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the + * primary key / is not a key at all. (AIA) + */ +typedef struct { + le32 reparse_tag; /* Reparse point type (inc. flags). */ + leMFT_REF file_id; /* Mft record of the file containing the + reparse point attribute. */ +} __attribute__ ((__packed__)) REPARSE_INDEX_KEY; + +/* + * Quota flags (32-bit). + * + * The user quota flags. Names explain meaning. + */ +enum { + QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), + QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), + QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), + + QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), + /* This is a bit mask for the user quota flags. */ + + /* + * These flags are only present in the quota defaults index entry, i.e. + * in the entry where owner_id = QUOTA_DEFAULTS_ID. + */ + QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), + QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), + QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), + QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), + + QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), + QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), + QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), + QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), +}; + +typedef le32 QUOTA_FLAGS; + +/* + * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas + * are on a per volume and per user basis. + * + * The $Q index contains one entry for each existing user_id on the volume. The + * index key is the user_id of the user/group owning this quota control entry, + * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the + * owner_id, is found in the standard information attribute. The collation rule + * for $Q is COLLATION_NTOFS_ULONG. + * + * The $O index contains one entry for each user/group who has been assigned + * a quota on that volume. The index key holds the SID of the user_id the + * entry belongs to, i.e. the owner_id. The collation rule for $O is + * COLLATION_NTOFS_SID. + * + * The $O index entry data is the user_id of the user corresponding to the SID. + * This user_id is used as an index into $Q to find the quota control entry + * associated with the SID. + * + * The $Q index entry data is the quota control entry and is defined below. + */ +typedef struct { + le32 version; /* Currently equals 2. */ + QUOTA_FLAGS flags; /* Flags describing this quota entry. */ + le64 bytes_used; /* How many bytes of the quota are in use. */ + sle64 change_time; /* Last time this quota entry was changed. */ + sle64 threshold; /* Soft quota (-1 if not limited). */ + sle64 limit; /* Hard quota (-1 if not limited). */ + sle64 exceeded_time; /* How long the soft quota has been exceeded. */ + SID sid; /* The SID of the user/object associated with + this quota entry. Equals zero for the quota + defaults entry (and in fact on a WinXP + volume, it is not present at all). */ +} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY; + +/* + * Predefined owner_id values (32-bit). + */ +enum { + QUOTA_INVALID_ID = cpu_to_le32(0x00000000), + QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001), + QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100), +}; + +/* + * Current constants for quota control entries. + */ +typedef enum { + /* Current version. */ + QUOTA_VERSION = 2, +} QUOTA_CONTROL_ENTRY_CONSTANTS; + +/* + * Index entry flags (16-bit). + */ +enum { + INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a + sub-node, i.e. a reference to an index block in form of + a virtual cluster number (see below). */ + INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last + entry in an index block. The index entry does not + represent a file but it can point to a sub-node. */ + + INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force + enum bit width to 16-bit. */ +} __attribute__ ((__packed__)); + +typedef le16 INDEX_ENTRY_FLAGS; + +/* + * This the index entry header (see below). + */ +typedef struct { +/* 0*/ union { + struct { /* Only valid when INDEX_ENTRY_END is not set. */ + leMFT_REF indexed_file; /* The mft reference of the file + described by this index + entry. Used for directory + indexes. */ + } __attribute__ ((__packed__)) dir; + struct { /* Used for views/indexes to find the entry's data. */ + le16 data_offset; /* Data byte offset from this + INDEX_ENTRY. Follows the + index key. */ + le16 data_length; /* Data length in bytes. */ + le32 reservedV; /* Reserved (zero). */ + } __attribute__ ((__packed__)) vi; + } __attribute__ ((__packed__)) data; +/* 8*/ le16 length; /* Byte size of this index entry, multiple of + 8-bytes. */ +/* 10*/ le16 key_length; /* Byte size of the key value, which is in the + index entry. It follows field reserved. Not + multiple of 8-bytes. */ +/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ +/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */ +/* sizeof() = 16 bytes */ +} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER; + +/* + * This is an index entry. A sequence of such entries follows each INDEX_HEADER + * structure. Together they make up a complete index. The index follows either + * an index root attribute or an index allocation attribute. + * + * NOTE: Before NTFS 3.0 only filename attributes were indexed. + */ +typedef struct { +/*Ofs*/ +/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */ + union { + struct { /* Only valid when INDEX_ENTRY_END is not set. */ + leMFT_REF indexed_file; /* The mft reference of the file + described by this index + entry. Used for directory + indexes. */ + } __attribute__ ((__packed__)) dir; + struct { /* Used for views/indexes to find the entry's data. */ + le16 data_offset; /* Data byte offset from this + INDEX_ENTRY. Follows the + index key. */ + le16 data_length; /* Data length in bytes. */ + le32 reservedV; /* Reserved (zero). */ + } __attribute__ ((__packed__)) vi; + } __attribute__ ((__packed__)) data; + le16 length; /* Byte size of this index entry, multiple of + 8-bytes. */ + le16 key_length; /* Byte size of the key value, which is in the + index entry. It follows field reserved. Not + multiple of 8-bytes. */ + INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ + le16 reserved; /* Reserved/align to 8-byte boundary. */ + +/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present + if INDEX_ENTRY_END bit in flags is not set. NOTE: On + NTFS versions before 3.0 the only valid key is the + FILE_NAME_ATTR. On NTFS 3.0+ the following + additional index keys are defined: */ + FILE_NAME_ATTR file_name;/* $I30 index in directories. */ + SII_INDEX_KEY sii; /* $SII index in $Secure. */ + SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */ + GUID object_id; /* $O index in FILE_Extend/$ObjId: The + object_id of the mft record found in + the data part of the index. */ + REPARSE_INDEX_KEY reparse; /* $R index in + FILE_Extend/$Reparse. */ + SID sid; /* $O index in FILE_Extend/$Quota: + SID of the owner of the user_id. */ + le32 owner_id; /* $Q index in FILE_Extend/$Quota: + user_id of the owner of the quota + control entry in the data part of + the index. */ + } __attribute__ ((__packed__)) key; + /* The (optional) index data is inserted here when creating. */ + // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last + // eight bytes of this index entry contain the virtual + // cluster number of the index block that holds the + // entries immediately preceding the current entry (the + // vcn references the corresponding cluster in the data + // of the non-resident index allocation attribute). If + // the key_length is zero, then the vcn immediately + // follows the INDEX_ENTRY_HEADER. Regardless of + // key_length, the address of the 8-byte boundary + // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by + // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN), + // where sizeof(VCN) can be hardcoded as 8 if wanted. */ +} __attribute__ ((__packed__)) INDEX_ENTRY; + +/* + * Attribute: Bitmap (0xb0). + * + * Contains an array of bits (aka a bitfield). + * + * When used in conjunction with the index allocation attribute, each bit + * corresponds to one index block within the index allocation attribute. Thus + * the number of bits in the bitmap * index block size / cluster size is the + * number of clusters in the index allocation attribute. + */ +typedef struct { + u8 bitmap[0]; /* Array of bits. */ +} __attribute__ ((__packed__)) BITMAP_ATTR; + +/* + * The reparse point tag defines the type of the reparse point. It also + * includes several flags, which further describe the reparse point. + * + * The reparse point tag is an unsigned 32-bit value divided in three parts: + * + * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of + * the reparse point. + * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use. + * 3. The most significant three bits are flags describing the reparse point. + * They are defined as follows: + * bit 29: Name surrogate bit. If set, the filename is an alias for + * another object in the system. + * bit 30: High-latency bit. If set, accessing the first byte of data will + * be slow. (E.g. the data is stored on a tape drive.) + * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User + * defined tags have to use zero here. + * + * These are the predefined reparse point tags: + */ +enum { + IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), + IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), + IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), + + IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000), + IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), + IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), + + IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), + IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), + IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), + IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), + + IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), + + IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), + + IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), + + IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), +}; + +/* + * Attribute: Reparse point (0xc0). + * + * NOTE: Can be resident or non-resident. + */ +typedef struct { + le32 reparse_tag; /* Reparse point type (inc. flags). */ + le16 reparse_data_length; /* Byte size of reparse data. */ + le16 reserved; /* Align to 8-byte boundary. */ + u8 reparse_data[0]; /* Meaning depends on reparse_tag. */ +} __attribute__ ((__packed__)) REPARSE_POINT; + +/* + * Attribute: Extended attribute (EA) information (0xd0). + * + * NOTE: Always resident. (Is this true???) + */ +typedef struct { + le16 ea_length; /* Byte size of the packed extended + attributes. */ + le16 need_ea_count; /* The number of extended attributes which have + the NEED_EA bit set. */ + le32 ea_query_length; /* Byte size of the buffer required to query + the extended attributes when calling + ZwQueryEaFile() in Windows NT/2k. I.e. the + byte size of the unpacked extended + attributes. */ +} __attribute__ ((__packed__)) EA_INFORMATION; + +/* + * Extended attribute flags (8-bit). + */ +enum { + NEED_EA = 0x80 /* If set the file to which the EA belongs + cannot be interpreted without understanding + the associates extended attributes. */ +} __attribute__ ((__packed__)); + +typedef u8 EA_FLAGS; + +/* + * Attribute: Extended attribute (EA) (0xe0). + * + * NOTE: Can be resident or non-resident. + * + * Like the attribute list and the index buffer list, the EA attribute value is + * a sequence of EA_ATTR variable length records. + */ +typedef struct { + le32 next_entry_offset; /* Offset to the next EA_ATTR. */ + EA_FLAGS flags; /* Flags describing the EA. */ + u8 ea_name_length; /* Length of the name of the EA in bytes + excluding the '\0' byte terminator. */ + le16 ea_value_length; /* Byte size of the EA's value. */ + u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not + Unicode and it is zero terminated. */ + u8 ea_value[0]; /* The value of the EA. Immediately follows + the name. */ +} __attribute__ ((__packed__)) EA_ATTR; + +/* + * Attribute: Property set (0xf0). + * + * Intended to support Native Structure Storage (NSS) - a feature removed from + * NTFS 3.0 during beta testing. + */ +typedef struct { + /* Irrelevant as feature unused. */ +} __attribute__ ((__packed__)) PROPERTY_SET; + +/* + * Attribute: Logged utility stream (0x100). + * + * NOTE: Can be resident or non-resident. + * + * Operations on this attribute are logged to the journal ($LogFile) like + * normal metadata changes. + * + * Used by the Encrypting File System (EFS). All encrypted files have this + * attribute with the name $EFS. + */ +typedef struct { + /* Can be anything the creator chooses. */ + /* EFS uses it as follows: */ + // FIXME: Type this info, verifying it along the way. (AIA) +} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR; + +#endif /* _LINUX_NTFS_LAYOUT_H */ diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c new file mode 100644 index 000000000..eda9972e6 --- /dev/null +++ b/fs/ntfs/lcnalloc.c @@ -0,0 +1,1000 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + */ + +#ifdef NTFS_RW + +#include <linux/pagemap.h> + +#include "lcnalloc.h" +#include "debug.h" +#include "bitmap.h" +#include "inode.h" +#include "volume.h" +#include "attrib.h" +#include "malloc.h" +#include "aops.h" +#include "ntfs.h" + +/** + * ntfs_cluster_free_from_rl_nolock - free clusters from runlist + * @vol: mounted ntfs volume on which to free the clusters + * @rl: runlist describing the clusters to free + * + * Free all the clusters described by the runlist @rl on the volume @vol. In + * the case of an error being returned, at least some of the clusters were not + * freed. + * + * Return 0 on success and -errno on error. + * + * Locking: - The volume lcn bitmap must be locked for writing on entry and is + * left locked on return. + */ +int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, + const runlist_element *rl) +{ + struct inode *lcnbmp_vi = vol->lcnbmp_ino; + int ret = 0; + + ntfs_debug("Entering."); + if (!rl) + return 0; + for (; rl->length; rl++) { + int err; + + if (rl->lcn < 0) + continue; + err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length); + if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err)) + ret = err; + } + ntfs_debug("Done."); + return ret; +} + +/** + * ntfs_cluster_alloc - allocate clusters on an ntfs volume + * @vol: mounted ntfs volume on which to allocate the clusters + * @start_vcn: vcn to use for the first allocated cluster + * @count: number of clusters to allocate + * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none) + * @zone: zone from which to allocate the clusters + * @is_extension: if 'true', this is an attribute extension + * + * Allocate @count clusters preferably starting at cluster @start_lcn or at the + * current allocator position if @start_lcn is -1, on the mounted ntfs volume + * @vol. @zone is either DATA_ZONE for allocation of normal clusters or + * MFT_ZONE for allocation of clusters for the master file table, i.e. the + * $MFT/$DATA attribute. + * + * @start_vcn specifies the vcn of the first allocated cluster. This makes + * merging the resulting runlist with the old runlist easier. + * + * If @is_extension is 'true', the caller is allocating clusters to extend an + * attribute and if it is 'false', the caller is allocating clusters to fill a + * hole in an attribute. Practically the difference is that if @is_extension + * is 'true' the returned runlist will be terminated with LCN_ENOENT and if + * @is_extension is 'false' the runlist will be terminated with + * LCN_RL_NOT_MAPPED. + * + * You need to check the return value with IS_ERR(). If this is false, the + * function was successful and the return value is a runlist describing the + * allocated cluster(s). If IS_ERR() is true, the function failed and + * PTR_ERR() gives you the error code. + * + * Notes on the allocation algorithm + * ================================= + * + * There are two data zones. First is the area between the end of the mft zone + * and the end of the volume, and second is the area between the start of the + * volume and the start of the mft zone. On unmodified/standard NTFS 1.x + * volumes, the second data zone does not exist due to the mft zone being + * expanded to cover the start of the volume in order to reserve space for the + * mft bitmap attribute. + * + * This is not the prettiest function but the complexity stems from the need of + * implementing the mft vs data zoned approach and from the fact that we have + * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we + * need to cope with crossing over boundaries of two buffers. Further, the + * fact that the allocator allows for caller supplied hints as to the location + * of where allocation should begin and the fact that the allocator keeps track + * of where in the data zones the next natural allocation should occur, + * contribute to the complexity of the function. But it should all be + * worthwhile, because this allocator should: 1) be a full implementation of + * the MFT zone approach used by Windows NT, 2) cause reduction in + * fragmentation, and 3) be speedy in allocations (the code is not optimized + * for speed, but the algorithm is, so further speed improvements are probably + * possible). + * + * FIXME: We should be monitoring cluster allocation and increment the MFT zone + * size dynamically but this is something for the future. We will just cause + * heavier fragmentation by not doing it and I am not even sure Windows would + * grow the MFT zone dynamically, so it might even be correct not to do this. + * The overhead in doing dynamic MFT zone expansion would be very large and + * unlikely worth the effort. (AIA) + * + * TODO: I have added in double the required zone position pointer wrap around + * logic which can be optimized to having only one of the two logic sets. + * However, having the double logic will work fine, but if we have only one of + * the sets and we get it wrong somewhere, then we get into trouble, so + * removing the duplicate logic requires _very_ careful consideration of _all_ + * possible code paths. So at least for now, I am leaving the double logic - + * better safe than sorry... (AIA) + * + * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + */ +runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn, + const s64 count, const LCN start_lcn, + const NTFS_CLUSTER_ALLOCATION_ZONES zone, + const bool is_extension) +{ + LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn; + LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size; + s64 clusters; + loff_t i_size; + struct inode *lcnbmp_vi; + runlist_element *rl = NULL; + struct address_space *mapping; + struct page *page = NULL; + u8 *buf, *byte; + int err = 0, rlpos, rlsize, buf_size; + u8 pass, done_zones, search_zone, need_writeback = 0, bit; + + ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn " + "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn, + (unsigned long long)count, + (unsigned long long)start_lcn, + zone == MFT_ZONE ? "MFT" : "DATA"); + BUG_ON(!vol); + lcnbmp_vi = vol->lcnbmp_ino; + BUG_ON(!lcnbmp_vi); + BUG_ON(start_vcn < 0); + BUG_ON(count < 0); + BUG_ON(start_lcn < -1); + BUG_ON(zone < FIRST_ZONE); + BUG_ON(zone > LAST_ZONE); + + /* Return NULL if @count is zero. */ + if (!count) + return NULL; + /* Take the lcnbmp lock for writing. */ + down_write(&vol->lcnbmp_lock); + /* + * If no specific @start_lcn was requested, use the current data zone + * position, otherwise use the requested @start_lcn but make sure it + * lies outside the mft zone. Also set done_zones to 0 (no zones done) + * and pass depending on whether we are starting inside a zone (1) or + * at the beginning of a zone (2). If requesting from the MFT_ZONE, + * we either start at the current position within the mft zone or at + * the specified position. If the latter is out of bounds then we start + * at the beginning of the MFT_ZONE. + */ + done_zones = 0; + pass = 1; + /* + * zone_start and zone_end are the current search range. search_zone + * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of + * volume) and 4 for data zone 2 (start of volume till start of mft + * zone). + */ + zone_start = start_lcn; + if (zone_start < 0) { + if (zone == DATA_ZONE) + zone_start = vol->data1_zone_pos; + else + zone_start = vol->mft_zone_pos; + if (!zone_start) { + /* + * Zone starts at beginning of volume which means a + * single pass is sufficient. + */ + pass = 2; + } + } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start && + zone_start < vol->mft_zone_end) { + zone_start = vol->mft_zone_end; + /* + * Starting at beginning of data1_zone which means a single + * pass in this zone is sufficient. + */ + pass = 2; + } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start || + zone_start >= vol->mft_zone_end)) { + zone_start = vol->mft_lcn; + if (!vol->mft_zone_end) + zone_start = 0; + /* + * Starting at beginning of volume which means a single pass + * is sufficient. + */ + pass = 2; + } + if (zone == MFT_ZONE) { + zone_end = vol->mft_zone_end; + search_zone = 1; + } else /* if (zone == DATA_ZONE) */ { + /* Skip searching the mft zone. */ + done_zones |= 1; + if (zone_start >= vol->mft_zone_end) { + zone_end = vol->nr_clusters; + search_zone = 2; + } else { + zone_end = vol->mft_zone_start; + search_zone = 4; + } + } + /* + * bmp_pos is the current bit position inside the bitmap. We use + * bmp_initial_pos to determine whether or not to do a zone switch. + */ + bmp_pos = bmp_initial_pos = zone_start; + + /* Loop until all clusters are allocated, i.e. clusters == 0. */ + clusters = count; + rlpos = rlsize = 0; + mapping = lcnbmp_vi->i_mapping; + i_size = i_size_read(lcnbmp_vi); + while (1) { + ntfs_debug("Start of outer while loop: done_zones 0x%x, " + "search_zone %i, pass %i, zone_start 0x%llx, " + "zone_end 0x%llx, bmp_initial_pos 0x%llx, " + "bmp_pos 0x%llx, rlpos %i, rlsize %i.", + done_zones, search_zone, pass, + (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)bmp_initial_pos, + (unsigned long long)bmp_pos, rlpos, rlsize); + /* Loop until we run out of free clusters. */ + last_read_pos = bmp_pos >> 3; + ntfs_debug("last_read_pos 0x%llx.", + (unsigned long long)last_read_pos); + if (last_read_pos > i_size) { + ntfs_debug("End of attribute reached. " + "Skipping to zone_pass_done."); + goto zone_pass_done; + } + if (likely(page)) { + if (need_writeback) { + ntfs_debug("Marking page dirty."); + flush_dcache_page(page); + set_page_dirty(page); + need_writeback = 0; + } + ntfs_unmap_page(page); + } + page = ntfs_map_page(mapping, last_read_pos >> + PAGE_SHIFT); + if (IS_ERR(page)) { + err = PTR_ERR(page); + ntfs_error(vol->sb, "Failed to map page."); + goto out; + } + buf_size = last_read_pos & ~PAGE_MASK; + buf = page_address(page) + buf_size; + buf_size = PAGE_SIZE - buf_size; + if (unlikely(last_read_pos + buf_size > i_size)) + buf_size = i_size - last_read_pos; + buf_size <<= 3; + lcn = bmp_pos & 7; + bmp_pos &= ~(LCN)7; + ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, " + "bmp_pos 0x%llx, need_writeback %i.", buf_size, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, need_writeback); + while (lcn < buf_size && lcn + bmp_pos < zone_end) { + byte = buf + (lcn >> 3); + ntfs_debug("In inner while loop: buf_size %i, " + "lcn 0x%llx, bmp_pos 0x%llx, " + "need_writeback %i, byte ofs 0x%x, " + "*byte 0x%x.", buf_size, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, + need_writeback, + (unsigned int)(lcn >> 3), + (unsigned int)*byte); + /* Skip full bytes. */ + if (*byte == 0xff) { + lcn = (lcn + 8) & ~(LCN)7; + ntfs_debug("Continuing while loop 1."); + continue; + } + bit = 1 << (lcn & 7); + ntfs_debug("bit 0x%x.", bit); + /* If the bit is already set, go onto the next one. */ + if (*byte & bit) { + lcn++; + ntfs_debug("Continuing while loop 2."); + continue; + } + /* + * Allocate more memory if needed, including space for + * the terminator element. + * ntfs_malloc_nofs() operates on whole pages only. + */ + if ((rlpos + 2) * sizeof(*rl) > rlsize) { + runlist_element *rl2; + + ntfs_debug("Reallocating memory."); + if (!rl) + ntfs_debug("First free bit is at LCN " + "0x%llx.", + (unsigned long long) + (lcn + bmp_pos)); + rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); + if (unlikely(!rl2)) { + err = -ENOMEM; + ntfs_error(vol->sb, "Failed to " + "allocate memory."); + goto out; + } + memcpy(rl2, rl, rlsize); + ntfs_free(rl); + rl = rl2; + rlsize += PAGE_SIZE; + ntfs_debug("Reallocated memory, rlsize 0x%x.", + rlsize); + } + /* Allocate the bitmap bit. */ + *byte |= bit; + /* We need to write this bitmap page to disk. */ + need_writeback = 1; + ntfs_debug("*byte 0x%x, need_writeback is set.", + (unsigned int)*byte); + /* + * Coalesce with previous run if adjacent LCNs. + * Otherwise, append a new run. + */ + ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), " + "prev_lcn 0x%llx, lcn 0x%llx, " + "bmp_pos 0x%llx, prev_run_len 0x%llx, " + "rlpos %i.", + (unsigned long long)(lcn + bmp_pos), + 1ULL, (unsigned long long)prev_lcn, + (unsigned long long)lcn, + (unsigned long long)bmp_pos, + (unsigned long long)prev_run_len, + rlpos); + if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) { + ntfs_debug("Coalescing to run (lcn 0x%llx, " + "len 0x%llx).", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length); + rl[rlpos - 1].length = ++prev_run_len; + ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), " + "prev_run_len 0x%llx.", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length, + (unsigned long long) + prev_run_len); + } else { + if (likely(rlpos)) { + ntfs_debug("Adding new run, (previous " + "run lcn 0x%llx, " + "len 0x%llx).", + (unsigned long long) + rl[rlpos - 1].lcn, + (unsigned long long) + rl[rlpos - 1].length); + rl[rlpos].vcn = rl[rlpos - 1].vcn + + prev_run_len; + } else { + ntfs_debug("Adding new run, is first " + "run."); + rl[rlpos].vcn = start_vcn; + } + rl[rlpos].lcn = prev_lcn = lcn + bmp_pos; + rl[rlpos].length = prev_run_len = 1; + rlpos++; + } + /* Done? */ + if (!--clusters) { + LCN tc; + /* + * Update the current zone position. Positions + * of already scanned zones have been updated + * during the respective zone switches. + */ + tc = lcn + bmp_pos + 1; + ntfs_debug("Done. Updating current zone " + "position, tc 0x%llx, " + "search_zone %i.", + (unsigned long long)tc, + search_zone); + switch (search_zone) { + case 1: + ntfs_debug("Before checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + if (tc >= vol->mft_zone_end) { + vol->mft_zone_pos = + vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } else if ((bmp_initial_pos >= + vol->mft_zone_pos || + tc > vol->mft_zone_pos) + && tc >= vol->mft_lcn) + vol->mft_zone_pos = tc; + ntfs_debug("After checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + break; + case 2: + ntfs_debug("Before checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + if (tc >= vol->nr_clusters) + vol->data1_zone_pos = + vol->mft_zone_end; + else if ((bmp_initial_pos >= + vol->data1_zone_pos || + tc > vol->data1_zone_pos) + && tc >= vol->mft_zone_end) + vol->data1_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + break; + case 4: + ntfs_debug("Before checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + if (tc >= vol->mft_zone_start) + vol->data2_zone_pos = 0; + else if (bmp_initial_pos >= + vol->data2_zone_pos || + tc > vol->data2_zone_pos) + vol->data2_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + break; + default: + BUG(); + } + ntfs_debug("Finished. Going to out."); + goto out; + } + lcn++; + } + bmp_pos += buf_size; + ntfs_debug("After inner while loop: buf_size 0x%x, lcn " + "0x%llx, bmp_pos 0x%llx, need_writeback %i.", + buf_size, (unsigned long long)lcn, + (unsigned long long)bmp_pos, need_writeback); + if (bmp_pos < zone_end) { + ntfs_debug("Continuing outer while loop, " + "bmp_pos 0x%llx, zone_end 0x%llx.", + (unsigned long long)bmp_pos, + (unsigned long long)zone_end); + continue; + } +zone_pass_done: /* Finished with the current zone pass. */ + ntfs_debug("At zone_pass_done, pass %i.", pass); + if (pass == 1) { + /* + * Now do pass 2, scanning the first part of the zone + * we omitted in pass 1. + */ + pass = 2; + zone_end = zone_start; + switch (search_zone) { + case 1: /* mft_zone */ + zone_start = vol->mft_zone_start; + break; + case 2: /* data1_zone */ + zone_start = vol->mft_zone_end; + break; + case 4: /* data2_zone */ + zone_start = 0; + break; + default: + BUG(); + } + /* Sanity check. */ + if (zone_end < zone_start) + zone_end = zone_start; + bmp_pos = zone_start; + ntfs_debug("Continuing outer while loop, pass 2, " + "zone_start 0x%llx, zone_end 0x%llx, " + "bmp_pos 0x%llx.", + (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)bmp_pos); + continue; + } /* pass == 2 */ +done_zones_check: + ntfs_debug("At done_zones_check, search_zone %i, done_zones " + "before 0x%x, done_zones after 0x%x.", + search_zone, done_zones, + done_zones | search_zone); + done_zones |= search_zone; + if (done_zones < 7) { + ntfs_debug("Switching zone."); + /* Now switch to the next zone we haven't done yet. */ + pass = 1; + switch (search_zone) { + case 1: + ntfs_debug("Switching from mft zone to data1 " + "zone."); + /* Update mft zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->mft_zone_end) { + vol->mft_zone_pos = + vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } else if ((bmp_initial_pos >= + vol->mft_zone_pos || + tc > vol->mft_zone_pos) + && tc >= vol->mft_lcn) + vol->mft_zone_pos = tc; + ntfs_debug("After checks, " + "vol->mft_zone_pos " + "0x%llx.", + (unsigned long long) + vol->mft_zone_pos); + } + /* Switch from mft zone to data1 zone. */ +switch_to_data1_zone: search_zone = 2; + zone_start = bmp_initial_pos = + vol->data1_zone_pos; + zone_end = vol->nr_clusters; + if (zone_start == vol->mft_zone_end) + pass = 2; + if (zone_start >= zone_end) { + vol->data1_zone_pos = zone_start = + vol->mft_zone_end; + pass = 2; + } + break; + case 2: + ntfs_debug("Switching from data1 zone to " + "data2 zone."); + /* Update data1 zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->nr_clusters) + vol->data1_zone_pos = + vol->mft_zone_end; + else if ((bmp_initial_pos >= + vol->data1_zone_pos || + tc > vol->data1_zone_pos) + && tc >= vol->mft_zone_end) + vol->data1_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data1_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data1_zone_pos); + } + /* Switch from data1 zone to data2 zone. */ + search_zone = 4; + zone_start = bmp_initial_pos = + vol->data2_zone_pos; + zone_end = vol->mft_zone_start; + if (!zone_start) + pass = 2; + if (zone_start >= zone_end) { + vol->data2_zone_pos = zone_start = + bmp_initial_pos = 0; + pass = 2; + } + break; + case 4: + ntfs_debug("Switching from data2 zone to " + "data1 zone."); + /* Update data2 zone position. */ + if (rlpos) { + LCN tc; + + ntfs_debug("Before checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + tc = rl[rlpos - 1].lcn + + rl[rlpos - 1].length; + if (tc >= vol->mft_zone_start) + vol->data2_zone_pos = 0; + else if (bmp_initial_pos >= + vol->data2_zone_pos || + tc > vol->data2_zone_pos) + vol->data2_zone_pos = tc; + ntfs_debug("After checks, " + "vol->data2_zone_pos " + "0x%llx.", + (unsigned long long) + vol->data2_zone_pos); + } + /* Switch from data2 zone to data1 zone. */ + goto switch_to_data1_zone; + default: + BUG(); + } + ntfs_debug("After zone switch, search_zone %i, " + "pass %i, bmp_initial_pos 0x%llx, " + "zone_start 0x%llx, zone_end 0x%llx.", + search_zone, pass, + (unsigned long long)bmp_initial_pos, + (unsigned long long)zone_start, + (unsigned long long)zone_end); + bmp_pos = zone_start; + if (zone_start == zone_end) { + ntfs_debug("Empty zone, going to " + "done_zones_check."); + /* Empty zone. Don't bother searching it. */ + goto done_zones_check; + } + ntfs_debug("Continuing outer while loop."); + continue; + } /* done_zones == 7 */ + ntfs_debug("All zones are finished."); + /* + * All zones are finished! If DATA_ZONE, shrink mft zone. If + * MFT_ZONE, we have really run out of space. + */ + mft_zone_size = vol->mft_zone_end - vol->mft_zone_start; + ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end " + "0x%llx, mft_zone_size 0x%llx.", + (unsigned long long)vol->mft_zone_start, + (unsigned long long)vol->mft_zone_end, + (unsigned long long)mft_zone_size); + if (zone == MFT_ZONE || mft_zone_size <= 0) { + ntfs_debug("No free clusters left, going to out."); + /* Really no more space left on device. */ + err = -ENOSPC; + goto out; + } /* zone == DATA_ZONE && mft_zone_size > 0 */ + ntfs_debug("Shrinking mft zone."); + zone_end = vol->mft_zone_end; + mft_zone_size >>= 1; + if (mft_zone_size > 0) + vol->mft_zone_end = vol->mft_zone_start + mft_zone_size; + else /* mft zone and data2 zone no longer exist. */ + vol->data2_zone_pos = vol->mft_zone_start = + vol->mft_zone_end = 0; + if (vol->mft_zone_pos >= vol->mft_zone_end) { + vol->mft_zone_pos = vol->mft_lcn; + if (!vol->mft_zone_end) + vol->mft_zone_pos = 0; + } + bmp_pos = zone_start = bmp_initial_pos = + vol->data1_zone_pos = vol->mft_zone_end; + search_zone = 2; + pass = 2; + done_zones &= ~2; + ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, " + "vol->mft_zone_start 0x%llx, " + "vol->mft_zone_end 0x%llx, " + "vol->mft_zone_pos 0x%llx, search_zone 2, " + "pass 2, dones_zones 0x%x, zone_start 0x%llx, " + "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, " + "continuing outer while loop.", + (unsigned long long)mft_zone_size, + (unsigned long long)vol->mft_zone_start, + (unsigned long long)vol->mft_zone_end, + (unsigned long long)vol->mft_zone_pos, + done_zones, (unsigned long long)zone_start, + (unsigned long long)zone_end, + (unsigned long long)vol->data1_zone_pos); + } + ntfs_debug("After outer while loop."); +out: + ntfs_debug("At out."); + /* Add runlist terminator element. */ + if (likely(rl)) { + rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length; + rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED; + rl[rlpos].length = 0; + } + if (likely(page && !IS_ERR(page))) { + if (need_writeback) { + ntfs_debug("Marking page dirty."); + flush_dcache_page(page); + set_page_dirty(page); + need_writeback = 0; + } + ntfs_unmap_page(page); + } + if (likely(!err)) { + up_write(&vol->lcnbmp_lock); + ntfs_debug("Done."); + return rl; + } + ntfs_error(vol->sb, "Failed to allocate clusters, aborting " + "(error %i).", err); + if (rl) { + int err2; + + if (err == -ENOSPC) + ntfs_debug("Not enough space to complete allocation, " + "err -ENOSPC, first free lcn 0x%llx, " + "could allocate up to 0x%llx " + "clusters.", + (unsigned long long)rl[0].lcn, + (unsigned long long)(count - clusters)); + /* Deallocate all allocated clusters. */ + ntfs_debug("Attempting rollback..."); + err2 = ntfs_cluster_free_from_rl_nolock(vol, rl); + if (err2) { + ntfs_error(vol->sb, "Failed to rollback (error %i). " + "Leaving inconsistent metadata! " + "Unmount and run chkdsk.", err2); + NVolSetErrors(vol); + } + /* Free the runlist. */ + ntfs_free(rl); + } else if (err == -ENOSPC) + ntfs_debug("No space left at all, err = -ENOSPC, first free " + "lcn = 0x%llx.", + (long long)vol->data1_zone_pos); + up_write(&vol->lcnbmp_lock); + return ERR_PTR(err); +} + +/** + * __ntfs_cluster_free - free clusters on an ntfs volume + * @ni: ntfs inode whose runlist describes the clusters to free + * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters + * @count: number of clusters to free or -1 for all clusters + * @ctx: active attribute search context if present or NULL if not + * @is_rollback: true if this is a rollback operation + * + * Free @count clusters starting at the cluster @start_vcn in the runlist + * described by the vfs inode @ni. + * + * If @count is -1, all clusters from @start_vcn to the end of the runlist are + * deallocated. Thus, to completely free all clusters in a runlist, use + * @start_vcn = 0 and @count = -1. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when __ntfs_cluster_free() encounters unmapped + * runlist fragments and allows their mapping. If you do not have the mft + * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will + * perform the necessary mapping and unmapping. + * + * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it + * before returning. Thus, @ctx will be left pointing to the same attribute on + * return as on entry. However, the actual pointers in @ctx may point to + * different memory locations on return, so you must remember to reset any + * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(), + * you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * @is_rollback should always be 'false', it is for internal use to rollback + * errors. You probably want to use ntfs_cluster_free() instead. + * + * Note, __ntfs_cluster_free() does not modify the runlist, so you have to + * remove from the runlist or mark sparse the freed runs later. + * + * Return the number of deallocated clusters (not counting sparse ones) on + * success and -errno on error. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count, + ntfs_attr_search_ctx *ctx, const bool is_rollback) +{ + s64 delta, to_free, total_freed, real_freed; + ntfs_volume *vol; + struct inode *lcnbmp_vi; + runlist_element *rl; + int err; + + BUG_ON(!ni); + ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count " + "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn, + (unsigned long long)count, + is_rollback ? " (rollback)" : ""); + vol = ni->vol; + lcnbmp_vi = vol->lcnbmp_ino; + BUG_ON(!lcnbmp_vi); + BUG_ON(start_vcn < 0); + BUG_ON(count < -1); + /* + * Lock the lcn bitmap for writing but only if not rolling back. We + * must hold the lock all the way including through rollback otherwise + * rollback is not possible because once we have cleared a bit and + * dropped the lock, anyone could have set the bit again, thus + * allocating the cluster for another use. + */ + if (likely(!is_rollback)) + down_write(&vol->lcnbmp_lock); + + total_freed = real_freed = 0; + + rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx); + if (IS_ERR(rl)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to find first runlist " + "element (error %li), aborting.", + PTR_ERR(rl)); + err = PTR_ERR(rl); + goto err_out; + } + if (unlikely(rl->lcn < LCN_HOLE)) { + if (!is_rollback) + ntfs_error(vol->sb, "First runlist element has " + "invalid lcn, aborting."); + err = -EIO; + goto err_out; + } + /* Find the starting cluster inside the run that needs freeing. */ + delta = start_vcn - rl->vcn; + + /* The number of clusters in this run that need freeing. */ + to_free = rl->length - delta; + if (count >= 0 && to_free > count) + to_free = count; + + if (likely(rl->lcn >= 0)) { + /* Do the actual freeing of the clusters in this run. */ + err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta, + to_free, likely(!is_rollback) ? 0 : 1); + if (unlikely(err)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to clear first run " + "(error %i), aborting.", err); + goto err_out; + } + /* We have freed @to_free real clusters. */ + real_freed = to_free; + }; + /* Go to the next run and adjust the number of clusters left to free. */ + ++rl; + if (count >= 0) + count -= to_free; + + /* Keep track of the total "freed" clusters, including sparse ones. */ + total_freed = to_free; + /* + * Loop over the remaining runs, using @count as a capping value, and + * free them. + */ + for (; rl->length && count != 0; ++rl) { + if (unlikely(rl->lcn < LCN_HOLE)) { + VCN vcn; + + /* Attempt to map runlist. */ + vcn = rl->vcn; + rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx); + if (IS_ERR(rl)) { + err = PTR_ERR(rl); + if (!is_rollback) + ntfs_error(vol->sb, "Failed to map " + "runlist fragment or " + "failed to find " + "subsequent runlist " + "element."); + goto err_out; + } + if (unlikely(rl->lcn < LCN_HOLE)) { + if (!is_rollback) + ntfs_error(vol->sb, "Runlist element " + "has invalid lcn " + "(0x%llx).", + (unsigned long long) + rl->lcn); + err = -EIO; + goto err_out; + } + } + /* The number of clusters in this run that need freeing. */ + to_free = rl->length; + if (count >= 0 && to_free > count) + to_free = count; + + if (likely(rl->lcn >= 0)) { + /* Do the actual freeing of the clusters in the run. */ + err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn, + to_free, likely(!is_rollback) ? 0 : 1); + if (unlikely(err)) { + if (!is_rollback) + ntfs_error(vol->sb, "Failed to clear " + "subsequent run."); + goto err_out; + } + /* We have freed @to_free real clusters. */ + real_freed += to_free; + } + /* Adjust the number of clusters left to free. */ + if (count >= 0) + count -= to_free; + + /* Update the total done clusters. */ + total_freed += to_free; + } + if (likely(!is_rollback)) + up_write(&vol->lcnbmp_lock); + + BUG_ON(count > 0); + + /* We are done. Return the number of actually freed clusters. */ + ntfs_debug("Done."); + return real_freed; +err_out: + if (is_rollback) + return err; + /* If no real clusters were freed, no need to rollback. */ + if (!real_freed) { + up_write(&vol->lcnbmp_lock); + return err; + } + /* + * Attempt to rollback and if that succeeds just return the error code. + * If rollback fails, set the volume errors flag, emit an error + * message, and return the error code. + */ + delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true); + if (delta < 0) { + ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving " + "inconsistent metadata! Unmount and run " + "chkdsk.", (int)delta); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + ntfs_error(vol->sb, "Aborting (error %i).", err); + return err; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h new file mode 100644 index 000000000..1589a6d84 --- /dev/null +++ b/fs/ntfs/lcnalloc.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2004-2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_LCNALLOC_H +#define _LINUX_NTFS_LCNALLOC_H + +#ifdef NTFS_RW + +#include <linux/fs.h> + +#include "attrib.h" +#include "types.h" +#include "inode.h" +#include "runlist.h" +#include "volume.h" + +typedef enum { + FIRST_ZONE = 0, /* For sanity checking. */ + MFT_ZONE = 0, /* Allocate from $MFT zone. */ + DATA_ZONE = 1, /* Allocate from $DATA zone. */ + LAST_ZONE = 1, /* For sanity checking. */ +} NTFS_CLUSTER_ALLOCATION_ZONES; + +extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, + const VCN start_vcn, const s64 count, const LCN start_lcn, + const NTFS_CLUSTER_ALLOCATION_ZONES zone, + const bool is_extension); + +extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, + s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback); + +/** + * ntfs_cluster_free - free clusters on an ntfs volume + * @ni: ntfs inode whose runlist describes the clusters to free + * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters + * @count: number of clusters to free or -1 for all clusters + * @ctx: active attribute search context if present or NULL if not + * + * Free @count clusters starting at the cluster @start_vcn in the runlist + * described by the ntfs inode @ni. + * + * If @count is -1, all clusters from @start_vcn to the end of the runlist are + * deallocated. Thus, to completely free all clusters in a runlist, use + * @start_vcn = 0 and @count = -1. + * + * If @ctx is specified, it is an active search context of @ni and its base mft + * record. This is needed when ntfs_cluster_free() encounters unmapped runlist + * fragments and allows their mapping. If you do not have the mft record + * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform + * the necessary mapping and unmapping. + * + * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it + * before returning. Thus, @ctx will be left pointing to the same attribute on + * return as on entry. However, the actual pointers in @ctx may point to + * different memory locations on return, so you must remember to reset any + * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(), + * you will probably want to do: + * m = ctx->mrec; + * a = ctx->attr; + * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that + * you cache ctx->mrec in a variable @m of type MFT_RECORD *. + * + * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove + * from the runlist or mark sparse the freed runs later. + * + * Return the number of deallocated clusters (not counting sparse ones) on + * success and -errno on error. + * + * WARNING: If @ctx is supplied, regardless of whether success or failure is + * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx + * is no longer valid, i.e. you need to either call + * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it. + * In that case PTR_ERR(@ctx->mrec) will give you the error code for + * why the mapping of the old inode failed. + * + * Locking: - The runlist described by @ni must be locked for writing on entry + * and is locked on return. Note the runlist may be modified when + * needed runlist fragments need to be mapped. + * - The volume lcn bitmap must be unlocked on entry and is unlocked + * on return. + * - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - If @ctx is NULL, the base mft record of @ni must not be mapped on + * entry and it will be left unmapped on return. + * - If @ctx is not NULL, the base mft record must be mapped on entry + * and it will be left mapped on return. + */ +static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, + s64 count, ntfs_attr_search_ctx *ctx) +{ + return __ntfs_cluster_free(ni, start_vcn, count, ctx, false); +} + +extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol, + const runlist_element *rl); + +/** + * ntfs_cluster_free_from_rl - free clusters from runlist + * @vol: mounted ntfs volume on which to free the clusters + * @rl: runlist describing the clusters to free + * + * Free all the clusters described by the runlist @rl on the volume @vol. In + * the case of an error being returned, at least some of the clusters were not + * freed. + * + * Return 0 on success and -errno on error. + * + * Locking: - This function takes the volume lcn bitmap lock for writing and + * modifies the bitmap contents. + * - The caller must have locked the runlist @rl for reading or + * writing. + */ +static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol, + const runlist_element *rl) +{ + int ret; + + down_write(&vol->lcnbmp_lock); + ret = ntfs_cluster_free_from_rl_nolock(vol, rl); + up_write(&vol->lcnbmp_lock); + return ret; +} + +#endif /* NTFS_RW */ + +#endif /* defined _LINUX_NTFS_LCNALLOC_H */ diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c new file mode 100644 index 000000000..6ce60ffc6 --- /dev/null +++ b/fs/ntfs/logfile.c @@ -0,0 +1,849 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2002-2007 Anton Altaparmakov + */ + +#ifdef NTFS_RW + +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/buffer_head.h> +#include <linux/bitops.h> +#include <linux/log2.h> +#include <linux/bio.h> + +#include "attrib.h" +#include "aops.h" +#include "debug.h" +#include "logfile.h" +#include "malloc.h" +#include "volume.h" +#include "ntfs.h" + +/** + * ntfs_check_restart_page_header - check the page header for consistency + * @vi: $LogFile inode to which the restart page header belongs + * @rp: restart page header to check + * @pos: position in @vi at which the restart page header resides + * + * Check the restart page header @rp for consistency and return 'true' if it is + * consistent and 'false' otherwise. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + */ +static bool ntfs_check_restart_page_header(struct inode *vi, + RESTART_PAGE_HEADER *rp, s64 pos) +{ + u32 logfile_system_page_size, logfile_log_page_size; + u16 ra_ofs, usa_count, usa_ofs, usa_end = 0; + bool have_usa = true; + + ntfs_debug("Entering."); + /* + * If the system or log page sizes are smaller than the ntfs block size + * or either is not a power of 2 we cannot handle this log file. + */ + logfile_system_page_size = le32_to_cpu(rp->system_page_size); + logfile_log_page_size = le32_to_cpu(rp->log_page_size); + if (logfile_system_page_size < NTFS_BLOCK_SIZE || + logfile_log_page_size < NTFS_BLOCK_SIZE || + logfile_system_page_size & + (logfile_system_page_size - 1) || + !is_power_of_2(logfile_log_page_size)) { + ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); + return false; + } + /* + * We must be either at !pos (1st restart page) or at pos = system page + * size (2nd restart page). + */ + if (pos && pos != logfile_system_page_size) { + ntfs_error(vi->i_sb, "Found restart area in incorrect " + "position in $LogFile."); + return false; + } + /* We only know how to handle version 1.1. */ + if (sle16_to_cpu(rp->major_ver) != 1 || + sle16_to_cpu(rp->minor_ver) != 1) { + ntfs_error(vi->i_sb, "$LogFile version %i.%i is not " + "supported. (This driver supports version " + "1.1 only.)", (int)sle16_to_cpu(rp->major_ver), + (int)sle16_to_cpu(rp->minor_ver)); + return false; + } + /* + * If chkdsk has been run the restart page may not be protected by an + * update sequence array. + */ + if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) { + have_usa = false; + goto skip_usa_checks; + } + /* Verify the size of the update sequence array. */ + usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS); + if (usa_count != le16_to_cpu(rp->usa_count)) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent update sequence array count."); + return false; + } + /* Verify the position of the update sequence array. */ + usa_ofs = le16_to_cpu(rp->usa_ofs); + usa_end = usa_ofs + usa_count * sizeof(u16); + if (usa_ofs < sizeof(RESTART_PAGE_HEADER) || + usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent update sequence array offset."); + return false; + } +skip_usa_checks: + /* + * Verify the position of the restart area. It must be: + * - aligned to 8-byte boundary, + * - after the update sequence array, and + * - within the system page size. + */ + ra_ofs = le16_to_cpu(rp->restart_area_offset); + if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end : + ra_ofs < sizeof(RESTART_PAGE_HEADER)) || + ra_ofs > logfile_system_page_size) { + ntfs_error(vi->i_sb, "$LogFile restart page specifies " + "inconsistent restart area offset."); + return false; + } + /* + * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn + * set. + */ + if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) { + ntfs_error(vi->i_sb, "$LogFile restart page is not modified " + "by chkdsk but a chkdsk LSN is specified."); + return false; + } + ntfs_debug("Done."); + return true; +} + +/** + * ntfs_check_restart_area - check the restart area for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page whose restart area to check + * + * Check the restart area of the restart page @rp for consistency and return + * 'true' if it is consistent and 'false' otherwise. + * + * This function assumes that the restart page header has already been + * consistency checked. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + */ +static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp) +{ + u64 file_size; + RESTART_AREA *ra; + u16 ra_ofs, ra_len, ca_ofs; + u8 fs_bits; + + ntfs_debug("Entering."); + ra_ofs = le16_to_cpu(rp->restart_area_offset); + ra = (RESTART_AREA*)((u8*)rp + ra_ofs); + /* + * Everything before ra->file_size must be before the first word + * protected by an update sequence number. This ensures that it is + * safe to access ra->client_array_offset. + */ + if (ra_ofs + offsetof(RESTART_AREA, file_size) > + NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent file offset."); + return false; + } + /* + * Now that we can access ra->client_array_offset, make sure everything + * up to the log client array is before the first word protected by an + * update sequence number. This ensures we can access all of the + * restart area elements safely. Also, the client array offset must be + * aligned to an 8-byte boundary. + */ + ca_ofs = le16_to_cpu(ra->client_array_offset); + if (((ca_ofs + 7) & ~7) != ca_ofs || + ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent client array offset."); + return false; + } + /* + * The restart area must end within the system page size both when + * calculated manually and as specified by ra->restart_area_length. + * Also, the calculated length must not exceed the specified length. + */ + ra_len = ca_ofs + le16_to_cpu(ra->log_clients) * + sizeof(LOG_CLIENT_RECORD); + if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) || + ra_ofs + le16_to_cpu(ra->restart_area_length) > + le32_to_cpu(rp->system_page_size) || + ra_len > le16_to_cpu(ra->restart_area_length)) { + ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds " + "of the system page size specified by the " + "restart page header and/or the specified " + "restart area length is inconsistent."); + return false; + } + /* + * The ra->client_free_list and ra->client_in_use_list must be either + * LOGFILE_NO_CLIENT or less than ra->log_clients or they are + * overflowing the client array. + */ + if ((ra->client_free_list != LOGFILE_NO_CLIENT && + le16_to_cpu(ra->client_free_list) >= + le16_to_cpu(ra->log_clients)) || + (ra->client_in_use_list != LOGFILE_NO_CLIENT && + le16_to_cpu(ra->client_in_use_list) >= + le16_to_cpu(ra->log_clients))) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "overflowing client free and/or in use lists."); + return false; + } + /* + * Check ra->seq_number_bits against ra->file_size for consistency. + * We cannot just use ffs() because the file size is not a power of 2. + */ + file_size = (u64)sle64_to_cpu(ra->file_size); + fs_bits = 0; + while (file_size) { + file_size >>= 1; + fs_bits++; + } + if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent sequence number bits."); + return false; + } + /* The log record header length must be a multiple of 8. */ + if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) != + le16_to_cpu(ra->log_record_header_length)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent log record header length."); + return false; + } + /* Dito for the log page data offset. */ + if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) != + le16_to_cpu(ra->log_page_data_offset)) { + ntfs_error(vi->i_sb, "$LogFile restart area specifies " + "inconsistent log page data offset."); + return false; + } + ntfs_debug("Done."); + return true; +} + +/** + * ntfs_check_log_client_array - check the log client array for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page whose log client array to check + * + * Check the log client array of the restart page @rp for consistency and + * return 'true' if it is consistent and 'false' otherwise. + * + * This function assumes that the restart page header and the restart area have + * already been consistency checked. + * + * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this + * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full + * restart page and the page must be multi sector transfer deprotected. + */ +static bool ntfs_check_log_client_array(struct inode *vi, + RESTART_PAGE_HEADER *rp) +{ + RESTART_AREA *ra; + LOG_CLIENT_RECORD *ca, *cr; + u16 nr_clients, idx; + bool in_free_list, idx_is_first; + + ntfs_debug("Entering."); + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + ca = (LOG_CLIENT_RECORD*)((u8*)ra + + le16_to_cpu(ra->client_array_offset)); + /* + * Check the ra->client_free_list first and then check the + * ra->client_in_use_list. Check each of the log client records in + * each of the lists and check that the array does not overflow the + * ra->log_clients value. Also keep track of the number of records + * visited as there cannot be more than ra->log_clients records and + * that way we detect eventual loops in within a list. + */ + nr_clients = le16_to_cpu(ra->log_clients); + idx = le16_to_cpu(ra->client_free_list); + in_free_list = true; +check_list: + for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--, + idx = le16_to_cpu(cr->next_client)) { + if (!nr_clients || idx >= le16_to_cpu(ra->log_clients)) + goto err_out; + /* Set @cr to the current log client record. */ + cr = ca + idx; + /* The first log client record must not have a prev_client. */ + if (idx_is_first) { + if (cr->prev_client != LOGFILE_NO_CLIENT) + goto err_out; + idx_is_first = false; + } + } + /* Switch to and check the in use list if we just did the free list. */ + if (in_free_list) { + in_free_list = false; + idx = le16_to_cpu(ra->client_in_use_list); + goto check_list; + } + ntfs_debug("Done."); + return true; +err_out: + ntfs_error(vi->i_sb, "$LogFile log client array is corrupt."); + return false; +} + +/** + * ntfs_check_and_load_restart_page - check the restart page for consistency + * @vi: $LogFile inode to which the restart page belongs + * @rp: restart page to check + * @pos: position in @vi at which the restart page resides + * @wrp: [OUT] copy of the multi sector transfer deprotected restart page + * @lsn: [OUT] set to the current logfile lsn on success + * + * Check the restart page @rp for consistency and return 0 if it is consistent + * and -errno otherwise. The restart page may have been modified by chkdsk in + * which case its magic is CHKD instead of RSTR. + * + * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not + * require the full restart page. + * + * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a + * copy of the complete multi sector transfer deprotected page. On failure, + * *@wrp is undefined. + * + * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current + * logfile lsn according to this restart page. On failure, *@lsn is undefined. + * + * The following error codes are defined: + * -EINVAL - The restart page is inconsistent. + * -ENOMEM - Not enough memory to load the restart page. + * -EIO - Failed to reading from $LogFile. + */ +static int ntfs_check_and_load_restart_page(struct inode *vi, + RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp, + LSN *lsn) +{ + RESTART_AREA *ra; + RESTART_PAGE_HEADER *trp; + int size, err; + + ntfs_debug("Entering."); + /* Check the restart page header for consistency. */ + if (!ntfs_check_restart_page_header(vi, rp, pos)) { + /* Error output already done inside the function. */ + return -EINVAL; + } + /* Check the restart area for consistency. */ + if (!ntfs_check_restart_area(vi, rp)) { + /* Error output already done inside the function. */ + return -EINVAL; + } + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + /* + * Allocate a buffer to store the whole restart page so we can multi + * sector transfer deprotect it. + */ + trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size)); + if (!trp) { + ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile " + "restart page buffer."); + return -ENOMEM; + } + /* + * Read the whole of the restart page into the buffer. If it fits + * completely inside @rp, just copy it from there. Otherwise map all + * the required pages and copy the data from them. + */ + size = PAGE_SIZE - (pos & ~PAGE_MASK); + if (size >= le32_to_cpu(rp->system_page_size)) { + memcpy(trp, rp, le32_to_cpu(rp->system_page_size)); + } else { + pgoff_t idx; + struct page *page; + int have_read, to_read; + + /* First copy what we already have in @rp. */ + memcpy(trp, rp, size); + /* Copy the remaining data one page at a time. */ + have_read = size; + to_read = le32_to_cpu(rp->system_page_size) - size; + idx = (pos + size) >> PAGE_SHIFT; + BUG_ON((pos + size) & ~PAGE_MASK); + do { + page = ntfs_map_page(vi->i_mapping, idx); + if (IS_ERR(page)) { + ntfs_error(vi->i_sb, "Error mapping $LogFile " + "page (index %lu).", idx); + err = PTR_ERR(page); + if (err != -EIO && err != -ENOMEM) + err = -EIO; + goto err_out; + } + size = min_t(int, to_read, PAGE_SIZE); + memcpy((u8*)trp + have_read, page_address(page), size); + ntfs_unmap_page(page); + have_read += size; + to_read -= size; + idx++; + } while (to_read > 0); + } + /* + * Perform the multi sector transfer deprotection on the buffer if the + * restart page is protected. + */ + if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count)) + && post_read_mst_fixup((NTFS_RECORD*)trp, + le32_to_cpu(rp->system_page_size))) { + /* + * A multi sector tranfer error was detected. We only need to + * abort if the restart page contents exceed the multi sector + * transfer fixup of the first sector. + */ + if (le16_to_cpu(rp->restart_area_offset) + + le16_to_cpu(ra->restart_area_length) > + NTFS_BLOCK_SIZE - sizeof(u16)) { + ntfs_error(vi->i_sb, "Multi sector transfer error " + "detected in $LogFile restart page."); + err = -EINVAL; + goto err_out; + } + } + /* + * If the restart page is modified by chkdsk or there are no active + * logfile clients, the logfile is consistent. Otherwise, need to + * check the log client records for consistency, too. + */ + err = 0; + if (ntfs_is_rstr_record(rp->magic) && + ra->client_in_use_list != LOGFILE_NO_CLIENT) { + if (!ntfs_check_log_client_array(vi, trp)) { + err = -EINVAL; + goto err_out; + } + } + if (lsn) { + if (ntfs_is_rstr_record(rp->magic)) + *lsn = sle64_to_cpu(ra->current_lsn); + else /* if (ntfs_is_chkd_record(rp->magic)) */ + *lsn = sle64_to_cpu(rp->chkdsk_lsn); + } + ntfs_debug("Done."); + if (wrp) + *wrp = trp; + else { +err_out: + ntfs_free(trp); + } + return err; +} + +/** + * ntfs_check_logfile - check the journal for consistency + * @log_vi: struct inode of loaded journal $LogFile to check + * @rp: [OUT] on success this is a copy of the current restart page + * + * Check the $LogFile journal for consistency and return 'true' if it is + * consistent and 'false' if not. On success, the current restart page is + * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it. + * + * At present we only check the two restart pages and ignore the log record + * pages. + * + * Note that the MstProtected flag is not set on the $LogFile inode and hence + * when reading pages they are not deprotected. This is because we do not know + * if the $LogFile was created on a system with a different page size to ours + * yet and mst deprotection would fail if our page size is smaller. + */ +bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) +{ + s64 size, pos; + LSN rstr1_lsn, rstr2_lsn; + ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + struct address_space *mapping = log_vi->i_mapping; + struct page *page = NULL; + u8 *kaddr = NULL; + RESTART_PAGE_HEADER *rstr1_ph = NULL; + RESTART_PAGE_HEADER *rstr2_ph = NULL; + int log_page_size, err; + bool logfile_is_empty = true; + u8 log_page_bits; + + ntfs_debug("Entering."); + /* An empty $LogFile must have been clean before it got emptied. */ + if (NVolLogFileEmpty(vol)) + goto is_empty; + size = i_size_read(log_vi); + /* Make sure the file doesn't exceed the maximum allowed size. */ + if (size > MaxLogFileSize) + size = MaxLogFileSize; + /* + * Truncate size to a multiple of the page cache size or the default + * log page size if the page cache size is between the default log page + * log page size if the page cache size is between the default log page + * size and twice that. + */ + if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= + DefaultLogPageSize * 2) + log_page_size = DefaultLogPageSize; + else + log_page_size = PAGE_SIZE; + /* + * Use ntfs_ffs() instead of ffs() to enable the compiler to + * optimize log_page_size and log_page_bits into constants. + */ + log_page_bits = ntfs_ffs(log_page_size) - 1; + size &= ~(s64)(log_page_size - 1); + /* + * Ensure the log file is big enough to store at least the two restart + * pages and the minimum number of log record pages. + */ + if (size < log_page_size * 2 || (size - log_page_size * 2) >> + log_page_bits < MinLogRecordPages) { + ntfs_error(vol->sb, "$LogFile is too small."); + return false; + } + /* + * Read through the file looking for a restart page. Since the restart + * page header is at the beginning of a page we only need to search at + * what could be the beginning of a page (for each page size) rather + * than scanning the whole file byte by byte. If all potential places + * contain empty and uninitialzed records, the log file can be assumed + * to be empty. + */ + for (pos = 0; pos < size; pos <<= 1) { + pgoff_t idx = pos >> PAGE_SHIFT; + if (!page || page->index != idx) { + if (page) + ntfs_unmap_page(page); + page = ntfs_map_page(mapping, idx); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Error mapping $LogFile " + "page (index %lu).", idx); + goto err_out; + } + } + kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK); + /* + * A non-empty block means the logfile is not empty while an + * empty block after a non-empty block has been encountered + * means we are done. + */ + if (!ntfs_is_empty_recordp((le32*)kaddr)) + logfile_is_empty = false; + else if (!logfile_is_empty) + break; + /* + * A log record page means there cannot be a restart page after + * this so no need to continue searching. + */ + if (ntfs_is_rcrd_recordp((le32*)kaddr)) + break; + /* If not a (modified by chkdsk) restart page, continue. */ + if (!ntfs_is_rstr_recordp((le32*)kaddr) && + !ntfs_is_chkd_recordp((le32*)kaddr)) { + if (!pos) + pos = NTFS_BLOCK_SIZE >> 1; + continue; + } + /* + * Check the (modified by chkdsk) restart page for consistency + * and get a copy of the complete multi sector transfer + * deprotected restart page. + */ + err = ntfs_check_and_load_restart_page(log_vi, + (RESTART_PAGE_HEADER*)kaddr, pos, + !rstr1_ph ? &rstr1_ph : &rstr2_ph, + !rstr1_ph ? &rstr1_lsn : &rstr2_lsn); + if (!err) { + /* + * If we have now found the first (modified by chkdsk) + * restart page, continue looking for the second one. + */ + if (!pos) { + pos = NTFS_BLOCK_SIZE >> 1; + continue; + } + /* + * We have now found the second (modified by chkdsk) + * restart page, so we can stop looking. + */ + break; + } + /* + * Error output already done inside the function. Note, we do + * not abort if the restart page was invalid as we might still + * find a valid one further in the file. + */ + if (err != -EINVAL) { + ntfs_unmap_page(page); + goto err_out; + } + /* Continue looking. */ + if (!pos) + pos = NTFS_BLOCK_SIZE >> 1; + } + if (page) + ntfs_unmap_page(page); + if (logfile_is_empty) { + NVolSetLogFileEmpty(vol); +is_empty: + ntfs_debug("Done. ($LogFile is empty.)"); + return true; + } + if (!rstr1_ph) { + BUG_ON(rstr2_ph); + ntfs_error(vol->sb, "Did not find any restart pages in " + "$LogFile and it was not empty."); + return false; + } + /* If both restart pages were found, use the more recent one. */ + if (rstr2_ph) { + /* + * If the second restart area is more recent, switch to it. + * Otherwise just throw it away. + */ + if (rstr2_lsn > rstr1_lsn) { + ntfs_debug("Using second restart page as it is more " + "recent."); + ntfs_free(rstr1_ph); + rstr1_ph = rstr2_ph; + /* rstr1_lsn = rstr2_lsn; */ + } else { + ntfs_debug("Using first restart page as it is more " + "recent."); + ntfs_free(rstr2_ph); + } + rstr2_ph = NULL; + } + /* All consistency checks passed. */ + if (rp) + *rp = rstr1_ph; + else + ntfs_free(rstr1_ph); + ntfs_debug("Done."); + return true; +err_out: + if (rstr1_ph) + ntfs_free(rstr1_ph); + return false; +} + +/** + * ntfs_is_logfile_clean - check in the journal if the volume is clean + * @log_vi: struct inode of loaded journal $LogFile to check + * @rp: copy of the current restart page + * + * Analyze the $LogFile journal and return 'true' if it indicates the volume was + * shutdown cleanly and 'false' if not. + * + * At present we only look at the two restart pages and ignore the log record + * pages. This is a little bit crude in that there will be a very small number + * of cases where we think that a volume is dirty when in fact it is clean. + * This should only affect volumes that have not been shutdown cleanly but did + * not have any pending, non-check-pointed i/o, i.e. they were completely idle + * at least for the five seconds preceding the unclean shutdown. + * + * This function assumes that the $LogFile journal has already been consistency + * checked by a call to ntfs_check_logfile() and in particular if the $LogFile + * is empty this function requires that NVolLogFileEmpty() is true otherwise an + * empty volume will be reported as dirty. + */ +bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp) +{ + ntfs_volume *vol = NTFS_SB(log_vi->i_sb); + RESTART_AREA *ra; + + ntfs_debug("Entering."); + /* An empty $LogFile must have been clean before it got emptied. */ + if (NVolLogFileEmpty(vol)) { + ntfs_debug("Done. ($LogFile is empty.)"); + return true; + } + BUG_ON(!rp); + if (!ntfs_is_rstr_record(rp->magic) && + !ntfs_is_chkd_record(rp->magic)) { + ntfs_error(vol->sb, "Restart page buffer is invalid. This is " + "probably a bug in that the $LogFile should " + "have been consistency checked before calling " + "this function."); + return false; + } + ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset)); + /* + * If the $LogFile has active clients, i.e. it is open, and we do not + * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags, + * we assume there was an unclean shutdown. + */ + if (ra->client_in_use_list != LOGFILE_NO_CLIENT && + !(ra->flags & RESTART_VOLUME_IS_CLEAN)) { + ntfs_debug("Done. $LogFile indicates a dirty shutdown."); + return false; + } + /* $LogFile indicates a clean shutdown. */ + ntfs_debug("Done. $LogFile indicates a clean shutdown."); + return true; +} + +/** + * ntfs_empty_logfile - empty the contents of the $LogFile journal + * @log_vi: struct inode of loaded journal $LogFile to empty + * + * Empty the contents of the $LogFile journal @log_vi and return 'true' on + * success and 'false' on error. + * + * This function assumes that the $LogFile journal has already been consistency + * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean() + * has been used to ensure that the $LogFile is clean. + */ +bool ntfs_empty_logfile(struct inode *log_vi) +{ + VCN vcn, end_vcn; + ntfs_inode *log_ni = NTFS_I(log_vi); + ntfs_volume *vol = log_ni->vol; + struct super_block *sb = vol->sb; + runlist_element *rl; + unsigned long flags; + unsigned block_size, block_size_bits; + int err; + bool should_wait = true; + + ntfs_debug("Entering."); + if (NVolLogFileEmpty(vol)) { + ntfs_debug("Done."); + return true; + } + /* + * We cannot use ntfs_attr_set() because we may be still in the middle + * of a mount operation. Thus we do the emptying by hand by first + * zapping the page cache pages for the $LogFile/$DATA attribute and + * then emptying each of the buffers in each of the clusters specified + * by the runlist by hand. + */ + block_size = sb->s_blocksize; + block_size_bits = sb->s_blocksize_bits; + vcn = 0; + read_lock_irqsave(&log_ni->size_lock, flags); + end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >> + vol->cluster_size_bits; + read_unlock_irqrestore(&log_ni->size_lock, flags); + truncate_inode_pages(log_vi->i_mapping, 0); + down_write(&log_ni->runlist.lock); + rl = log_ni->runlist.rl; + if (unlikely(!rl || vcn < rl->vcn || !rl->length)) { +map_vcn: + err = ntfs_map_runlist_nolock(log_ni, vcn, NULL); + if (err) { + ntfs_error(sb, "Failed to map runlist fragment (error " + "%d).", -err); + goto err; + } + rl = log_ni->runlist.rl; + BUG_ON(!rl || vcn < rl->vcn || !rl->length); + } + /* Seek to the runlist element containing @vcn. */ + while (rl->length && vcn >= rl[1].vcn) + rl++; + do { + LCN lcn; + sector_t block, end_block; + s64 len; + + /* + * If this run is not mapped map it now and start again as the + * runlist will have been updated. + */ + lcn = rl->lcn; + if (unlikely(lcn == LCN_RL_NOT_MAPPED)) { + vcn = rl->vcn; + goto map_vcn; + } + /* If this run is not valid abort with an error. */ + if (unlikely(!rl->length || lcn < LCN_HOLE)) + goto rl_err; + /* Skip holes. */ + if (lcn == LCN_HOLE) + continue; + block = lcn << vol->cluster_size_bits >> block_size_bits; + len = rl->length; + if (rl[1].vcn > end_vcn) + len = end_vcn - rl->vcn; + end_block = (lcn + len) << vol->cluster_size_bits >> + block_size_bits; + /* Iterate over the blocks in the run and empty them. */ + do { + struct buffer_head *bh; + + /* Obtain the buffer, possibly not uptodate. */ + bh = sb_getblk(sb, block); + BUG_ON(!bh); + /* Setup buffer i/o submission. */ + lock_buffer(bh); + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + /* Set the entire contents of the buffer to 0xff. */ + memset(bh->b_data, -1, block_size); + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + if (buffer_dirty(bh)) + clear_buffer_dirty(bh); + /* + * Submit the buffer and wait for i/o to complete but + * only for the first buffer so we do not miss really + * serious i/o errors. Once the first buffer has + * completed ignore errors afterwards as we can assume + * that if one buffer worked all of them will work. + */ + submit_bh(REQ_OP_WRITE, bh); + if (should_wait) { + should_wait = false; + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) + goto io_err; + } + brelse(bh); + } while (++block < end_block); + } while ((++rl)->vcn < end_vcn); + up_write(&log_ni->runlist.lock); + /* + * Zap the pages again just in case any got instantiated whilst we were + * emptying the blocks by hand. FIXME: We may not have completed + * writing to all the buffer heads yet so this may happen too early. + * We really should use a kernel thread to do the emptying + * asynchronously and then we can also set the volume dirty and output + * an error message if emptying should fail. + */ + truncate_inode_pages(log_vi->i_mapping, 0); + /* Set the flag so we do not have to do it again on remount. */ + NVolSetLogFileEmpty(vol); + ntfs_debug("Done."); + return true; +io_err: + ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk."); + goto dirty_err; +rl_err: + ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk."); +dirty_err: + NVolSetErrors(vol); + err = -EIO; +err: + up_write(&log_ni->runlist.lock); + ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).", + -err); + return false; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h new file mode 100644 index 000000000..429d4909c --- /dev/null +++ b/fs/ntfs/logfile.h @@ -0,0 +1,295 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of + * the Linux-NTFS project. + * + * Copyright (c) 2000-2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_LOGFILE_H +#define _LINUX_NTFS_LOGFILE_H + +#ifdef NTFS_RW + +#include <linux/fs.h> + +#include "types.h" +#include "endian.h" +#include "layout.h" + +/* + * Journal ($LogFile) organization: + * + * Two restart areas present in the first two pages (restart pages, one restart + * area in each page). When the volume is dismounted they should be identical, + * except for the update sequence array which usually has a different update + * sequence number. + * + * These are followed by log records organized in pages headed by a log record + * header going up to log file size. Not all pages contain log records when a + * volume is first formatted, but as the volume ages, all records will be used. + * When the log file fills up, the records at the beginning are purged (by + * modifying the oldest_lsn to a higher value presumably) and writing begins + * at the beginning of the file. Effectively, the log file is viewed as a + * circular entity. + * + * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept + * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We + * probably only want to support 1.1 as this seems to be the current version + * and we don't know how that differs from the older versions. The only + * exception is if the journal is clean as marked by the two restart pages + * then it doesn't matter whether we are on an earlier version. We can just + * reinitialize the logfile and start again with version 1.1. + */ + +/* Some $LogFile related constants. */ +#define MaxLogFileSize 0x100000000ULL +#define DefaultLogPageSize 4096 +#define MinLogRecordPages 48 + +/* + * Log file restart page header (begins the restart area). + */ +typedef struct { +/*Ofs*/ +/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ +/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */ +/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h. + When creating, set this to be immediately + after this header structure (without any + alignment). */ +/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. */ + +/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by + chkdsk. Only used when the magic is changed + to "CHKD". Otherwise this is zero. */ +/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file + was created, has to be >= 512 and a power of + 2. Use this to calculate the required size + of the usa (usa_count) and add it to usa_ofs. + Then verify that the result is less than the + value of the restart_area_offset. */ +/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >= + 512 and a power of 2. The default is 4096 + and is used when the system page size is + between 4096 and 8192. Otherwise this is + set to the system page size instead. */ +/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to + the RESTART_AREA. Value has to be aligned + to 8-byte boundary. When creating, set this + to be after the usa. */ +/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major + version is 1. */ +/* 28*/ sle16 major_ver; /* Log file major version. We only support + version 1.1. */ +/* sizeof() = 30 (0x1e) bytes */ +} __attribute__ ((__packed__)) RESTART_PAGE_HEADER; + +/* + * Constant for the log client indices meaning that there are no client records + * in this particular client array. Also inside the client records themselves, + * this means that there are no client records preceding or following this one. + */ +#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff) +#define LOGFILE_NO_CLIENT_CPU 0xffff + +/* + * These are the so far known RESTART_AREA_* flags (16-bit) which contain + * information about the log file in which they are present. + */ +enum { + RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002), + RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */ +} __attribute__ ((__packed__)); + +typedef le16 RESTART_AREA_FLAGS; + +/* + * Log file restart area record. The offset of this record is found by adding + * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found + * in it. See notes at restart_area_offset above. + */ +typedef struct { +/*Ofs*/ +/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log + when the restart area was last written. + This happens often but what is the interval? + Is it just fixed time or is it every time a + check point is written or somethine else? + On create set to 0. */ +/* 8*/ le16 log_clients; /* Number of log client records in the array of + log client records which follows this + restart area. Must be 1. */ +/* 10*/ le16 client_free_list; /* The index of the first free log client record + in the array of log client records. + LOGFILE_NO_CLIENT means that there are no + free log client records in the array. + If != LOGFILE_NO_CLIENT, check that + log_clients > client_free_list. On Win2k + and presumably earlier, on a clean volume + this is != LOGFILE_NO_CLIENT, and it should + be 0, i.e. the first (and only) client + record is free and thus the logfile is + closed and hence clean. A dirty volume + would have left the logfile open and hence + this would be LOGFILE_NO_CLIENT. On WinXP + and presumably later, the logfile is always + open, even on clean shutdown so this should + always be LOGFILE_NO_CLIENT. */ +/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client + record in the array of log client records. + LOGFILE_NO_CLIENT means that there are no + in-use log client records in the array. If + != LOGFILE_NO_CLIENT check that log_clients + > client_in_use_list. On Win2k and + presumably earlier, on a clean volume this + is LOGFILE_NO_CLIENT, i.e. there are no + client records in use and thus the logfile + is closed and hence clean. A dirty volume + would have left the logfile open and hence + this would be != LOGFILE_NO_CLIENT, and it + should be 0, i.e. the first (and only) + client record is in use. On WinXP and + presumably later, the logfile is always + open, even on clean shutdown so this should + always be 0. */ +/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k + and presumably earlier this is always 0. On + WinXP and presumably later, if the logfile + was shutdown cleanly, the second bit, + RESTART_VOLUME_IS_CLEAN, is set. This bit + is cleared when the volume is mounted by + WinXP and set when the volume is dismounted, + thus if the logfile is dirty, this bit is + clear. Thus we don't need to check the + Windows version to determine if the logfile + is clean. Instead if the logfile is closed, + we know it must be clean. If it is open and + this bit is set, we also know it must be + clean. If on the other hand the logfile is + open and this bit is clear, we can be almost + certain that the logfile is dirty. */ +/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence + number. This is calculated as 67 - the + number of bits required to store the logfile + size in bytes and this can be used in with + the specified file_size as a consistency + check. */ +/* 20*/ le16 restart_area_length;/* Length of the restart area including the + client array. Following checks required if + version matches. Otherwise, skip them. + restart_area_offset + restart_area_length + has to be <= system_page_size. Also, + restart_area_length has to be >= + client_array_offset + (log_clients * + sizeof(log client record)). */ +/* 22*/ le16 client_array_offset;/* Offset from the start of this record to + the first log client record if versions are + matched. When creating, set this to be + after this restart area structure, aligned + to 8-bytes boundary. If the versions do not + match, this is ignored and the offset is + assumed to be (sizeof(RESTART_AREA) + 7) & + ~7, i.e. rounded up to first 8-byte + boundary. Either way, client_array_offset + has to be aligned to an 8-byte boundary. + Also, restart_area_offset + + client_array_offset has to be <= 510. + Finally, client_array_offset + (log_clients + * sizeof(log client record)) has to be <= + system_page_size. On Win2k and presumably + earlier, this is 0x30, i.e. immediately + following this record. On WinXP and + presumably later, this is 0x40, i.e. there + are 16 extra bytes between this record and + the client array. This probably means that + the RESTART_AREA record is actually bigger + in WinXP and later. */ +/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the + restart_area_offset + the offset of the + file_size are > 510 then corruption has + occurred. This is the very first check when + starting with the restart_area as if it + fails it means that some of the above values + will be corrupted by the multi sector + transfer protection. The file_size has to + be rounded down to be a multiple of the + log_page_size in the RESTART_PAGE_HEADER and + then it has to be at least big enough to + store the two restart pages and 48 (0x30) + log record pages. */ +/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including + the log record header. On create set to + 0. */ +/* 36*/ le16 log_record_header_length;/* Byte size of the log record header. + If the version matches then check that the + value of log_record_header_length is a + multiple of 8, i.e. + (log_record_header_length + 7) & ~7 == + log_record_header_length. When creating set + it to sizeof(LOG_RECORD_HEADER), aligned to + 8 bytes. */ +/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record + page. Must be a multiple of 8. On create + set it to immediately after the update + sequence array of the log record page. */ +/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every + time the logfile is restarted which happens + at mount time when the logfile is opened. + When creating set to a random value. Win2k + sets it to the low 32 bits of the current + system time in NTFS format (see time.h). */ +/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */ +/* sizeof() = 48 (0x30) bytes */ +} __attribute__ ((__packed__)) RESTART_AREA; + +/* + * Log client record. The offset of this record is found by adding the offset + * of the RESTART_AREA to the client_array_offset value found in it. + */ +typedef struct { +/*Ofs*/ +/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create + set to 0. */ +/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart + the volume, i.e. the current position within + the log file. At present, if clean this + should = current_lsn in restart area but it + probably also = current_lsn when dirty most + of the time. At create set to 0. */ +/* 16*/ le16 prev_client; /* The offset to the previous log client record + in the array of log client records. + LOGFILE_NO_CLIENT means there is no previous + client record, i.e. this is the first one. + This is always LOGFILE_NO_CLIENT. */ +/* 18*/ le16 next_client; /* The offset to the next log client record in + the array of log client records. + LOGFILE_NO_CLIENT means there are no next + client records, i.e. this is the last one. + This is always LOGFILE_NO_CLIENT. */ +/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set + to zero every time the logfile is restarted + and it is incremented when the logfile is + closed at dismount time. Thus it is 0 when + dirty and 1 when clean. On WinXP and + presumably later, this is always 0. */ +/* 22*/ u8 reserved[6]; /* Reserved/alignment. */ +/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should + always be 8. */ +/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should + always be "NTFS" with the remaining bytes + set to 0. */ +/* sizeof() = 160 (0xa0) bytes */ +} __attribute__ ((__packed__)) LOG_CLIENT_RECORD; + +extern bool ntfs_check_logfile(struct inode *log_vi, + RESTART_PAGE_HEADER **rp); + +extern bool ntfs_is_logfile_clean(struct inode *log_vi, + const RESTART_PAGE_HEADER *rp); + +extern bool ntfs_empty_logfile(struct inode *log_vi); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_LOGFILE_H */ diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h new file mode 100644 index 000000000..706842573 --- /dev/null +++ b/fs/ntfs/malloc.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_MALLOC_H +#define _LINUX_NTFS_MALLOC_H + +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +/** + * __ntfs_malloc - allocate memory in multiples of pages + * @size: number of bytes to allocate + * @gfp_mask: extra flags for the allocator + * + * Internal function. You probably want ntfs_malloc_nofs()... + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * If there was insufficient memory to complete the request, return NULL. + * Depending on @gfp_mask the allocation may be guaranteed to succeed. + */ +static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) +{ + if (likely(size <= PAGE_SIZE)) { + BUG_ON(!size); + /* kmalloc() has per-CPU caches so is faster for now. */ + return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); + /* return (void *)__get_free_page(gfp_mask); */ + } + if (likely((size >> PAGE_SHIFT) < totalram_pages())) + return __vmalloc(size, gfp_mask); + return NULL; +} + +/** + * ntfs_malloc_nofs - allocate memory in multiples of pages + * @size: number of bytes to allocate + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * If there was insufficient memory to complete the request, return NULL. + */ +static inline void *ntfs_malloc_nofs(unsigned long size) +{ + return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM); +} + +/** + * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages + * @size: number of bytes to allocate + * + * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and + * returns a pointer to the allocated memory. + * + * This function guarantees that the allocation will succeed. It will sleep + * for as long as it takes to complete the allocation. + * + * If there was insufficient memory to complete the request, return NULL. + */ +static inline void *ntfs_malloc_nofs_nofail(unsigned long size) +{ + return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL); +} + +static inline void ntfs_free(void *addr) +{ + kvfree(addr); +} + +#endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c new file mode 100644 index 000000000..f7bf5ce96 --- /dev/null +++ b/fs/ntfs/mft.c @@ -0,0 +1,2906 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2002 Richard Russon + */ + +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/swap.h> +#include <linux/bio.h> + +#include "attrib.h" +#include "aops.h" +#include "bitmap.h" +#include "debug.h" +#include "dir.h" +#include "lcnalloc.h" +#include "malloc.h" +#include "mft.h" +#include "ntfs.h" + +#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE) + +/** + * map_mft_record_page - map the page in which a specific mft record resides + * @ni: ntfs inode whose mft record page to map + * + * This maps the page in which the mft record of the ntfs inode @ni is situated + * and returns a pointer to the mft record within the mapped page. + * + * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR() + * contains the negative error code returned. + */ +static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) +{ + loff_t i_size; + ntfs_volume *vol = ni->vol; + struct inode *mft_vi = vol->mft_ino; + struct page *page; + unsigned long index, end_index; + unsigned ofs; + + BUG_ON(ni->page); + /* + * The index into the page cache and the offset within the page cache + * page of the wanted mft record. FIXME: We need to check for + * overflowing the unsigned long, but I don't think we would ever get + * here if the volume was that big... + */ + index = (u64)ni->mft_no << vol->mft_record_size_bits >> + PAGE_SHIFT; + ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; + + i_size = i_size_read(mft_vi); + /* The maximum valid index into the page cache for $MFT's data. */ + end_index = i_size >> PAGE_SHIFT; + + /* If the wanted index is out of bounds the mft record doesn't exist. */ + if (unlikely(index >= end_index)) { + if (index > end_index || (i_size & ~PAGE_MASK) < ofs + + vol->mft_record_size) { + page = ERR_PTR(-ENOENT); + ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, " + "which is beyond the end of the mft. " + "This is probably a bug in the ntfs " + "driver.", ni->mft_no); + goto err_out; + } + } + /* Read, map, and pin the page. */ + page = ntfs_map_page(mft_vi->i_mapping, index); + if (!IS_ERR(page)) { + /* Catch multi sector transfer fixup errors. */ + if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + + ofs)))) { + ni->page = page; + ni->page_ofs = ofs; + return page_address(page) + ofs; + } + ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. " + "Run chkdsk.", ni->mft_no); + ntfs_unmap_page(page); + page = ERR_PTR(-EIO); + NVolSetErrors(vol); + } +err_out: + ni->page = NULL; + ni->page_ofs = 0; + return (void*)page; +} + +/** + * map_mft_record - map, pin and lock an mft record + * @ni: ntfs inode whose MFT record to map + * + * First, take the mrec_lock mutex. We might now be sleeping, while waiting + * for the mutex if it was already locked by someone else. + * + * The page of the record is mapped using map_mft_record_page() before being + * returned to the caller. + * + * This in turn uses ntfs_map_page() to get the page containing the wanted mft + * record (it in turn calls read_cache_page() which reads it in from disk if + * necessary, increments the use count on the page so that it cannot disappear + * under us and returns a reference to the page cache page). + * + * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it + * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed + * and the post-read mst fixups on each mft record in the page have been + * performed, the page gets PG_uptodate set and PG_locked cleared (this is done + * in our asynchronous I/O completion handler end_buffer_read_mft_async()). + * ntfs_map_page() waits for PG_locked to become clear and checks if + * PG_uptodate is set and returns an error code if not. This provides + * sufficient protection against races when reading/using the page. + * + * However there is the write mapping to think about. Doing the above described + * checking here will be fine, because when initiating the write we will set + * PG_locked and clear PG_uptodate making sure nobody is touching the page + * contents. Doing the locking this way means that the commit to disk code in + * the page cache code paths is automatically sufficiently locked with us as + * we will not touch a page that has been locked or is not uptodate. The only + * locking problem then is them locking the page while we are accessing it. + * + * So that code will end up having to own the mrec_lock of all mft + * records/inodes present in the page before I/O can proceed. In that case we + * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be + * accessing anything without owning the mrec_lock mutex. But we do need to + * use them because of the read_cache_page() invocation and the code becomes so + * much simpler this way that it is well worth it. + * + * The mft record is now ours and we return a pointer to it. You need to check + * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return + * the error code. + * + * NOTE: Caller is responsible for setting the mft record dirty before calling + * unmap_mft_record(). This is obviously only necessary if the caller really + * modified the mft record... + * Q: Do we want to recycle one of the VFS inode state bits instead? + * A: No, the inode ones mean we want to change the mft record, not we want to + * write it out. + */ +MFT_RECORD *map_mft_record(ntfs_inode *ni) +{ + MFT_RECORD *m; + + ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); + + /* Make sure the ntfs inode doesn't go away. */ + atomic_inc(&ni->count); + + /* Serialize access to this mft record. */ + mutex_lock(&ni->mrec_lock); + + m = map_mft_record_page(ni); + if (!IS_ERR(m)) + return m; + + mutex_unlock(&ni->mrec_lock); + atomic_dec(&ni->count); + ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m)); + return m; +} + +/** + * unmap_mft_record_page - unmap the page in which a specific mft record resides + * @ni: ntfs inode whose mft record page to unmap + * + * This unmaps the page in which the mft record of the ntfs inode @ni is + * situated and returns. This is a NOOP if highmem is not configured. + * + * The unmap happens via ntfs_unmap_page() which in turn decrements the use + * count on the page thus releasing it from the pinned state. + * + * We do not actually unmap the page from memory of course, as that will be + * done by the page cache code itself when memory pressure increases or + * whatever. + */ +static inline void unmap_mft_record_page(ntfs_inode *ni) +{ + BUG_ON(!ni->page); + + // TODO: If dirty, blah... + ntfs_unmap_page(ni->page); + ni->page = NULL; + ni->page_ofs = 0; + return; +} + +/** + * unmap_mft_record - release a mapped mft record + * @ni: ntfs inode whose MFT record to unmap + * + * We release the page mapping and the mrec_lock mutex which unmaps the mft + * record and releases it for others to get hold of. We also release the ntfs + * inode by decrementing the ntfs inode reference count. + * + * NOTE: If caller has modified the mft record, it is imperative to set the mft + * record dirty BEFORE calling unmap_mft_record(). + */ +void unmap_mft_record(ntfs_inode *ni) +{ + struct page *page = ni->page; + + BUG_ON(!page); + + ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no); + + unmap_mft_record_page(ni); + mutex_unlock(&ni->mrec_lock); + atomic_dec(&ni->count); + /* + * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to + * ntfs_clear_extent_inode() in the extent inode case, and to the + * caller in the non-extent, yet pure ntfs inode case, to do the actual + * tear down of all structures and freeing of all allocated memory. + */ + return; +} + +/** + * map_extent_mft_record - load an extent inode and attach it to its base + * @base_ni: base ntfs inode + * @mref: mft reference of the extent inode to load + * @ntfs_ino: on successful return, pointer to the ntfs_inode structure + * + * Load the extent mft record @mref and attach it to its base inode @base_ni. + * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise + * PTR_ERR(result) gives the negative error code. + * + * On successful return, @ntfs_ino contains a pointer to the ntfs_inode + * structure of the mapped extent inode. + */ +MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, + ntfs_inode **ntfs_ino) +{ + MFT_RECORD *m; + ntfs_inode *ni = NULL; + ntfs_inode **extent_nis = NULL; + int i; + unsigned long mft_no = MREF(mref); + u16 seq_no = MSEQNO(mref); + bool destroy_ni = false; + + ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).", + mft_no, base_ni->mft_no); + /* Make sure the base ntfs inode doesn't go away. */ + atomic_inc(&base_ni->count); + /* + * Check if this extent inode has already been added to the base inode, + * in which case just return it. If not found, add it to the base + * inode before returning it. + */ + mutex_lock(&base_ni->extent_lock); + if (base_ni->nr_extents > 0) { + extent_nis = base_ni->ext.extent_ntfs_inos; + for (i = 0; i < base_ni->nr_extents; i++) { + if (mft_no != extent_nis[i]->mft_no) + continue; + ni = extent_nis[i]; + /* Make sure the ntfs inode doesn't go away. */ + atomic_inc(&ni->count); + break; + } + } + if (likely(ni != NULL)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + /* We found the record; just have to map and return it. */ + m = map_mft_record(ni); + /* map_mft_record() has incremented this on success. */ + atomic_dec(&ni->count); + if (!IS_ERR(m)) { + /* Verify the sequence number. */ + if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { + ntfs_debug("Done 1."); + *ntfs_ino = ni; + return m; + } + unmap_mft_record(ni); + ntfs_error(base_ni->vol->sb, "Found stale extent mft " + "reference! Corrupt filesystem. " + "Run chkdsk."); + return ERR_PTR(-EIO); + } +map_err_out: + ntfs_error(base_ni->vol->sb, "Failed to map extent " + "mft record, error code %ld.", -PTR_ERR(m)); + return m; + } + /* Record wasn't there. Get a new ntfs inode and initialize it. */ + ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no); + if (unlikely(!ni)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + return ERR_PTR(-ENOMEM); + } + ni->vol = base_ni->vol; + ni->seq_no = seq_no; + ni->nr_extents = -1; + ni->ext.base_ntfs_ino = base_ni; + /* Now map the record. */ + m = map_mft_record(ni); + if (IS_ERR(m)) { + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + ntfs_clear_extent_inode(ni); + goto map_err_out; + } + /* Verify the sequence number if it is present. */ + if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) { + ntfs_error(base_ni->vol->sb, "Found stale extent mft " + "reference! Corrupt filesystem. Run chkdsk."); + destroy_ni = true; + m = ERR_PTR(-EIO); + goto unm_err_out; + } + /* Attach extent inode to base inode, reallocating memory if needed. */ + if (!(base_ni->nr_extents & 3)) { + ntfs_inode **tmp; + int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *); + + tmp = kmalloc(new_size, GFP_NOFS); + if (unlikely(!tmp)) { + ntfs_error(base_ni->vol->sb, "Failed to allocate " + "internal buffer."); + destroy_ni = true; + m = ERR_PTR(-ENOMEM); + goto unm_err_out; + } + if (base_ni->nr_extents) { + BUG_ON(!base_ni->ext.extent_ntfs_inos); + memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size - + 4 * sizeof(ntfs_inode *)); + kfree(base_ni->ext.extent_ntfs_inos); + } + base_ni->ext.extent_ntfs_inos = tmp; + } + base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni; + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + ntfs_debug("Done 2."); + *ntfs_ino = ni; + return m; +unm_err_out: + unmap_mft_record(ni); + mutex_unlock(&base_ni->extent_lock); + atomic_dec(&base_ni->count); + /* + * If the extent inode was not attached to the base inode we need to + * release it or we will leak memory. + */ + if (destroy_ni) + ntfs_clear_extent_inode(ni); + return m; +} + +#ifdef NTFS_RW + +/** + * __mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Internal function. Users should call mark_mft_record_dirty() instead. + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES) + * on the base vfs inode, because even though file data may have been modified, + * it is dirty in the inode meta data rather than the data page cache of the + * inode, and thus there are no data pages that need writing out. Therefore, a + * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the + * other hand, is not sufficient, because ->write_inode needs to be called even + * in case of fdatasync. This needs to happen or the file data would not + * necessarily hit the device synchronously, even though the vfs inode has the + * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just + * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, + * which is not what I_DIRTY_SYNC on its own would suggest. + */ +void __mark_mft_record_dirty(ntfs_inode *ni) +{ + ntfs_inode *base_ni; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + mark_ntfs_record_dirty(ni->page, ni->page_ofs); + /* Determine the base vfs inode and mark it dirty, too. */ + mutex_lock(&ni->extent_lock); + if (likely(ni->nr_extents >= 0)) + base_ni = ni; + else + base_ni = ni->ext.base_ntfs_ino; + mutex_unlock(&ni->extent_lock); + __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC); +} + +static const char *ntfs_please_email = "Please email " + "linux-ntfs-dev@lists.sourceforge.net and say that you saw " + "this message. Thank you."; + +/** + * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror + * @vol: ntfs volume on which the mft record to synchronize resides + * @mft_no: mft record number of mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * + * Write the mapped, mst protected (extent) mft record @m with mft record + * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol, + * bypassing the page cache and the $MFTMirr inode itself. + * + * This function is only for use at umount time when the mft mirror inode has + * already been disposed off. We BUG() if we are called while the mft mirror + * inode is still attached to the volume. + * + * On success return 0. On error return -errno. + * + * NOTE: This function is not implemented yet as I am not convinced it can + * actually be triggered considering the sequence of commits we do in super.c:: + * ntfs_put_super(). But just in case we provide this place holder as the + * alternative would be either to BUG() or to get a NULL pointer dereference + * and Oops. + */ +static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol, + const unsigned long mft_no, MFT_RECORD *m) +{ + BUG_ON(vol->mftmirr_ino); + ntfs_error(vol->sb, "Umount time mft mirror syncing is not " + "implemented yet. %s", ntfs_please_email); + return -EOPNOTSUPP; +} + +/** + * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror + * @vol: ntfs volume on which the mft record to synchronize resides + * @mft_no: mft record number of mft record to synchronize + * @m: mapped, mst protected (extent) mft record to synchronize + * @sync: if true, wait for i/o completion + * + * Write the mapped, mst protected (extent) mft record @m with mft record + * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol. + * + * On success return 0. On error return -errno and set the volume errors flag + * in the ntfs volume @vol. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, + MFT_RECORD *m, int sync) +{ + struct page *page; + unsigned int blocksize = vol->sb->s_blocksize; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[MAX_BHS]; + struct buffer_head *bh, *head; + u8 *kmirr; + runlist_element *rl; + unsigned int block_start, block_end, m_start, m_end, page_ofs; + int i_bhs, nr_bhs, err = 0; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; + + ntfs_debug("Entering for inode 0x%lx.", mft_no); + BUG_ON(!max_bhs); + if (WARN_ON(max_bhs > MAX_BHS)) + return -EINVAL; + if (unlikely(!vol->mftmirr_ino)) { + /* This could happen during umount... */ + err = ntfs_sync_mft_mirror_umount(vol, mft_no, m); + if (likely(!err)) + return err; + goto err_out; + } + /* Get the page containing the mirror copy of the mft record @m. */ + page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >> + (PAGE_SHIFT - vol->mft_record_size_bits)); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map mft mirror page."); + err = PTR_ERR(page); + goto err_out; + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + /* Offset of the mft mirror record inside the page. */ + page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; + /* The address in the page of the mirror copy of the mft record @m. */ + kmirr = page_address(page) + page_ofs; + /* Copy the mst protected mft record to the mirror. */ + memcpy(kmirr, m, vol->mft_record_size); + /* Create uptodate buffers if not present. */ + if (unlikely(!page_has_buffers(page))) { + struct buffer_head *tail; + + bh = head = alloc_page_buffers(page, blocksize, true); + do { + set_buffer_uptodate(bh); + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + attach_page_private(page, head); + } + bh = head = page_buffers(page); + BUG_ON(!bh); + rl = NULL; + nr_bhs = 0; + block_start = 0; + m_start = kmirr - (u8*)page_address(page); + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* If the buffer is outside the mft record, skip it. */ + if (block_end <= m_start) + continue; + if (unlikely(block_start >= m_end)) + break; + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = ((VCN)mft_no << vol->mft_record_size_bits) + + (block_start - m_start); + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { + down_read(&NTFS_I(vol->mftmirr_ino)-> + runlist.lock); + rl = NTFS_I(vol->mftmirr_ino)->runlist.rl; + /* + * $MFTMirr always has the whole of its runlist + * in memory. + */ + BUG_ON(!rl); + } + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + /* For $MFTMirr, only lcn >= 0 is a successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + } else { + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write mft mirror " + "record 0x%lx because its " + "location on disk could not " + "be determined (error code " + "%lli).", mft_no, + (long long)lcn); + err = -EIO; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock); + if (likely(!err)) { + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (!trylock_buffer(tbh)) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE, tbh); + } + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page and + * buffer states do not become out of sync. + */ + set_buffer_uptodate(tbh); + } + } + } else /* if (unlikely(err)) */ { + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); + } + /* Current state: all buffers are clean, unlocked, and uptodate. */ + /* Remove the mst protection fixups again. */ + post_write_mst_fixup((NTFS_RECORD*)kmirr); + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + if (likely(!err)) { + ntfs_debug("Done."); + } else { + ntfs_error(vol->sb, "I/O error while writing mft mirror " + "record 0x%lx!", mft_no); +err_out: + ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error " + "code %i). Volume will be left marked dirty " + "on umount. Run ntfsfix on the partition " + "after umounting to correct this.", -err); + NVolSetErrors(vol); + } + return err; +} + +/** + * write_mft_record_nolock - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * Write the mapped (extent) mft record @m described by the (regular or extent) + * ntfs inode @ni to backing store. If the mft record @m has a counterpart in + * the mft mirror, that is also updated. + * + * We only write the mft record if the ntfs inode @ni is dirty and the first + * buffer belonging to its mft record is dirty, too. We ignore the dirty state + * of subsequent buffers because we could have raced with + * fs/ntfs/aops.c::mark_ntfs_record_dirty(). + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. + * + * NOTE: We always perform synchronous i/o and ignore the @sync parameter. + * However, if the mft record has a counterpart in the mft mirror and @sync is + * true, we write the mft record, wait for i/o completion, and only then write + * the mft mirror copy. This ensures that if the system crashes either the mft + * or the mft mirror will contain a self-consistent mft record @m. If @sync is + * false on the other hand, we start i/o on both and then wait for completion + * on them. This provides a speedup but no longer guarantees that you will end + * up with a self-consistent mft record in the case of a crash but if you asked + * for asynchronous writing you probably do not care about that anyway. + * + * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just + * schedule i/o via ->writepage or do it via kntfsd or whatever. + */ +int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + ntfs_volume *vol = ni->vol; + struct page *page = ni->page; + unsigned int blocksize = vol->sb->s_blocksize; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; + int max_bhs = vol->mft_record_size / blocksize; + struct buffer_head *bhs[MAX_BHS]; + struct buffer_head *bh, *head; + runlist_element *rl; + unsigned int block_start, block_end, m_start, m_end; + int i_bhs, nr_bhs, err = 0; + + ntfs_debug("Entering for inode 0x%lx.", ni->mft_no); + BUG_ON(NInoAttr(ni)); + BUG_ON(!max_bhs); + BUG_ON(!PageLocked(page)); + if (WARN_ON(max_bhs > MAX_BHS)) { + err = -EINVAL; + goto err_out; + } + /* + * If the ntfs_inode is clean no need to do anything. If it is dirty, + * mark it as clean now so that it can be redirtied later on if needed. + * There is no danger of races since the caller is holding the locks + * for the mft record @m and the page it is in. + */ + if (!NInoTestClearDirty(ni)) + goto done; + bh = head = page_buffers(page); + BUG_ON(!bh); + rl = NULL; + nr_bhs = 0; + block_start = 0; + m_start = ni->page_ofs; + m_end = m_start + vol->mft_record_size; + do { + block_end = block_start + blocksize; + /* If the buffer is outside the mft record, skip it. */ + if (block_end <= m_start) + continue; + if (unlikely(block_start >= m_end)) + break; + /* + * If this block is not the first one in the record, we ignore + * the buffer's dirty state because we could have raced with a + * parallel mark_ntfs_record_dirty(). + */ + if (block_start == m_start) { + /* This block is the first one in the record. */ + if (!buffer_dirty(bh)) { + BUG_ON(nr_bhs); + /* Clean records are not written out. */ + break; + } + } + /* Need to map the buffer if it is not mapped already. */ + if (unlikely(!buffer_mapped(bh))) { + VCN vcn; + LCN lcn; + unsigned int vcn_ofs; + + bh->b_bdev = vol->sb->s_bdev; + /* Obtain the vcn and offset of the current block. */ + vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) + + (block_start - m_start); + vcn_ofs = vcn & vol->cluster_size_mask; + vcn >>= vol->cluster_size_bits; + if (!rl) { + down_read(&NTFS_I(vol->mft_ino)->runlist.lock); + rl = NTFS_I(vol->mft_ino)->runlist.rl; + BUG_ON(!rl); + } + /* Seek to element containing target vcn. */ + while (rl->length && rl[1].vcn <= vcn) + rl++; + lcn = ntfs_rl_vcn_to_lcn(rl, vcn); + /* For $MFT, only lcn >= 0 is a successful remap. */ + if (likely(lcn >= 0)) { + /* Setup buffer head to correct block. */ + bh->b_blocknr = ((lcn << + vol->cluster_size_bits) + + vcn_ofs) >> blocksize_bits; + set_buffer_mapped(bh); + } else { + bh->b_blocknr = -1; + ntfs_error(vol->sb, "Cannot write mft record " + "0x%lx because its location " + "on disk could not be " + "determined (error code %lli).", + ni->mft_no, (long long)lcn); + err = -EIO; + } + } + BUG_ON(!buffer_uptodate(bh)); + BUG_ON(!nr_bhs && (m_start != block_start)); + BUG_ON(nr_bhs >= max_bhs); + bhs[nr_bhs++] = bh; + BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end)); + } while (block_start = block_end, (bh = bh->b_this_page) != head); + if (unlikely(rl)) + up_read(&NTFS_I(vol->mft_ino)->runlist.lock); + if (!nr_bhs) + goto done; + if (unlikely(err)) + goto cleanup_out; + /* Apply the mst protection fixups. */ + err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size); + if (err) { + ntfs_error(vol->sb, "Failed to apply mst fixups!"); + goto cleanup_out; + } + flush_dcache_mft_record_page(ni); + /* Lock buffers and start synchronous write i/o on them. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + if (!trylock_buffer(tbh)) + BUG(); + BUG_ON(!buffer_uptodate(tbh)); + clear_buffer_dirty(tbh); + get_bh(tbh); + tbh->b_end_io = end_buffer_write_sync; + submit_bh(REQ_OP_WRITE, tbh); + } + /* Synchronize the mft mirror now if not @sync. */ + if (!sync && ni->mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); + /* Wait on i/o completion of buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) { + struct buffer_head *tbh = bhs[i_bhs]; + + wait_on_buffer(tbh); + if (unlikely(!buffer_uptodate(tbh))) { + err = -EIO; + /* + * Set the buffer uptodate so the page and buffer + * states do not become out of sync. + */ + if (PageUptodate(page)) + set_buffer_uptodate(tbh); + } + } + /* If @sync, now synchronize the mft mirror. */ + if (sync && ni->mft_no < vol->mftmirr_size) + ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync); + /* Remove the mst protection fixups again. */ + post_write_mst_fixup((NTFS_RECORD*)m); + flush_dcache_mft_record_page(ni); + if (unlikely(err)) { + /* I/O error during writing. This is really bad! */ + ntfs_error(vol->sb, "I/O error while writing mft record " + "0x%lx! Marking base inode as bad. You " + "should unmount the volume and run chkdsk.", + ni->mft_no); + goto err_out; + } +done: + ntfs_debug("Done."); + return 0; +cleanup_out: + /* Clean the buffers. */ + for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) + clear_buffer_dirty(bhs[i_bhs]); +err_out: + /* + * Current state: all buffers are clean, unlocked, and uptodate. + * The caller should mark the base inode as bad so that no more i/o + * happens. ->clear_inode() will still be invoked so all extent inodes + * and other allocated memory will be freed. + */ + if (err == -ENOMEM) { + ntfs_error(vol->sb, "Not enough memory to write mft record. " + "Redirtying so the write is retried later."); + mark_mft_record_dirty(ni); + err = 0; + } else + NVolSetErrors(vol); + return err; +} + +/** + * ntfs_may_write_mft_record - check if an mft record may be written out + * @vol: [IN] ntfs volume on which the mft record to check resides + * @mft_no: [IN] mft record number of the mft record to check + * @m: [IN] mapped mft record to check + * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned + * + * Check if the mapped (base or extent) mft record @m with mft record number + * @mft_no belonging to the ntfs volume @vol may be written out. If necessary + * and possible the ntfs inode of the mft record is locked and the base vfs + * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The + * caller is responsible for unlocking the ntfs inode and unpinning the base + * vfs inode. + * + * Return 'true' if the mft record may be written out and 'false' if not. + * + * The caller has locked the page and cleared the uptodate flag on it which + * means that we can safely write out any dirty mft records that do not have + * their inodes in icache as determined by ilookup5() as anyone + * opening/creating such an inode would block when attempting to map the mft + * record in read_cache_page() until we are finished with the write out. + * + * Here is a description of the tests we perform: + * + * If the inode is found in icache we know the mft record must be a base mft + * record. If it is dirty, we do not write it and return 'false' as the vfs + * inode write paths will result in the access times being updated which would + * cause the base mft record to be redirtied and written out again. (We know + * the access time update will modify the base mft record because Windows + * chkdsk complains if the standard information attribute is not in the base + * mft record.) + * + * If the inode is in icache and not dirty, we attempt to lock the mft record + * and if we find the lock was already taken, it is not safe to write the mft + * record and we return 'false'. + * + * If we manage to obtain the lock we have exclusive access to the mft record, + * which also allows us safe writeout of the mft record. We then set + * @locked_ni to the locked ntfs inode and return 'true'. + * + * Note we cannot just lock the mft record and sleep while waiting for the lock + * because this would deadlock due to lock reversal (normally the mft record is + * locked before the page is locked but we already have the page locked here + * when we try to lock the mft record). + * + * If the inode is not in icache we need to perform further checks. + * + * If the mft record is not a FILE record or it is a base mft record, we can + * safely write it and return 'true'. + * + * We now know the mft record is an extent mft record. We check if the inode + * corresponding to its base mft record is in icache and obtain a reference to + * it if it is. If it is not, we can safely write it and return 'true'. + * + * We now have the base inode for the extent mft record. We check if it has an + * ntfs inode for the extent mft record attached and if not it is safe to write + * the extent mft record and we return 'true'. + * + * The ntfs inode for the extent mft record is attached to the base inode so we + * attempt to lock the extent mft record and if we find the lock was already + * taken, it is not safe to write the extent mft record and we return 'false'. + * + * If we manage to obtain the lock we have exclusive access to the extent mft + * record, which also allows us safe writeout of the extent mft record. We + * set the ntfs inode of the extent mft record clean and then set @locked_ni to + * the now locked ntfs inode and return 'true'. + * + * Note, the reason for actually writing dirty mft records here and not just + * relying on the vfs inode dirty code paths is that we can have mft records + * modified without them ever having actual inodes in memory. Also we can have + * dirty mft records with clean ntfs inodes in memory. None of the described + * cases would result in the dirty mft records being written out if we only + * relied on the vfs inode dirty code paths. And these cases can really occur + * during allocation of new mft records and in particular when the + * initialized_size of the $MFT/$DATA attribute is extended and the new space + * is initialized using ntfs_mft_record_format(). The clean inode can then + * appear if the mft record is reused for a new inode before it got written + * out. + */ +bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, + const MFT_RECORD *m, ntfs_inode **locked_ni) +{ + struct super_block *sb = vol->sb; + struct inode *mft_vi = vol->mft_ino; + struct inode *vi; + ntfs_inode *ni, *eni, **extent_nis; + int i; + ntfs_attr na; + + ntfs_debug("Entering for inode 0x%lx.", mft_no); + /* + * Normally we do not return a locked inode so set @locked_ni to NULL. + */ + BUG_ON(!locked_ni); + *locked_ni = NULL; + /* + * Check if the inode corresponding to this mft record is in the VFS + * inode cache and obtain a reference to it if it is. + */ + ntfs_debug("Looking for inode 0x%lx in icache.", mft_no); + na.mft_no = mft_no; + na.name = NULL; + na.name_len = 0; + na.type = AT_UNUSED; + /* + * Optimize inode 0, i.e. $MFT itself, since we have it in memory and + * we get here for it rather often. + */ + if (!mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else { + /* + * Have to use ilookup5_nowait() since ilookup5() waits for the + * inode lock which causes ntfs to deadlock when a concurrent + * inode write via the inode dirty code paths and the page + * dirty code path of the inode dirty code path when writing + * $MFT occurs. + */ + vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na); + } + if (vi) { + ntfs_debug("Base inode 0x%lx is in icache.", mft_no); + /* The inode is in icache. */ + ni = NTFS_I(vi); + /* Take a reference to the ntfs inode. */ + atomic_inc(&ni->count); + /* If the inode is dirty, do not write this record. */ + if (NInoDirty(ni)) { + ntfs_debug("Inode 0x%lx is dirty, do not write it.", + mft_no); + atomic_dec(&ni->count); + iput(vi); + return false; + } + ntfs_debug("Inode 0x%lx is not dirty.", mft_no); + /* The inode is not dirty, try to take the mft record lock. */ + if (unlikely(!mutex_trylock(&ni->mrec_lock))) { + ntfs_debug("Mft record 0x%lx is already locked, do " + "not write it.", mft_no); + atomic_dec(&ni->count); + iput(vi); + return false; + } + ntfs_debug("Managed to lock mft record 0x%lx, write it.", + mft_no); + /* + * The write has to occur while we hold the mft record lock so + * return the locked ntfs inode. + */ + *locked_ni = ni; + return true; + } + ntfs_debug("Inode 0x%lx is not in icache.", mft_no); + /* The inode is not in icache. */ + /* Write the record if it is not a mft record (type "FILE"). */ + if (!ntfs_is_mft_record(m->magic)) { + ntfs_debug("Mft record 0x%lx is not a FILE record, write it.", + mft_no); + return true; + } + /* Write the mft record if it is a base inode. */ + if (!m->base_mft_record) { + ntfs_debug("Mft record 0x%lx is a base record, write it.", + mft_no); + return true; + } + /* + * This is an extent mft record. Check if the inode corresponding to + * its base mft record is in icache and obtain a reference to it if it + * is. + */ + na.mft_no = MREF_LE(m->base_mft_record); + ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " + "inode 0x%lx in icache.", mft_no, na.mft_no); + if (!na.mft_no) { + /* Balance the below iput(). */ + vi = igrab(mft_vi); + BUG_ON(vi != mft_vi); + } else + vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode, + &na); + if (!vi) { + /* + * The base inode is not in icache, write this extent mft + * record. + */ + ntfs_debug("Base inode 0x%lx is not in icache, write the " + "extent record.", na.mft_no); + return true; + } + ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no); + /* + * The base inode is in icache. Check if it has the extent inode + * corresponding to this extent mft record attached. + */ + ni = NTFS_I(vi); + mutex_lock(&ni->extent_lock); + if (ni->nr_extents <= 0) { + /* + * The base inode has no attached extent inodes, write this + * extent mft record. + */ + mutex_unlock(&ni->extent_lock); + iput(vi); + ntfs_debug("Base inode 0x%lx has no attached extent inodes, " + "write the extent record.", na.mft_no); + return true; + } + /* Iterate over the attached extent inodes. */ + extent_nis = ni->ext.extent_ntfs_inos; + for (eni = NULL, i = 0; i < ni->nr_extents; ++i) { + if (mft_no == extent_nis[i]->mft_no) { + /* + * Found the extent inode corresponding to this extent + * mft record. + */ + eni = extent_nis[i]; + break; + } + } + /* + * If the extent inode was not attached to the base inode, write this + * extent mft record. + */ + if (!eni) { + mutex_unlock(&ni->extent_lock); + iput(vi); + ntfs_debug("Extent inode 0x%lx is not attached to its base " + "inode 0x%lx, write the extent record.", + mft_no, na.mft_no); + return true; + } + ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.", + mft_no, na.mft_no); + /* Take a reference to the extent ntfs inode. */ + atomic_inc(&eni->count); + mutex_unlock(&ni->extent_lock); + /* + * Found the extent inode coresponding to this extent mft record. + * Try to take the mft record lock. + */ + if (unlikely(!mutex_trylock(&eni->mrec_lock))) { + atomic_dec(&eni->count); + iput(vi); + ntfs_debug("Extent mft record 0x%lx is already locked, do " + "not write it.", mft_no); + return false; + } + ntfs_debug("Managed to lock extent mft record 0x%lx, write it.", + mft_no); + if (NInoTestClearDirty(eni)) + ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.", + mft_no); + /* + * The write has to occur while we hold the mft record lock so return + * the locked extent ntfs inode. + */ + *locked_ni = eni; + return true; +} + +static const char *es = " Leaving inconsistent metadata. Unmount and run " + "chkdsk."; + +/** + * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name + * @vol: volume on which to search for a free mft record + * @base_ni: open base inode if allocating an extent mft record or NULL + * + * Search for a free mft record in the mft bitmap attribute on the ntfs volume + * @vol. + * + * If @base_ni is NULL start the search at the default allocator position. + * + * If @base_ni is not NULL start the search at the mft record after the base + * mft record @base_ni. + * + * Return the free mft record on success and -errno on error. An error code of + * -ENOSPC means that there are no free mft records in the currently + * initialized mft bitmap. + * + * Locking: Caller must hold vol->mftbmp_lock for writing. + */ +static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, + ntfs_inode *base_ni) +{ + s64 pass_end, ll, data_pos, pass_start, ofs, bit; + unsigned long flags; + struct address_space *mftbmp_mapping; + u8 *buf, *byte; + struct page *page; + unsigned int page_ofs, size; + u8 pass, b; + + ntfs_debug("Searching for free mft record in the currently " + "initialized mft bitmap."); + mftbmp_mapping = vol->mftbmp_ino->i_mapping; + /* + * Set the end of the pass making sure we do not overflow the mft + * bitmap. + */ + read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags); + pass_end = NTFS_I(vol->mft_ino)->allocated_size >> + vol->mft_record_size_bits; + read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags); + read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); + ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3; + read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags); + if (pass_end > ll) + pass_end = ll; + pass = 1; + if (!base_ni) + data_pos = vol->mft_data_pos; + else + data_pos = base_ni->mft_no + 1; + if (data_pos < 24) + data_pos = 24; + if (data_pos >= pass_end) { + data_pos = 24; + pass = 2; + /* This happens on a freshly formatted volume. */ + if (data_pos >= pass_end) + return -ENOSPC; + } + pass_start = data_pos; + ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, " + "pass_end 0x%llx, data_pos 0x%llx.", pass, + (long long)pass_start, (long long)pass_end, + (long long)data_pos); + /* Loop until a free mft record is found. */ + for (; pass <= 2;) { + /* Cap size to pass_end. */ + ofs = data_pos >> 3; + page_ofs = ofs & ~PAGE_MASK; + size = PAGE_SIZE - page_ofs; + ll = ((pass_end + 7) >> 3) - ofs; + if (size > ll) + size = ll; + size <<= 3; + /* + * If we are still within the active pass, search the next page + * for a zero bit. + */ + if (size) { + page = ntfs_map_page(mftbmp_mapping, + ofs >> PAGE_SHIFT); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read mft " + "bitmap, aborting."); + return PTR_ERR(page); + } + buf = (u8*)page_address(page) + page_ofs; + bit = data_pos & 7; + data_pos &= ~7ull; + ntfs_debug("Before inner for loop: size 0x%x, " + "data_pos 0x%llx, bit 0x%llx", size, + (long long)data_pos, (long long)bit); + for (; bit < size && data_pos + bit < pass_end; + bit &= ~7ull, bit += 8) { + byte = buf + (bit >> 3); + if (*byte == 0xff) + continue; + b = ffz((unsigned long)*byte); + if (b < 8 && b >= (bit & 7)) { + ll = data_pos + (bit & ~7ull) + b; + if (unlikely(ll > (1ll << 32))) { + ntfs_unmap_page(page); + return -ENOSPC; + } + *byte |= 1 << b; + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + ntfs_debug("Done. (Found and " + "allocated mft record " + "0x%llx.)", + (long long)ll); + return ll; + } + } + ntfs_debug("After inner for loop: size 0x%x, " + "data_pos 0x%llx, bit 0x%llx", size, + (long long)data_pos, (long long)bit); + data_pos += size; + ntfs_unmap_page(page); + /* + * If the end of the pass has not been reached yet, + * continue searching the mft bitmap for a zero bit. + */ + if (data_pos < pass_end) + continue; + } + /* Do the next pass. */ + if (++pass == 2) { + /* + * Starting the second pass, in which we scan the first + * part of the zone which we omitted earlier. + */ + pass_end = pass_start; + data_pos = pass_start = 24; + ntfs_debug("pass %i, pass_start 0x%llx, pass_end " + "0x%llx.", pass, (long long)pass_start, + (long long)pass_end); + if (data_pos >= pass_end) + break; + } + } + /* No free mft records in currently initialized mft bitmap. */ + ntfs_debug("Done. (No free mft records left in currently initialized " + "mft bitmap.)"); + return -ENOSPC; +} + +/** + * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster + * @vol: volume on which to extend the mft bitmap attribute + * + * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster. + * + * Note: Only changes allocated_size, i.e. does not touch initialized_size or + * data_size. + * + * Return 0 on success and -errno on error. + * + * Locking: - Caller must hold vol->mftbmp_lock for writing. + * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for + * writing and releases it before returning. + * - This function takes vol->lcnbmp_lock for writing and releases it + * before returning. + */ +static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) +{ + LCN lcn; + s64 ll; + unsigned long flags; + struct page *page; + ntfs_inode *mft_ni, *mftbmp_ni; + runlist_element *rl, *rl2 = NULL; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *mrec; + ATTR_RECORD *a = NULL; + int ret, mp_size; + u32 old_alen = 0; + u8 *b, tb; + struct { + u8 added_cluster:1; + u8 added_run:1; + u8 mp_rebuilt:1; + } status = { 0, 0, 0 }; + + ntfs_debug("Extending mft bitmap allocation."); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_ni = NTFS_I(vol->mftbmp_ino); + /* + * Determine the last lcn of the mft bitmap. The allocated size of the + * mft bitmap cannot be zero so we are ok to do this. + */ + down_write(&mftbmp_ni->runlist.lock); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ll = mftbmp_ni->allocated_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, + (ll - 1) >> vol->cluster_size_bits, NULL); + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to determine last allocated " + "cluster of mft bitmap attribute."); + if (!IS_ERR(rl)) + ret = -EIO; + else + ret = PTR_ERR(rl); + return ret; + } + lcn = rl->lcn + rl->length; + ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.", + (long long)lcn); + /* + * Attempt to get the cluster following the last allocated cluster by + * hand as it may be in the MFT zone so the allocator would not give it + * to us. + */ + ll = lcn >> 3; + page = ntfs_map_page(vol->lcnbmp_ino->i_mapping, + ll >> PAGE_SHIFT); + if (IS_ERR(page)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to read from lcn bitmap."); + return PTR_ERR(page); + } + b = (u8*)page_address(page) + (ll & ~PAGE_MASK); + tb = 1 << (lcn & 7ull); + down_write(&vol->lcnbmp_lock); + if (*b != 0xff && !(*b & tb)) { + /* Next cluster is free, allocate it. */ + *b |= tb; + flush_dcache_page(page); + set_page_dirty(page); + up_write(&vol->lcnbmp_lock); + ntfs_unmap_page(page); + /* Update the mft bitmap runlist. */ + rl->length++; + rl[1].vcn++; + status.added_cluster = 1; + ntfs_debug("Appending one cluster to mft bitmap."); + } else { + up_write(&vol->lcnbmp_lock); + ntfs_unmap_page(page); + /* Allocate a cluster from the DATA_ZONE. */ + rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE, + true); + if (IS_ERR(rl2)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to allocate a cluster for " + "the mft bitmap."); + return PTR_ERR(rl2); + } + rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + up_write(&mftbmp_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists for mft " + "bitmap."); + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to deallocate " + "allocated cluster.%s", es); + NVolSetErrors(vol); + } + ntfs_free(rl2); + return PTR_ERR(rl); + } + mftbmp_ni->runlist.rl = rl; + status.added_run = 1; + ntfs_debug("Adding one run to mft bitmap."); + /* Find the last run in the new runlist. */ + for (; rl[1].length; rl++) + ; + } + /* + * Update the attribute record as well. Note: @rl is the last + * (non-terminator) runlist element of mft bitmap. + */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + ret = PTR_ERR(mrec); + goto undo_alloc; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto undo_alloc; + } + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, + 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft bitmap attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto undo_alloc; + } + a = ctx->attr; + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + /* Search back for the previous last allocated cluster of mft bitmap. */ + for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) { + if (ll >= rl2->vcn) + break; + } + BUG_ON(ll < rl2->vcn); + BUG_ON(ll >= rl2->vcn + rl2->length); + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Get size for mapping pairs failed for " + "mft bitmap attribute extent."); + ret = mp_size; + if (!ret) + ret = -EIO; + goto undo_alloc; + } + /* Expand the attribute record if necessary. */ + old_alen = le32_to_cpu(a->length); + ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(ret)) { + if (ret != -ENOSPC) { + ntfs_error(vol->sb, "Failed to resize attribute " + "record for mft bitmap attribute."); + goto undo_alloc; + } + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record or by + // moving other attributes out of this mft record. + // Note: It will need to be a special mft record and if none of + // those are available it gets rather complicated... + ntfs_error(vol->sb, "Not enough space in this mft record to " + "accommodate extended mft bitmap attribute " + "extent. Cannot handle this yet."); + ret = -EOPNOTSUPP; + goto undo_alloc; + } + status.mp_rebuilt = 1; + /* Generate the mapping pairs array directly into the attr record. */ + ret = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to build mapping pairs array for " + "mft bitmap attribute."); + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); + /* + * We now have extended the mft bitmap allocated_size by one cluster. + * Reflect this in the ntfs_inode structure and the attribute record. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute " + "extent of mft bitmap attribute."); + goto restore_undo_alloc; + } + a = ctx->attr; + } + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->allocated_size += vol->cluster_size; + a->data.non_resident.allocated_size = + cpu_to_sle64(mftbmp_ni->allocated_size); + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + ntfs_debug("Done."); + return 0; +restore_undo_alloc: + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL, + 0, ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft bitmap attribute.%s", es); + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->allocated_size += vol->cluster_size; + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + /* + * The only thing that is now wrong is ->allocated_size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return ret; + } + a = ctx->attr; + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2); +undo_alloc: + if (status.added_cluster) { + /* Truncate the last run in the runlist by one cluster. */ + rl->length--; + rl[1].vcn--; + } else if (status.added_run) { + lcn = rl->lcn; + /* Remove the last run from the runlist. */ + rl->lcn = rl[1].lcn; + rl->length = 0; + } + /* Deallocate the cluster. */ + down_write(&vol->lcnbmp_lock); + if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) { + ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es); + NVolSetErrors(vol); + } + up_write(&vol->lcnbmp_lock); + if (status.mp_rebuilt) { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + old_alen - le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + rl2, ll, -1, NULL)) { + ntfs_error(vol->sb, "Failed to restore mapping pairs " + "array.%s", es); + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (!IS_ERR(mrec)) + unmap_mft_record(mft_ni); + up_write(&mftbmp_ni->runlist.lock); + return ret; +} + +/** + * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data + * @vol: volume on which to extend the mft bitmap attribute + * + * Extend the initialized portion of the mft bitmap attribute on the ntfs + * volume @vol by 8 bytes. + * + * Note: Only changes initialized_size and data_size, i.e. requires that + * allocated_size is big enough to fit the new initialized_size. + * + * Return 0 on success and -error on error. + * + * Locking: Caller must hold vol->mftbmp_lock for writing. + */ +static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol) +{ + s64 old_data_size, old_initialized_size; + unsigned long flags; + struct inode *mftbmp_vi; + ntfs_inode *mft_ni, *mftbmp_ni; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *mrec; + ATTR_RECORD *a; + int ret; + + ntfs_debug("Extending mft bitmap initiailized (and data) size."); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_vi = vol->mftbmp_ino; + mftbmp_ni = NTFS_I(mftbmp_vi); + /* Get the attribute record. */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + return PTR_ERR(mrec); + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto unm_err_out; + } + ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft bitmap attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto put_err_out; + } + a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = i_size_read(mftbmp_vi); + old_initialized_size = mftbmp_ni->initialized_size; + /* + * We can simply update the initialized_size before filling the space + * with zeroes because the caller is holding the mft bitmap lock for + * writing which ensures that no one else is trying to access the data. + */ + mftbmp_ni->initialized_size += 8; + a->data.non_resident.initialized_size = + cpu_to_sle64(mftbmp_ni->initialized_size); + if (mftbmp_ni->initialized_size > old_data_size) { + i_size_write(mftbmp_vi, mftbmp_ni->initialized_size); + a->data.non_resident.data_size = + cpu_to_sle64(mftbmp_ni->initialized_size); + } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + /* Initialize the mft bitmap attribute value with zeroes. */ + ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0); + if (likely(!ret)) { + ntfs_debug("Done. (Wrote eight initialized bytes to mft " + "bitmap."); + return 0; + } + ntfs_error(vol->sb, "Failed to write to mft bitmap."); + /* Try to recover from the error. */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record.%s", es); + NVolSetErrors(vol); + return ret; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context.%s", es); + NVolSetErrors(vol); + goto unm_err_out; + } + if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name, + mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft bitmap attribute.%s", es); + NVolSetErrors(vol); +put_err_out: + ntfs_attr_put_search_ctx(ctx); +unm_err_out: + unmap_mft_record(mft_ni); + goto err_out; + } + a = ctx->attr; + write_lock_irqsave(&mftbmp_ni->size_lock, flags); + mftbmp_ni->initialized_size = old_initialized_size; + a->data.non_resident.initialized_size = + cpu_to_sle64(old_initialized_size); + if (i_size_read(mftbmp_vi) != old_data_size) { + i_size_write(mftbmp_vi, old_data_size); + a->data.non_resident.data_size = cpu_to_sle64(old_data_size); + } + write_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, " + "data_size 0x%llx, initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(mftbmp_vi), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ +err_out: + return ret; +} + +/** + * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute + * @vol: volume on which to extend the mft data attribute + * + * Extend the mft data attribute on the ntfs volume @vol by 16 mft records + * worth of clusters or if not enough space for this by one mft record worth + * of clusters. + * + * Note: Only changes allocated_size, i.e. does not touch initialized_size or + * data_size. + * + * Return 0 on success and -errno on error. + * + * Locking: - Caller must hold vol->mftbmp_lock for writing. + * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for + * writing and releases it before returning. + * - This function calls functions which take vol->lcnbmp_lock for + * writing and release it before returning. + */ +static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) +{ + LCN lcn; + VCN old_last_vcn; + s64 min_nr, nr, ll; + unsigned long flags; + ntfs_inode *mft_ni; + runlist_element *rl, *rl2; + ntfs_attr_search_ctx *ctx = NULL; + MFT_RECORD *mrec; + ATTR_RECORD *a = NULL; + int ret, mp_size; + u32 old_alen = 0; + bool mp_rebuilt = false; + + ntfs_debug("Extending mft data allocation."); + mft_ni = NTFS_I(vol->mft_ino); + /* + * Determine the preferred allocation location, i.e. the last lcn of + * the mft data attribute. The allocated size of the mft data + * attribute cannot be zero so we are ok to do this. + */ + down_write(&mft_ni->runlist.lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + rl = ntfs_attr_find_vcn_nolock(mft_ni, + (ll - 1) >> vol->cluster_size_bits, NULL); + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { + up_write(&mft_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to determine last allocated " + "cluster of mft data attribute."); + if (!IS_ERR(rl)) + ret = -EIO; + else + ret = PTR_ERR(rl); + return ret; + } + lcn = rl->lcn + rl->length; + ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn); + /* Minimum allocation is one mft record worth of clusters. */ + min_nr = vol->mft_record_size >> vol->cluster_size_bits; + if (!min_nr) + min_nr = 1; + /* Want to allocate 16 mft records worth of clusters. */ + nr = vol->mft_record_size << 4 >> vol->cluster_size_bits; + if (!nr) + nr = min_nr; + /* Ensure we do not go above 2^32-1 mft records. */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->allocated_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> + vol->mft_record_size_bits >= (1ll << 32))) { + nr = min_nr; + if (unlikely((ll + (nr << vol->cluster_size_bits)) >> + vol->mft_record_size_bits >= (1ll << 32))) { + ntfs_warning(vol->sb, "Cannot allocate mft record " + "because the maximum number of inodes " + "(2^32) has already been reached."); + up_write(&mft_ni->runlist.lock); + return -ENOSPC; + } + } + ntfs_debug("Trying mft data allocation with %s cluster count %lli.", + nr > min_nr ? "default" : "minimal", (long long)nr); + old_last_vcn = rl[1].vcn; + do { + rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, + true); + if (!IS_ERR(rl2)) + break; + if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { + ntfs_error(vol->sb, "Failed to allocate the minimal " + "number of clusters (%lli) for the " + "mft data attribute.", (long long)nr); + up_write(&mft_ni->runlist.lock); + return PTR_ERR(rl2); + } + /* + * There is not enough space to do the allocation, but there + * might be enough space to do a minimal allocation so try that + * before failing. + */ + nr = min_nr; + ntfs_debug("Retrying mft data allocation with minimal cluster " + "count %lli.", (long long)nr); + } while (1); + rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2); + if (IS_ERR(rl)) { + up_write(&mft_ni->runlist.lock); + ntfs_error(vol->sb, "Failed to merge runlists for mft data " + "attribute."); + if (ntfs_cluster_free_from_rl(vol, rl2)) { + ntfs_error(vol->sb, "Failed to deallocate clusters " + "from the mft data attribute.%s", es); + NVolSetErrors(vol); + } + ntfs_free(rl2); + return PTR_ERR(rl); + } + mft_ni->runlist.rl = rl; + ntfs_debug("Allocated %lli clusters.", (long long)nr); + /* Find the last run in the new runlist. */ + for (; rl[1].length; rl++) + ; + /* Update the attribute record as well. */ + mrec = map_mft_record(mft_ni); + if (IS_ERR(mrec)) { + ntfs_error(vol->sb, "Failed to map mft record."); + ret = PTR_ERR(mrec); + goto undo_alloc; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, mrec); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + ret = -ENOMEM; + goto undo_alloc; + } + ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft data attribute."); + if (ret == -ENOENT) + ret = -EIO; + goto undo_alloc; + } + a = ctx->attr; + ll = sle64_to_cpu(a->data.non_resident.lowest_vcn); + /* Search back for the previous last allocated cluster of mft bitmap. */ + for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) { + if (ll >= rl2->vcn) + break; + } + BUG_ON(ll < rl2->vcn); + BUG_ON(ll >= rl2->vcn + rl2->length); + /* Get the size for the new mapping pairs array for this extent. */ + mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1); + if (unlikely(mp_size <= 0)) { + ntfs_error(vol->sb, "Get size for mapping pairs failed for " + "mft data attribute extent."); + ret = mp_size; + if (!ret) + ret = -EIO; + goto undo_alloc; + } + /* Expand the attribute record if necessary. */ + old_alen = le32_to_cpu(a->length); + ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); + if (unlikely(ret)) { + if (ret != -ENOSPC) { + ntfs_error(vol->sb, "Failed to resize attribute " + "record for mft data attribute."); + goto undo_alloc; + } + // TODO: Deal with this by moving this extent to a new mft + // record or by starting a new extent in a new mft record or by + // moving other attributes out of this mft record. + // Note: Use the special reserved mft records and ensure that + // this extent is not required to find the mft record in + // question. If no free special records left we would need to + // move an existing record away, insert ours in its place, and + // then place the moved record into the newly allocated space + // and we would then need to update all references to this mft + // record appropriately. This is rather complicated... + ntfs_error(vol->sb, "Not enough space in this mft record to " + "accommodate extended mft data attribute " + "extent. Cannot handle this yet."); + ret = -EOPNOTSUPP; + goto undo_alloc; + } + mp_rebuilt = true; + /* Generate the mapping pairs array directly into the attr record. */ + ret = ntfs_mapping_pairs_build(vol, (u8*)a + + le16_to_cpu(a->data.non_resident.mapping_pairs_offset), + mp_size, rl2, ll, -1, NULL); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to build mapping pairs array of " + "mft data attribute."); + goto undo_alloc; + } + /* Update the highest_vcn. */ + a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1); + /* + * We now have extended the mft data allocated_size by nr clusters. + * Reflect this in the ntfs_inode structure and the attribute record. + * @rl is the last (non-terminator) runlist element of mft data + * attribute. + */ + if (a->data.non_resident.lowest_vcn) { + /* + * We are not in the first attribute extent, switch to it, but + * first ensure the changes will make it to disk later. + */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_reinit_search_ctx(ctx); + ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, + mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, + ctx); + if (unlikely(ret)) { + ntfs_error(vol->sb, "Failed to find first attribute " + "extent of mft data attribute."); + goto restore_undo_alloc; + } + a = ctx->attr; + } + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->allocated_size += nr << vol->cluster_size_bits; + a->data.non_resident.allocated_size = + cpu_to_sle64(mft_ni->allocated_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + ntfs_debug("Done."); + return 0; +restore_undo_alloc: + ntfs_attr_reinit_search_ctx(ctx); + if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) { + ntfs_error(vol->sb, "Failed to find last attribute extent of " + "mft data attribute.%s", es); + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->allocated_size += nr << vol->cluster_size_bits; + write_unlock_irqrestore(&mft_ni->size_lock, flags); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + /* + * The only thing that is now wrong is ->allocated_size of the + * base attribute extent which chkdsk should be able to fix. + */ + NVolSetErrors(vol); + return ret; + } + ctx->attr->data.non_resident.highest_vcn = + cpu_to_sle64(old_last_vcn - 1); +undo_alloc: + if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) { + ntfs_error(vol->sb, "Failed to free clusters from mft data " + "attribute.%s", es); + NVolSetErrors(vol); + } + a = ctx->attr; + if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) { + ntfs_error(vol->sb, "Failed to truncate mft data attribute " + "runlist.%s", es); + NVolSetErrors(vol); + } + if (mp_rebuilt && !IS_ERR(ctx->mrec)) { + if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + old_alen - le16_to_cpu( + a->data.non_resident.mapping_pairs_offset), + rl2, ll, -1, NULL)) { + ntfs_error(vol->sb, "Failed to restore mapping pairs " + "array.%s", es); + NVolSetErrors(vol); + } + if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) { + ntfs_error(vol->sb, "Failed to restore attribute " + "record.%s", es); + NVolSetErrors(vol); + } + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + } else if (IS_ERR(ctx->mrec)) { + ntfs_error(vol->sb, "Failed to restore attribute search " + "context.%s", es); + NVolSetErrors(vol); + } + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (!IS_ERR(mrec)) + unmap_mft_record(mft_ni); + up_write(&mft_ni->runlist.lock); + return ret; +} + +/** + * ntfs_mft_record_layout - layout an mft record into a memory buffer + * @vol: volume to which the mft record will belong + * @mft_no: mft reference specifying the mft record number + * @m: destination buffer of size >= @vol->mft_record_size bytes + * + * Layout an empty, unused mft record with the mft record number @mft_no into + * the buffer @m. The volume @vol is needed because the mft record structure + * was modified in NTFS 3.1 so we need to know which volume version this mft + * record will be used on. + * + * Return 0 on success and -errno on error. + */ +static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no, + MFT_RECORD *m) +{ + ATTR_RECORD *a; + + ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); + if (mft_no >= (1ll << 32)) { + ntfs_error(vol->sb, "Mft record number 0x%llx exceeds " + "maximum of 2^32.", (long long)mft_no); + return -ERANGE; + } + /* Start by clearing the whole mft record to gives us a clean slate. */ + memset(m, 0, vol->mft_record_size); + /* Aligned to 2-byte boundary. */ + if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver)) + m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1); + else { + m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1); + /* + * Set the NTFS 3.1+ specific fields while we know that the + * volume version is 3.1+. + */ + m->reserved = 0; + m->mft_record_number = cpu_to_le32((u32)mft_no); + } + m->magic = magic_FILE; + if (vol->mft_record_size >= NTFS_BLOCK_SIZE) + m->usa_count = cpu_to_le16(vol->mft_record_size / + NTFS_BLOCK_SIZE + 1); + else { + m->usa_count = cpu_to_le16(1); + ntfs_warning(vol->sb, "Sector size is bigger than mft record " + "size. Setting usa_count to 1. If chkdsk " + "reports this as corruption, please email " + "linux-ntfs-dev@lists.sourceforge.net stating " + "that you saw this message and that the " + "modified filesystem created was corrupt. " + "Thank you."); + } + /* Set the update sequence number to 1. */ + *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1); + m->lsn = 0; + m->sequence_number = cpu_to_le16(1); + m->link_count = 0; + /* + * Place the attributes straight after the update sequence array, + * aligned to 8-byte boundary. + */ + m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) + + (le16_to_cpu(m->usa_count) << 1) + 7) & ~7); + m->flags = 0; + /* + * Using attrs_offset plus eight bytes (for the termination attribute). + * attrs_offset is already aligned to 8-byte boundary, so no need to + * align again. + */ + m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8); + m->bytes_allocated = cpu_to_le32(vol->mft_record_size); + m->base_mft_record = 0; + m->next_attr_instance = 0; + /* Add the termination attribute. */ + a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset)); + a->type = AT_END; + a->length = 0; + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_mft_record_format - format an mft record on an ntfs volume + * @vol: volume on which to format the mft record + * @mft_no: mft record number to format + * + * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused + * mft record into the appropriate place of the mft data attribute. This is + * used when extending the mft data attribute. + * + * Return 0 on success and -errno on error. + */ +static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) +{ + loff_t i_size; + struct inode *mft_vi = vol->mft_ino; + struct page *page; + MFT_RECORD *m; + pgoff_t index, end_index; + unsigned int ofs; + int err; + + ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no); + /* + * The index into the page cache and the offset within the page cache + * page of the wanted mft record. + */ + index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT; + ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK; + /* The maximum valid index into the page cache for $MFT's data. */ + i_size = i_size_read(mft_vi); + end_index = i_size >> PAGE_SHIFT; + if (unlikely(index >= end_index)) { + if (unlikely(index > end_index || ofs + vol->mft_record_size >= + (i_size & ~PAGE_MASK))) { + ntfs_error(vol->sb, "Tried to format non-existing mft " + "record 0x%llx.", (long long)mft_no); + return -ENOENT; + } + } + /* Read, map, and pin the page containing the mft record. */ + page = ntfs_map_page(mft_vi->i_mapping, index); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map page containing mft record " + "to format 0x%llx.", (long long)mft_no); + return PTR_ERR(page); + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + m = (MFT_RECORD*)((u8*)page_address(page) + ofs); + err = ntfs_mft_record_layout(vol, mft_no, m); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.", + (long long)mft_no); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + return err; + } + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + /* + * Make sure the mft record is written out to disk. We could use + * ilookup5() to check if an inode is in icache and so on but this is + * unnecessary as ntfs_writepage() will write the dirty record anyway. + */ + mark_ntfs_record_dirty(page, ofs); + ntfs_unmap_page(page); + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume + * @vol: [IN] volume on which to allocate the mft record + * @mode: [IN] mode if want a file or directory, i.e. base inode or 0 + * @base_ni: [IN] open base inode if allocating an extent mft record or NULL + * @mrec: [OUT] on successful return this is the mapped mft record + * + * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol. + * + * If @base_ni is NULL make the mft record a base mft record, i.e. a file or + * direvctory inode, and allocate it at the default allocator position. In + * this case @mode is the file mode as given to us by the caller. We in + * particular use @mode to distinguish whether a file or a directory is being + * created (S_IFDIR(mode) and S_IFREG(mode), respectively). + * + * If @base_ni is not NULL make the allocated mft record an extent record, + * allocate it starting at the mft record after the base mft record and attach + * the allocated and opened ntfs inode to the base inode @base_ni. In this + * case @mode must be 0 as it is meaningless for extent inodes. + * + * You need to check the return value with IS_ERR(). If false, the function + * was successful and the return value is the now opened ntfs inode of the + * allocated mft record. *@mrec is then set to the allocated, mapped, pinned, + * and locked mft record. If IS_ERR() is true, the function failed and the + * error code is obtained from PTR_ERR(return value). *@mrec is undefined in + * this case. + * + * Allocation strategy: + * + * To find a free mft record, we scan the mft bitmap for a zero bit. To + * optimize this we start scanning at the place specified by @base_ni or if + * @base_ni is NULL we start where we last stopped and we perform wrap around + * when we reach the end. Note, we do not try to allocate mft records below + * number 24 because numbers 0 to 15 are the defined system files anyway and 16 + * to 24 are special in that they are used for storing extension mft records + * for the $DATA attribute of $MFT. This is required to avoid the possibility + * of creating a runlist with a circular dependency which once written to disk + * can never be read in again. Windows will only use records 16 to 24 for + * normal files if the volume is completely out of space. We never use them + * which means that when the volume is really out of space we cannot create any + * more files while Windows can still create up to 8 small files. We can start + * doing this at some later time, it does not matter much for now. + * + * When scanning the mft bitmap, we only search up to the last allocated mft + * record. If there are no free records left in the range 24 to number of + * allocated mft records, then we extend the $MFT/$DATA attribute in order to + * create free mft records. We extend the allocated size of $MFT/$DATA by 16 + * records at a time or one cluster, if cluster size is above 16kiB. If there + * is not sufficient space to do this, we try to extend by a single mft record + * or one cluster, if cluster size is above the mft record size. + * + * No matter how many mft records we allocate, we initialize only the first + * allocated mft record, incrementing mft data size and initialized size + * accordingly, open an ntfs_inode for it and return it to the caller, unless + * there are less than 24 mft records, in which case we allocate and initialize + * mft records until we reach record 24 which we consider as the first free mft + * record for use by normal files. + * + * If during any stage we overflow the initialized data in the mft bitmap, we + * extend the initialized size (and data size) by 8 bytes, allocating another + * cluster if required. The bitmap data size has to be at least equal to the + * number of mft records in the mft, but it can be bigger, in which case the + * superflous bits are padded with zeroes. + * + * Thus, when we return successfully (IS_ERR() is false), we will have: + * - initialized / extended the mft bitmap if necessary, + * - initialized / extended the mft data if necessary, + * - set the bit corresponding to the mft record being allocated in the + * mft bitmap, + * - opened an ntfs_inode for the allocated mft record, and we will have + * - returned the ntfs_inode as well as the allocated mapped, pinned, and + * locked mft record. + * + * On error, the volume will be left in a consistent state and no record will + * be allocated. If rolling back a partial operation fails, we may leave some + * inconsistent metadata in which case we set NVolErrors() so the volume is + * left dirty when unmounted. + * + * Note, this function cannot make use of most of the normal functions, like + * for example for attribute resizing, etc, because when the run list overflows + * the base mft record and an attribute list is used, it is very important that + * the extension mft records used to store the $DATA attribute of $MFT can be + * reached without having to read the information contained inside them, as + * this would make it impossible to find them in the first place after the + * volume is unmounted. $MFT/$BITMAP probably does not need to follow this + * rule because the bitmap is not essential for finding the mft records, but on + * the other hand, handling the bitmap in this special way would make life + * easier because otherwise there might be circular invocations of functions + * when reading the bitmap. + */ +ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, + ntfs_inode *base_ni, MFT_RECORD **mrec) +{ + s64 ll, bit, old_data_initialized, old_data_size; + unsigned long flags; + struct inode *vi; + struct page *page; + ntfs_inode *mft_ni, *mftbmp_ni, *ni; + ntfs_attr_search_ctx *ctx; + MFT_RECORD *m; + ATTR_RECORD *a; + pgoff_t index; + unsigned int ofs; + int err; + le16 seq_no, usn; + bool record_formatted = false; + + if (base_ni) { + ntfs_debug("Entering (allocating an extent mft record for " + "base mft record 0x%llx).", + (long long)base_ni->mft_no); + /* @mode and @base_ni are mutually exclusive. */ + BUG_ON(mode); + } else + ntfs_debug("Entering (allocating a base mft record)."); + if (mode) { + /* @mode and @base_ni are mutually exclusive. */ + BUG_ON(base_ni); + /* We only support creation of normal files and directories. */ + if (!S_ISREG(mode) && !S_ISDIR(mode)) + return ERR_PTR(-EOPNOTSUPP); + } + BUG_ON(!mrec); + mft_ni = NTFS_I(vol->mft_ino); + mftbmp_ni = NTFS_I(vol->mftbmp_ino); + down_write(&vol->mftbmp_lock); + bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni); + if (bit >= 0) { + ntfs_debug("Found and allocated free record (#1), bit 0x%llx.", + (long long)bit); + goto have_alloc_rec; + } + if (bit != -ENOSPC) { + up_write(&vol->mftbmp_lock); + return ERR_PTR(bit); + } + /* + * No free mft records left. If the mft bitmap already covers more + * than the currently used mft records, the next records are all free, + * so we can simply allocate the first unused mft record. + * Note: We also have to make sure that the mft bitmap at least covers + * the first 24 mft records as they are special and whilst they may not + * be in use, we do not allocate from them. + */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ll = mft_ni->initialized_size >> vol->mft_record_size_bits; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_initialized = mftbmp_ni->initialized_size; + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized << 3 > ll && old_data_initialized > 3) { + bit = ll; + if (bit < 24) + bit = 24; + if (unlikely(bit >= (1ll << 32))) + goto max_err_out; + ntfs_debug("Found free record (#2), bit 0x%llx.", + (long long)bit); + goto found_free_rec; + } + /* + * The mft bitmap needs to be expanded until it covers the first unused + * mft record that we can allocate. + * Note: The smallest mft record we allocate is mft record 24. + */ + bit = old_data_initialized << 3; + if (unlikely(bit >= (1ll << 32))) + goto max_err_out; + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + old_data_size = mftbmp_ni->allocated_size; + ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, " + "data_size 0x%llx, initialized_size 0x%llx.", + (long long)old_data_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)old_data_initialized); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); + if (old_data_initialized + 8 > old_data_size) { + /* Need to extend bitmap by one more cluster. */ + ntfs_debug("mftbmp: initialized_size + 8 > allocated_size."); + err = ntfs_mft_bitmap_extend_allocation_nolock(vol); + if (unlikely(err)) { + up_write(&vol->mftbmp_lock); + goto err_out; + } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Status of mftbmp after allocation extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ + } + /* + * We now have sufficient allocated space, extend the initialized_size + * as well as the data_size if necessary and fill the new space with + * zeroes. + */ + err = ntfs_mft_bitmap_extend_initialized_nolock(vol); + if (unlikely(err)) { + up_write(&vol->mftbmp_lock); + goto err_out; + } +#ifdef DEBUG + read_lock_irqsave(&mftbmp_ni->size_lock, flags); + ntfs_debug("Status of mftbmp after initialized extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mftbmp_ni->allocated_size, + (long long)i_size_read(vol->mftbmp_ino), + (long long)mftbmp_ni->initialized_size); + read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); +#endif /* DEBUG */ + ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit); +found_free_rec: + /* @bit is the found free mft record, allocate it in the mft bitmap. */ + ntfs_debug("At found_free_rec."); + err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap."); + up_write(&vol->mftbmp_lock); + goto err_out; + } + ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit); +have_alloc_rec: + /* + * The mft bitmap is now uptodate. Deal with mft data attribute now. + * Note, we keep hold of the mft bitmap lock for writing until all + * modifications to the mft data attribute are complete, too, as they + * will impact decisions for mft bitmap and mft record allocation done + * by a parallel allocation and if the lock is not maintained a + * parallel allocation could allocate the same mft record as this one. + */ + ll = (bit + 1) << vol->mft_record_size_bits; + read_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + if (ll <= old_data_initialized) { + ntfs_debug("Allocated mft record already initialized."); + goto mft_rec_already_initialized; + } + ntfs_debug("Initializing allocated mft record."); + /* + * The mft record is outside the initialized data. Extend the mft data + * attribute until it covers the allocated record. The loop is only + * actually traversed more than once when a freshly formatted volume is + * first written to so it optimizes away nicely in the common case. + */ + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data before extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + while (ll > mft_ni->allocated_size) { + read_unlock_irqrestore(&mft_ni->size_lock, flags); + err = ntfs_mft_data_extend_allocation_nolock(vol); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to extend mft data " + "allocation."); + goto undo_mftbmp_alloc_nolock; + } + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data after allocation extension: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + } + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* + * Extend mft data initialized size (and data size of course) to reach + * the allocated mft record, formatting the mft records allong the way. + * Note: We only modify the ntfs_inode structure as that is all that is + * needed by ntfs_mft_record_format(). We will update the attribute + * record itself in one fell swoop later on. + */ + write_lock_irqsave(&mft_ni->size_lock, flags); + old_data_initialized = mft_ni->initialized_size; + old_data_size = vol->mft_ino->i_size; + while (ll > mft_ni->initialized_size) { + s64 new_initialized_size, mft_no; + + new_initialized_size = mft_ni->initialized_size + + vol->mft_record_size; + mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits; + if (new_initialized_size > i_size_read(vol->mft_ino)) + i_size_write(vol->mft_ino, new_initialized_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + ntfs_debug("Initializing mft record 0x%llx.", + (long long)mft_no); + err = ntfs_mft_record_format(vol, mft_no); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to format mft record."); + goto undo_data_init; + } + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->initialized_size = new_initialized_size; + } + write_unlock_irqrestore(&mft_ni->size_lock, flags); + record_formatted = true; + /* Update the mft data attribute record to reflect the new sizes. */ + m = map_mft_record(mft_ni); + if (IS_ERR(m)) { + ntfs_error(vol->sb, "Failed to map mft record."); + err = PTR_ERR(m); + goto undo_data_init; + } + ctx = ntfs_attr_get_search_ctx(mft_ni, m); + if (unlikely(!ctx)) { + ntfs_error(vol->sb, "Failed to get search context."); + err = -ENOMEM; + unmap_mft_record(mft_ni); + goto undo_data_init; + } + err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len, + CASE_SENSITIVE, 0, NULL, 0, ctx); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to find first attribute extent of " + "mft data attribute."); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + goto undo_data_init; + } + a = ctx->attr; + read_lock_irqsave(&mft_ni->size_lock, flags); + a->data.non_resident.initialized_size = + cpu_to_sle64(mft_ni->initialized_size); + a->data.non_resident.data_size = + cpu_to_sle64(i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Ensure the changes make it to disk. */ + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(mft_ni); + read_lock_irqsave(&mft_ni->size_lock, flags); + ntfs_debug("Status of mft data after mft record initialization: " + "allocated_size 0x%llx, data_size 0x%llx, " + "initialized_size 0x%llx.", + (long long)mft_ni->allocated_size, + (long long)i_size_read(vol->mft_ino), + (long long)mft_ni->initialized_size); + BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size); + BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino)); + read_unlock_irqrestore(&mft_ni->size_lock, flags); +mft_rec_already_initialized: + /* + * We can finally drop the mft bitmap lock as the mft data attribute + * has been fully updated. The only disparity left is that the + * allocated mft record still needs to be marked as in use to match the + * set bit in the mft bitmap but this is actually not a problem since + * this mft record is not referenced from anywhere yet and the fact + * that it is allocated in the mft bitmap means that no-one will try to + * allocate it either. + */ + up_write(&vol->mftbmp_lock); + /* + * We now have allocated and initialized the mft record. Calculate the + * index of and the offset within the page cache page the record is in. + */ + index = bit << vol->mft_record_size_bits >> PAGE_SHIFT; + ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK; + /* Read, map, and pin the page containing the mft record. */ + page = ntfs_map_page(vol->mft_ino->i_mapping, index); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to map page containing allocated " + "mft record 0x%llx.", (long long)bit); + err = PTR_ERR(page); + goto undo_mftbmp_alloc; + } + lock_page(page); + BUG_ON(!PageUptodate(page)); + ClearPageUptodate(page); + m = (MFT_RECORD*)((u8*)page_address(page) + ofs); + /* If we just formatted the mft record no need to do it again. */ + if (!record_formatted) { + /* Sanity check that the mft record is really not in use. */ + if (ntfs_is_file_record(m->magic) && + (m->flags & MFT_RECORD_IN_USE)) { + ntfs_error(vol->sb, "Mft record 0x%llx was marked " + "free in mft bitmap but is marked " + "used itself. Corrupt filesystem. " + "Unmount and run chkdsk.", + (long long)bit); + err = -EIO; + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + NVolSetErrors(vol); + goto undo_mftbmp_alloc; + } + /* + * We need to (re-)format the mft record, preserving the + * sequence number if it is not zero as well as the update + * sequence number if it is not zero or -1 (0xffff). This + * means we do not need to care whether or not something went + * wrong with the previous mft record. + */ + seq_no = m->sequence_number; + usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)); + err = ntfs_mft_record_layout(vol, bit, m); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to layout allocated mft " + "record 0x%llx.", (long long)bit); + SetPageUptodate(page); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + if (seq_no) + m->sequence_number = seq_no; + if (usn && le16_to_cpu(usn) != 0xffff) + *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn; + } + /* Set the mft record itself in use. */ + m->flags |= MFT_RECORD_IN_USE; + if (S_ISDIR(mode)) + m->flags |= MFT_RECORD_IS_DIRECTORY; + flush_dcache_page(page); + SetPageUptodate(page); + if (base_ni) { + MFT_RECORD *m_tmp; + + /* + * Setup the base mft record in the extent mft record. This + * completes initialization of the allocated extent mft record + * and we can simply use it with map_extent_mft_record(). + */ + m->base_mft_record = MK_LE_MREF(base_ni->mft_no, + base_ni->seq_no); + /* + * Allocate an extent inode structure for the new mft record, + * attach it to the base inode @base_ni and map, pin, and lock + * its, i.e. the allocated, mft record. + */ + m_tmp = map_extent_mft_record(base_ni, bit, &ni); + if (IS_ERR(m_tmp)) { + ntfs_error(vol->sb, "Failed to map allocated extent " + "mft record 0x%llx.", (long long)bit); + err = PTR_ERR(m_tmp); + /* Set the mft record itself not in use. */ + m->flags &= cpu_to_le16( + ~le16_to_cpu(MFT_RECORD_IN_USE)); + flush_dcache_page(page); + /* Make sure the mft record is written out to disk. */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + BUG_ON(m != m_tmp); + /* + * Make sure the allocated mft record is written out to disk. + * No need to set the inode dirty because the caller is going + * to do that anyway after finishing with the new extent mft + * record (e.g. at a minimum a new attribute will be added to + * the mft record. + */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + /* + * Need to unmap the page since map_extent_mft_record() mapped + * it as well so we have it mapped twice at the moment. + */ + ntfs_unmap_page(page); + } else { + /* + * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink + * is set to 1 but the mft record->link_count is 0. The caller + * needs to bear this in mind. + */ + vi = new_inode(vol->sb); + if (unlikely(!vi)) { + err = -ENOMEM; + /* Set the mft record itself not in use. */ + m->flags &= cpu_to_le16( + ~le16_to_cpu(MFT_RECORD_IN_USE)); + flush_dcache_page(page); + /* Make sure the mft record is written out to disk. */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + ntfs_unmap_page(page); + goto undo_mftbmp_alloc; + } + vi->i_ino = bit; + + /* The owner and group come from the ntfs volume. */ + vi->i_uid = vol->uid; + vi->i_gid = vol->gid; + + /* Initialize the ntfs specific part of @vi. */ + ntfs_init_big_inode(vi); + ni = NTFS_I(vi); + /* + * Set the appropriate mode, attribute type, and name. For + * directories, also setup the index values to the defaults. + */ + if (S_ISDIR(mode)) { + vi->i_mode = S_IFDIR | S_IRWXUGO; + vi->i_mode &= ~vol->dmask; + + NInoSetMstProtected(ni); + ni->type = AT_INDEX_ALLOCATION; + ni->name = I30; + ni->name_len = 4; + + ni->itype.index.block_size = 4096; + ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1; + ni->itype.index.collation_rule = COLLATION_FILE_NAME; + if (vol->cluster_size <= ni->itype.index.block_size) { + ni->itype.index.vcn_size = vol->cluster_size; + ni->itype.index.vcn_size_bits = + vol->cluster_size_bits; + } else { + ni->itype.index.vcn_size = vol->sector_size; + ni->itype.index.vcn_size_bits = + vol->sector_size_bits; + } + } else { + vi->i_mode = S_IFREG | S_IRWXUGO; + vi->i_mode &= ~vol->fmask; + + ni->type = AT_DATA; + ni->name = NULL; + ni->name_len = 0; + } + if (IS_RDONLY(vi)) + vi->i_mode &= ~S_IWUGO; + + /* Set the inode times to the current time. */ + vi->i_atime = vi->i_mtime = vi->i_ctime = + current_time(vi); + /* + * Set the file size to 0, the ntfs inode sizes are set to 0 by + * the call to ntfs_init_big_inode() below. + */ + vi->i_size = 0; + vi->i_blocks = 0; + + /* Set the sequence number. */ + vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number); + /* + * Manually map, pin, and lock the mft record as we already + * have its page mapped and it is very easy to do. + */ + atomic_inc(&ni->count); + mutex_lock(&ni->mrec_lock); + ni->page = page; + ni->page_ofs = ofs; + /* + * Make sure the allocated mft record is written out to disk. + * NOTE: We do not set the ntfs inode dirty because this would + * fail in ntfs_write_inode() because the inode does not have a + * standard information attribute yet. Also, there is no need + * to set the inode dirty because the caller is going to do + * that anyway after finishing with the new mft record (e.g. at + * a minimum some new attributes will be added to the mft + * record. + */ + mark_ntfs_record_dirty(page, ofs); + unlock_page(page); + + /* Add the inode to the inode hash for the superblock. */ + insert_inode_hash(vi); + + /* Update the default mft allocation position. */ + vol->mft_data_pos = bit + 1; + } + /* + * Return the opened, allocated inode of the allocated mft record as + * well as the mapped, pinned, and locked mft record. + */ + ntfs_debug("Returning opened, allocated %sinode 0x%llx.", + base_ni ? "extent " : "", (long long)bit); + *mrec = m; + return ni; +undo_data_init: + write_lock_irqsave(&mft_ni->size_lock, flags); + mft_ni->initialized_size = old_data_initialized; + i_size_write(vol->mft_ino, old_data_size); + write_unlock_irqrestore(&mft_ni->size_lock, flags); + goto undo_mftbmp_alloc_nolock; +undo_mftbmp_alloc: + down_write(&vol->mftbmp_lock); +undo_mftbmp_alloc_nolock: + if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) { + ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); + NVolSetErrors(vol); + } + up_write(&vol->mftbmp_lock); +err_out: + return ERR_PTR(err); +max_err_out: + ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum " + "number of inodes (2^32) has already been reached."); + up_write(&vol->mftbmp_lock); + return ERR_PTR(-ENOSPC); +} + +/** + * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume + * @ni: ntfs inode of the mapped extent mft record to free + * @m: mapped extent mft record of the ntfs inode @ni + * + * Free the mapped extent mft record @m of the extent ntfs inode @ni. + * + * Note that this function unmaps the mft record and closes and destroys @ni + * internally and hence you cannot use either @ni nor @m any more after this + * function returns success. + * + * On success return 0 and on error return -errno. @ni and @m are still valid + * in this case and have not been freed. + * + * For some errors an error message is displayed and the success code 0 is + * returned and the volume is then left dirty on umount. This makes sense in + * case we could not rollback the changes that were already done since the + * caller no longer wants to reference this mft record so it does not matter to + * the caller if something is wrong with it as long as it is properly detached + * from the base inode. + */ +int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m) +{ + unsigned long mft_no = ni->mft_no; + ntfs_volume *vol = ni->vol; + ntfs_inode *base_ni; + ntfs_inode **extent_nis; + int i, err; + le16 old_seq_no; + u16 seq_no; + + BUG_ON(NInoAttr(ni)); + BUG_ON(ni->nr_extents != -1); + + mutex_lock(&ni->extent_lock); + base_ni = ni->ext.base_ntfs_ino; + mutex_unlock(&ni->extent_lock); + + BUG_ON(base_ni->nr_extents <= 0); + + ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n", + mft_no, base_ni->mft_no); + + mutex_lock(&base_ni->extent_lock); + + /* Make sure we are holding the only reference to the extent inode. */ + if (atomic_read(&ni->count) > 2) { + ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, " + "not freeing.", base_ni->mft_no); + mutex_unlock(&base_ni->extent_lock); + return -EBUSY; + } + + /* Dissociate the ntfs inode from the base inode. */ + extent_nis = base_ni->ext.extent_ntfs_inos; + err = -ENOENT; + for (i = 0; i < base_ni->nr_extents; i++) { + if (ni != extent_nis[i]) + continue; + extent_nis += i; + base_ni->nr_extents--; + memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) * + sizeof(ntfs_inode*)); + err = 0; + break; + } + + mutex_unlock(&base_ni->extent_lock); + + if (unlikely(err)) { + ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to " + "its base inode 0x%lx.", mft_no, + base_ni->mft_no); + BUG(); + } + + /* + * The extent inode is no longer attached to the base inode so no one + * can get a reference to it any more. + */ + + /* Mark the mft record as not in use. */ + m->flags &= ~MFT_RECORD_IN_USE; + + /* Increment the sequence number, skipping zero, if it is not zero. */ + old_seq_no = m->sequence_number; + seq_no = le16_to_cpu(old_seq_no); + if (seq_no == 0xffff) + seq_no = 1; + else if (seq_no) + seq_no++; + m->sequence_number = cpu_to_le16(seq_no); + + /* + * Set the ntfs inode dirty and write it out. We do not need to worry + * about the base inode here since whatever caused the extent mft + * record to be freed is guaranteed to do it already. + */ + NInoSetDirty(ni); + err = write_mft_record(ni, m, 0); + if (unlikely(err)) { + ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not " + "freeing.", mft_no); + goto rollback; + } +rollback_error: + /* Unmap and throw away the now freed extent inode. */ + unmap_extent_mft_record(ni); + ntfs_clear_extent_inode(ni); + + /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */ + down_write(&vol->mftbmp_lock); + err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no); + up_write(&vol->mftbmp_lock); + if (unlikely(err)) { + /* + * The extent inode is gone but we failed to deallocate it in + * the mft bitmap. Just emit a warning and leave the volume + * dirty on umount. + */ + ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es); + NVolSetErrors(vol); + } + return 0; +rollback: + /* Rollback what we did... */ + mutex_lock(&base_ni->extent_lock); + extent_nis = base_ni->ext.extent_ntfs_inos; + if (!(base_ni->nr_extents & 3)) { + int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*); + + extent_nis = kmalloc(new_size, GFP_NOFS); + if (unlikely(!extent_nis)) { + ntfs_error(vol->sb, "Failed to allocate internal " + "buffer during rollback.%s", es); + mutex_unlock(&base_ni->extent_lock); + NVolSetErrors(vol); + goto rollback_error; + } + if (base_ni->nr_extents) { + BUG_ON(!base_ni->ext.extent_ntfs_inos); + memcpy(extent_nis, base_ni->ext.extent_ntfs_inos, + new_size - 4 * sizeof(ntfs_inode*)); + kfree(base_ni->ext.extent_ntfs_inos); + } + base_ni->ext.extent_ntfs_inos = extent_nis; + } + m->flags |= MFT_RECORD_IN_USE; + m->sequence_number = old_seq_no; + extent_nis[base_ni->nr_extents++] = ni; + mutex_unlock(&base_ni->extent_lock); + mark_mft_record_dirty(ni); + return err; +} +#endif /* NTFS_RW */ diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h new file mode 100644 index 000000000..49c001af1 --- /dev/null +++ b/fs/ntfs/mft.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * mft.h - Defines for mft record handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_MFT_H +#define _LINUX_NTFS_MFT_H + +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> + +#include "inode.h" + +extern MFT_RECORD *map_mft_record(ntfs_inode *ni); +extern void unmap_mft_record(ntfs_inode *ni); + +extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, + ntfs_inode **ntfs_ino); + +static inline void unmap_extent_mft_record(ntfs_inode *ni) +{ + unmap_mft_record(ni); + return; +} + +#ifdef NTFS_RW + +/** + * flush_dcache_mft_record_page - flush_dcache_page() for mft records + * @ni: ntfs inode structure of mft record + * + * Call flush_dcache_page() for the page in which an mft record resides. + * + * This must be called every time an mft record is modified, just after the + * modification. + */ +static inline void flush_dcache_mft_record_page(ntfs_inode *ni) +{ + flush_dcache_page(ni->page); +} + +extern void __mark_mft_record_dirty(ntfs_inode *ni); + +/** + * mark_mft_record_dirty - set the mft record and the page containing it dirty + * @ni: ntfs inode describing the mapped mft record + * + * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, + * as well as the page containing the mft record, dirty. Also, mark the base + * vfs inode dirty. This ensures that any changes to the mft record are + * written out to disk. + * + * NOTE: Do not do anything if the mft record is already marked dirty. + */ +static inline void mark_mft_record_dirty(ntfs_inode *ni) +{ + if (!NInoTestSetDirty(ni)) + __mark_mft_record_dirty(ni); +} + +extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, + MFT_RECORD *m, int sync); + +extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); + +/** + * write_mft_record - write out a mapped (extent) mft record + * @ni: ntfs inode describing the mapped (extent) mft record + * @m: mapped (extent) mft record to write + * @sync: if true, wait for i/o completion + * + * This is just a wrapper for write_mft_record_nolock() (see mft.c), which + * locks the page for the duration of the write. This ensures that there are + * no race conditions between writing the mft record via the dirty inode code + * paths and via the page cache write back code paths or between writing + * neighbouring mft records residing in the same page. + * + * Locking the page also serializes us against ->read_folio() if the page is not + * uptodate. + * + * On success, clean the mft record and return 0. On error, leave the mft + * record dirty and return -errno. + */ +static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync) +{ + struct page *page = ni->page; + int err; + + BUG_ON(!page); + lock_page(page); + err = write_mft_record_nolock(ni, m, sync); + unlock_page(page); + return err; +} + +extern bool ntfs_may_write_mft_record(ntfs_volume *vol, + const unsigned long mft_no, const MFT_RECORD *m, + ntfs_inode **locked_ni); + +extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, + ntfs_inode *base_ni, MFT_RECORD **mrec); +extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_MFT_H */ diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c new file mode 100644 index 000000000..16b3c884a --- /dev/null +++ b/fs/ntfs/mst.c @@ -0,0 +1,189 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * mst.c - NTFS multi sector transfer protection handling code. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2001-2004 Anton Altaparmakov + */ + +#include "ntfs.h" + +/** + * post_read_mst_fixup - deprotect multi sector transfer protected data + * @b: pointer to the data to deprotect + * @size: size in bytes of @b + * + * Perform the necessary post read multi sector transfer fixup and detect the + * presence of incomplete multi sector transfers. - In that case, overwrite the + * magic of the ntfs record header being processed with "BAAD" (in memory only!) + * and abort processing. + * + * Return 0 on success and -EINVAL on error ("BAAD" magic will be present). + * + * NOTE: We consider the absence / invalidity of an update sequence array to + * mean that the structure is not protected at all and hence doesn't need to + * be fixed up. Thus, we return success and not failure in this case. This is + * in contrast to pre_write_mst_fixup(), see below. + */ +int post_read_mst_fixup(NTFS_RECORD *b, const u32 size) +{ + u16 usa_ofs, usa_count, usn; + u16 *usa_pos, *data_pos; + + /* Setup the variables. */ + usa_ofs = le16_to_cpu(b->usa_ofs); + /* Decrement usa_count to get number of fixups. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + /* Size and alignment checks. */ + if ( size & (NTFS_BLOCK_SIZE - 1) || + usa_ofs & 1 || + usa_ofs + (usa_count * 2) > size || + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) + return 0; + /* Position of usn in update sequence array. */ + usa_pos = (u16*)b + usa_ofs/sizeof(u16); + /* + * The update sequence number which has to be equal to each of the + * u16 values before they are fixed up. Note no need to care for + * endianness since we are comparing and moving data for on disk + * structures which means the data is consistent. - If it is + * consistenty the wrong endianness it doesn't make any difference. + */ + usn = *usa_pos; + /* + * Position in protected data of first u16 that needs fixing up. + */ + data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; + /* + * Check for incomplete multi sector transfer(s). + */ + while (usa_count--) { + if (*data_pos != usn) { + /* + * Incomplete multi sector transfer detected! )-: + * Set the magic to "BAAD" and return failure. + * Note that magic_BAAD is already converted to le32. + */ + b->magic = magic_BAAD; + return -EINVAL; + } + data_pos += NTFS_BLOCK_SIZE/sizeof(u16); + } + /* Re-setup the variables. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1; + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment position in usa and restore original data from + * the usa into the data buffer. + */ + *data_pos = *(++usa_pos); + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(u16); + } + return 0; +} + +/** + * pre_write_mst_fixup - apply multi sector transfer protection + * @b: pointer to the data to protect + * @size: size in bytes of @b + * + * Perform the necessary pre write multi sector transfer fixup on the data + * pointer to by @b of @size. + * + * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed + * (assumed not needed). This is in contrast to post_read_mst_fixup() above. + * + * NOTE: We consider the absence / invalidity of an update sequence array to + * mean that the structure is not subject to protection and hence doesn't need + * to be fixed up. This means that you have to create a valid update sequence + * array header in the ntfs record before calling this function, otherwise it + * will fail (the header needs to contain the position of the update sequence + * array together with the number of elements in the array). You also need to + * initialise the update sequence number before calling this function + * otherwise a random word will be used (whatever was in the record at that + * position at that time). + */ +int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size) +{ + le16 *usa_pos, *data_pos; + u16 usa_ofs, usa_count, usn; + le16 le_usn; + + /* Sanity check + only fixup if it makes sense. */ + if (!b || ntfs_is_baad_record(b->magic) || + ntfs_is_hole_record(b->magic)) + return -EINVAL; + /* Setup the variables. */ + usa_ofs = le16_to_cpu(b->usa_ofs); + /* Decrement usa_count to get number of fixups. */ + usa_count = le16_to_cpu(b->usa_count) - 1; + /* Size and alignment checks. */ + if ( size & (NTFS_BLOCK_SIZE - 1) || + usa_ofs & 1 || + usa_ofs + (usa_count * 2) > size || + (size >> NTFS_BLOCK_SIZE_BITS) != usa_count) + return -EINVAL; + /* Position of usn in update sequence array. */ + usa_pos = (le16*)((u8*)b + usa_ofs); + /* + * Cyclically increment the update sequence number + * (skipping 0 and -1, i.e. 0xffff). + */ + usn = le16_to_cpup(usa_pos) + 1; + if (usn == 0xffff || !usn) + usn = 1; + le_usn = cpu_to_le16(usn); + *usa_pos = le_usn; + /* Position in data of first u16 that needs fixing up. */ + data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment the position in the usa and save the + * original data from the data buffer into the usa. + */ + *(++usa_pos) = *data_pos; + /* Apply fixup to data. */ + *data_pos = le_usn; + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(le16); + } + return 0; +} + +/** + * post_write_mst_fixup - fast deprotect multi sector transfer protected data + * @b: pointer to the data to deprotect + * + * Perform the necessary post write multi sector transfer fixup, not checking + * for any errors, because we assume we have just used pre_write_mst_fixup(), + * thus the data will be fine or we would never have gotten here. + */ +void post_write_mst_fixup(NTFS_RECORD *b) +{ + le16 *usa_pos, *data_pos; + + u16 usa_ofs = le16_to_cpu(b->usa_ofs); + u16 usa_count = le16_to_cpu(b->usa_count) - 1; + + /* Position of usn in update sequence array. */ + usa_pos = (le16*)b + usa_ofs/sizeof(le16); + + /* Position in protected data of first u16 that needs fixing up. */ + data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1; + + /* Fixup all sectors. */ + while (usa_count--) { + /* + * Increment position in usa and restore original data from + * the usa into the data buffer. + */ + *data_pos = *(++usa_pos); + + /* Increment position in data as well. */ + data_pos += NTFS_BLOCK_SIZE/sizeof(le16); + } +} diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c new file mode 100644 index 000000000..4e6a44bc6 --- /dev/null +++ b/fs/ntfs/namei.c @@ -0,0 +1,391 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + */ + +#include <linux/dcache.h> +#include <linux/exportfs.h> +#include <linux/security.h> +#include <linux/slab.h> + +#include "attrib.h" +#include "debug.h" +#include "dir.h" +#include "mft.h" +#include "ntfs.h" + +/** + * ntfs_lookup - find the inode represented by a dentry in a directory inode + * @dir_ino: directory inode in which to look for the inode + * @dent: dentry representing the inode to look for + * @flags: lookup flags + * + * In short, ntfs_lookup() looks for the inode represented by the dentry @dent + * in the directory inode @dir_ino and if found attaches the inode to the + * dentry @dent. + * + * In more detail, the dentry @dent specifies which inode to look for by + * supplying the name of the inode in @dent->d_name.name. ntfs_lookup() + * converts the name to Unicode and walks the contents of the directory inode + * @dir_ino looking for the converted Unicode name. If the name is found in the + * directory, the corresponding inode is loaded by calling ntfs_iget() on its + * inode number and the inode is associated with the dentry @dent via a call to + * d_splice_alias(). + * + * If the name is not found in the directory, a NULL inode is inserted into the + * dentry @dent via a call to d_add(). The dentry is then termed a negative + * dentry. + * + * Only if an actual error occurs, do we return an error via ERR_PTR(). + * + * In order to handle the case insensitivity issues of NTFS with regards to the + * dcache and the dcache requiring only one dentry per directory, we deal with + * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining + * a case sensitive dcache. This means that we get the full benefit of dcache + * speed when the file/directory is looked up with the same case as returned by + * ->ntfs_readdir() but that a lookup for any other case (or for the short file + * name) will not find anything in dcache and will enter ->ntfs_lookup() + * instead, where we search the directory for a fully matching file name + * (including case) and if that is not found, we search for a file name that + * matches with different case and if that has non-POSIX semantics we return + * that. We actually do only one search (case sensitive) and keep tabs on + * whether we have found a case insensitive match in the process. + * + * To simplify matters for us, we do not treat the short vs long filenames as + * two hard links but instead if the lookup matches a short filename, we + * return the dentry for the corresponding long filename instead. + * + * There are three cases we need to distinguish here: + * + * 1) @dent perfectly matches (i.e. including case) a directory entry with a + * file name in the WIN32 or POSIX namespaces. In this case + * ntfs_lookup_inode_by_name() will return with name set to NULL and we + * just d_splice_alias() @dent. + * 2) @dent matches (not including case) a directory entry with a file name in + * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return + * with name set to point to a kmalloc()ed ntfs_name structure containing + * the properly cased little endian Unicode name. We convert the name to the + * current NLS code page, search if a dentry with this name already exists + * and if so return that instead of @dent. At this point things are + * complicated by the possibility of 'disconnected' dentries due to NFS + * which we deal with appropriately (see the code comments). The VFS will + * then destroy the old @dent and use the one we returned. If a dentry is + * not found, we allocate a new one, d_splice_alias() it, and return it as + * above. + * 3) @dent matches either perfectly or not (i.e. we don't care about case) a + * directory entry with a file name in the DOS namespace. In this case + * ntfs_lookup_inode_by_name() will return with name set to point to a + * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian) + * of the inode. We use the mft reference to read the inode and to find the + * file name in the WIN32 namespace corresponding to the matched short file + * name. We then convert the name to the current NLS code page, and proceed + * searching for a dentry with this name, etc, as in case 2), above. + * + * Locking: Caller must hold i_mutex on the directory. + */ +static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, + unsigned int flags) +{ + ntfs_volume *vol = NTFS_SB(dir_ino->i_sb); + struct inode *dent_inode; + ntfschar *uname; + ntfs_name *name = NULL; + MFT_REF mref; + unsigned long dent_ino; + int uname_len; + + ntfs_debug("Looking up %pd in directory inode 0x%lx.", + dent, dir_ino->i_ino); + /* Convert the name of the dentry to Unicode. */ + uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len, + &uname); + if (uname_len < 0) { + if (uname_len != -ENAMETOOLONG) + ntfs_error(vol->sb, "Failed to convert name to " + "Unicode."); + return ERR_PTR(uname_len); + } + mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len, + &name); + kmem_cache_free(ntfs_name_cache, uname); + if (!IS_ERR_MREF(mref)) { + dent_ino = MREF(mref); + ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); + dent_inode = ntfs_iget(vol->sb, dent_ino); + if (!IS_ERR(dent_inode)) { + /* Consistency check. */ + if (is_bad_inode(dent_inode) || MSEQNO(mref) == + NTFS_I(dent_inode)->seq_no || + dent_ino == FILE_MFT) { + /* Perfect WIN32/POSIX match. -- Case 1. */ + if (!name) { + ntfs_debug("Done. (Case 1.)"); + return d_splice_alias(dent_inode, dent); + } + /* + * We are too indented. Handle imperfect + * matches and short file names further below. + */ + goto handle_name; + } + ntfs_error(vol->sb, "Found stale reference to inode " + "0x%lx (reference sequence number = " + "0x%x, inode sequence number = 0x%x), " + "returning -EIO. Run chkdsk.", + dent_ino, MSEQNO(mref), + NTFS_I(dent_inode)->seq_no); + iput(dent_inode); + dent_inode = ERR_PTR(-EIO); + } else + ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with " + "error code %li.", dent_ino, + PTR_ERR(dent_inode)); + kfree(name); + /* Return the error code. */ + return ERR_CAST(dent_inode); + } + /* It is guaranteed that @name is no longer allocated at this point. */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("Entry was not found, adding negative dentry."); + /* The dcache will handle negative entries. */ + d_add(dent, NULL); + ntfs_debug("Done."); + return NULL; + } + ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error " + "code %i.", -MREF_ERR(mref)); + return ERR_PTR(MREF_ERR(mref)); + // TODO: Consider moving this lot to a separate function! (AIA) +handle_name: + { + MFT_RECORD *m; + ntfs_attr_search_ctx *ctx; + ntfs_inode *ni = NTFS_I(dent_inode); + int err; + struct qstr nls_name; + + nls_name.name = NULL; + if (name->type != FILE_NAME_DOS) { /* Case 2. */ + ntfs_debug("Case 2."); + nls_name.len = (unsigned)ntfs_ucstonls(vol, + (ntfschar*)&name->name, name->len, + (unsigned char**)&nls_name.name, 0); + kfree(name); + } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */ + FILE_NAME_ATTR *fn; + + ntfs_debug("Case 3."); + kfree(name); + + /* Find the WIN32 name corresponding to the matched DOS name. */ + ni = NTFS_I(dent_inode); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + m = NULL; + ctx = NULL; + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (unlikely(!ctx)) { + err = -ENOMEM; + goto err_out; + } + do { + ATTR_RECORD *a; + u32 val_len; + + err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, + NULL, 0, ctx); + if (unlikely(err)) { + ntfs_error(vol->sb, "Inode corrupt: No WIN32 " + "namespace counterpart to DOS " + "file name. Run chkdsk."); + if (err == -ENOENT) + err = -EIO; + goto err_out; + } + /* Consistency checks. */ + a = ctx->attr; + if (a->non_resident || a->flags) + goto eio_err_out; + val_len = le32_to_cpu(a->data.resident.value_length); + if (le16_to_cpu(a->data.resident.value_offset) + + val_len > le32_to_cpu(a->length)) + goto eio_err_out; + fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu( + ctx->attr->data.resident.value_offset)); + if ((u32)(fn->file_name_length * sizeof(ntfschar) + + sizeof(FILE_NAME_ATTR)) > val_len) + goto eio_err_out; + } while (fn->file_name_type != FILE_NAME_WIN32); + + /* Convert the found WIN32 name to current NLS code page. */ + nls_name.len = (unsigned)ntfs_ucstonls(vol, + (ntfschar*)&fn->file_name, fn->file_name_length, + (unsigned char**)&nls_name.name, 0); + + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + } + m = NULL; + ctx = NULL; + + /* Check if a conversion error occurred. */ + if ((signed)nls_name.len < 0) { + err = (signed)nls_name.len; + goto err_out; + } + nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len); + + dent = d_add_ci(dent, dent_inode, &nls_name); + kfree(nls_name.name); + return dent; + +eio_err_out: + ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); + err = -EIO; +err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + if (m) + unmap_mft_record(ni); + iput(dent_inode); + ntfs_error(vol->sb, "Failed, returning error code %i.", err); + return ERR_PTR(err); + } +} + +/** + * Inode operations for directories. + */ +const struct inode_operations ntfs_dir_inode_ops = { + .lookup = ntfs_lookup, /* VFS: Lookup directory. */ +}; + +/** + * ntfs_get_parent - find the dentry of the parent of a given directory dentry + * @child_dent: dentry of the directory whose parent directory to find + * + * Find the dentry for the parent directory of the directory specified by the + * dentry @child_dent. This function is called from + * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the + * default ->decode_fh() which is export_decode_fh() in the same file. + * + * The code is based on the ext3 ->get_parent() implementation found in + * fs/ext3/namei.c::ext3_get_parent(). + * + * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down. + * + * Return the dentry of the parent directory on success or the error code on + * error (IS_ERR() is true). + */ +static struct dentry *ntfs_get_parent(struct dentry *child_dent) +{ + struct inode *vi = d_inode(child_dent); + ntfs_inode *ni = NTFS_I(vi); + MFT_RECORD *mrec; + ntfs_attr_search_ctx *ctx; + ATTR_RECORD *attr; + FILE_NAME_ATTR *fn; + unsigned long parent_ino; + int err; + + ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); + /* Get the mft record of the inode belonging to the child dentry. */ + mrec = map_mft_record(ni); + if (IS_ERR(mrec)) + return ERR_CAST(mrec); + /* Find the first file name attribute in the mft record. */ + ctx = ntfs_attr_get_search_ctx(ni, mrec); + if (unlikely(!ctx)) { + unmap_mft_record(ni); + return ERR_PTR(-ENOMEM); + } +try_next: + err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL, + 0, ctx); + if (unlikely(err)) { + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + if (err == -ENOENT) + ntfs_error(vi->i_sb, "Inode 0x%lx does not have a " + "file name attribute. Run chkdsk.", + vi->i_ino); + return ERR_PTR(err); + } + attr = ctx->attr; + if (unlikely(attr->non_resident)) + goto try_next; + fn = (FILE_NAME_ATTR *)((u8 *)attr + + le16_to_cpu(attr->data.resident.value_offset)); + if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) > + (u8*)attr + le32_to_cpu(attr->length))) + goto try_next; + /* Get the inode number of the parent directory. */ + parent_ino = MREF_LE(fn->parent_directory); + /* Release the search context and the mft record of the child. */ + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); + + return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino)); +} + +static struct inode *ntfs_nfs_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct inode *inode; + + inode = ntfs_iget(sb, ino); + if (!IS_ERR(inode)) { + if (is_bad_inode(inode) || inode->i_generation != generation) { + iput(inode); + inode = ERR_PTR(-ESTALE); + } + } + + return inode; +} + +static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ntfs_nfs_get_inode); +} + +static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ntfs_nfs_get_inode); +} + +/** + * Export operations allowing NFS exporting of mounted NTFS partitions. + * + * We use the default ->encode_fh() for now. Note that they + * use 32 bits to store the inode number which is an unsigned long so on 64-bit + * architectures is usually 64 bits so it would all fail horribly on huge + * volumes. I guess we need to define our own encode and decode fh functions + * that store 64-bit inode numbers at some point but for now we will ignore the + * problem... + * + * We also use the default ->get_name() helper (used by ->decode_fh() via + * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs + * independent. + * + * The default ->get_parent() just returns -EACCES so we have to provide our + * own and the default ->get_dentry() is incompatible with NTFS due to not + * allowing the inode number 0 which is used in NTFS for the system file $MFT + * and due to using iget() whereas NTFS needs ntfs_iget(). + */ +const struct export_operations ntfs_export_ops = { + .get_parent = ntfs_get_parent, /* Find the parent of a given + directory. */ + .fh_to_dentry = ntfs_fh_to_dentry, + .fh_to_parent = ntfs_fh_to_parent, +}; diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h new file mode 100644 index 000000000..e81376ea9 --- /dev/null +++ b/fs/ntfs/ntfs.h @@ -0,0 +1,150 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * ntfs.h - Defines for NTFS Linux kernel driver. + * + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. + * Copyright (C) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_H +#define _LINUX_NTFS_H + +#include <linux/stddef.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/compiler.h> +#include <linux/fs.h> +#include <linux/nls.h> +#include <linux/smp.h> +#include <linux/pagemap.h> + +#include "types.h" +#include "volume.h" +#include "layout.h" + +typedef enum { + NTFS_BLOCK_SIZE = 512, + NTFS_BLOCK_SIZE_BITS = 9, + NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */ + NTFS_MAX_NAME_LEN = 255, + NTFS_MAX_ATTR_NAME_LEN = 255, + NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */ + NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE, +} NTFS_CONSTANTS; + +/* Global variables. */ + +/* Slab caches (from super.c). */ +extern struct kmem_cache *ntfs_name_cache; +extern struct kmem_cache *ntfs_inode_cache; +extern struct kmem_cache *ntfs_big_inode_cache; +extern struct kmem_cache *ntfs_attr_ctx_cache; +extern struct kmem_cache *ntfs_index_ctx_cache; + +/* The various operations structs defined throughout the driver files. */ +extern const struct address_space_operations ntfs_normal_aops; +extern const struct address_space_operations ntfs_compressed_aops; +extern const struct address_space_operations ntfs_mst_aops; + +extern const struct file_operations ntfs_file_ops; +extern const struct inode_operations ntfs_file_inode_ops; + +extern const struct file_operations ntfs_dir_ops; +extern const struct inode_operations ntfs_dir_inode_ops; + +extern const struct file_operations ntfs_empty_file_ops; +extern const struct inode_operations ntfs_empty_inode_ops; + +extern const struct export_operations ntfs_export_ops; + +/** + * NTFS_SB - return the ntfs volume given a vfs super block + * @sb: VFS super block + * + * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb. + */ +static inline ntfs_volume *NTFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* Declarations of functions and global variables. */ + +/* From fs/ntfs/compress.c */ +extern int ntfs_read_compressed_block(struct page *page); +extern int allocate_compression_buffers(void); +extern void free_compression_buffers(void); + +/* From fs/ntfs/super.c */ +#define default_upcase_len 0x10000 +extern struct mutex ntfs_lock; + +typedef struct { + int val; + char *str; +} option_t; +extern const option_t on_errors_arr[]; + +/* From fs/ntfs/mst.c */ +extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size); +extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size); +extern void post_write_mst_fixup(NTFS_RECORD *b); + +/* From fs/ntfs/unistr.c */ +extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, + const ntfschar *s2, size_t s2_len, + const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_size); +extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, + const ntfschar *name2, const u32 name2_len, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n); +extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, + const ntfschar *upcase, const u32 upcase_size); +extern void ntfs_upcase_name(ntfschar *name, u32 name_len, + const ntfschar *upcase, const u32 upcase_len); +extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, + FILE_NAME_ATTR *file_name_attr2, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len); +extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, + const int ins_len, ntfschar **outs); +extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, + const int ins_len, unsigned char **outs, int outs_len); + +/* From fs/ntfs/upcase.c */ +extern ntfschar *generate_default_upcase(void); + +static inline int ntfs_ffs(int x) +{ + int r = 1; + + if (!x) + return 0; + if (!(x & 0xffff)) { + x >>= 16; + r += 16; + } + if (!(x & 0xff)) { + x >>= 8; + r += 8; + } + if (!(x & 0xf)) { + x >>= 4; + r += 4; + } + if (!(x & 3)) { + x >>= 2; + r += 2; + } + if (!(x & 1)) { + x >>= 1; + r += 1; + } + return r; +} + +#endif /* _LINUX_NTFS_H */ diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c new file mode 100644 index 000000000..916048022 --- /dev/null +++ b/fs/ntfs/quota.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS + * project. + * + * Copyright (c) 2004 Anton Altaparmakov + */ + +#ifdef NTFS_RW + +#include "index.h" +#include "quota.h" +#include "debug.h" +#include "ntfs.h" + +/** + * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume + * @vol: ntfs volume on which to mark the quotas out of date + * + * Mark the quotas out of date on the ntfs volume @vol and return 'true' on + * success and 'false' on error. + */ +bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) +{ + ntfs_index_context *ictx; + QUOTA_CONTROL_ENTRY *qce; + const le32 qid = QUOTA_DEFAULTS_ID; + int err; + + ntfs_debug("Entering."); + if (NVolQuotaOutOfDate(vol)) + goto done; + if (!vol->quota_ino || !vol->quota_q_ino) { + ntfs_error(vol->sb, "Quota inodes are not open."); + return false; + } + inode_lock(vol->quota_q_ino); + ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); + if (!ictx) { + ntfs_error(vol->sb, "Failed to get index context."); + goto err_out; + } + err = ntfs_index_lookup(&qid, sizeof(qid), ictx); + if (err) { + if (err == -ENOENT) + ntfs_error(vol->sb, "Quota defaults entry is not " + "present."); + else + ntfs_error(vol->sb, "Lookup of quota defaults entry " + "failed."); + goto err_out; + } + if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) { + ntfs_error(vol->sb, "Quota defaults entry size is invalid. " + "Run chkdsk."); + goto err_out; + } + qce = (QUOTA_CONTROL_ENTRY*)ictx->data; + if (le32_to_cpu(qce->version) != QUOTA_VERSION) { + ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not " + "supported.", le32_to_cpu(qce->version)); + goto err_out; + } + ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags)); + /* If quotas are already marked out of date, no need to do anything. */ + if (qce->flags & QUOTA_FLAG_OUT_OF_DATE) + goto set_done; + /* + * If quota tracking is neither requested, nor enabled and there are no + * pending deletes, no need to mark the quotas out of date. + */ + if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED | + QUOTA_FLAG_TRACKING_REQUESTED | + QUOTA_FLAG_PENDING_DELETES))) + goto set_done; + /* + * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date. + * This is verified on WinXP to be sufficient to cause windows to + * rescan the volume on boot and update all quota entries. + */ + qce->flags |= QUOTA_FLAG_OUT_OF_DATE; + /* Ensure the modified flags are written to disk. */ + ntfs_index_entry_flush_dcache_page(ictx); + ntfs_index_entry_mark_dirty(ictx); +set_done: + ntfs_index_ctx_put(ictx); + inode_unlock(vol->quota_q_ino); + /* + * We set the flag so we do not try to mark the quotas out of date + * again on remount. + */ + NVolSetQuotaOutOfDate(vol); +done: + ntfs_debug("Done."); + return true; +err_out: + if (ictx) + ntfs_index_ctx_put(ictx); + inode_unlock(vol->quota_q_ino); + return false; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h new file mode 100644 index 000000000..fe3132a3d --- /dev/null +++ b/fs/ntfs/quota.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_QUOTA_H +#define _LINUX_NTFS_QUOTA_H + +#ifdef NTFS_RW + +#include "types.h" +#include "volume.h" + +extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_QUOTA_H */ diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c new file mode 100644 index 000000000..97932fb51 --- /dev/null +++ b/fs/ntfs/runlist.c @@ -0,0 +1,1893 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2002-2005 Richard Russon + */ + +#include "debug.h" +#include "dir.h" +#include "endian.h" +#include "malloc.h" +#include "ntfs.h" + +/** + * ntfs_rl_mm - runlist memmove + * + * It is up to the caller to serialize access to the runlist @base. + */ +static inline void ntfs_rl_mm(runlist_element *base, int dst, int src, + int size) +{ + if (likely((dst != src) && (size > 0))) + memmove(base + dst, base + src, size * sizeof(*base)); +} + +/** + * ntfs_rl_mc - runlist memory copy + * + * It is up to the caller to serialize access to the runlists @dstbase and + * @srcbase. + */ +static inline void ntfs_rl_mc(runlist_element *dstbase, int dst, + runlist_element *srcbase, int src, int size) +{ + if (likely(size > 0)) + memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase)); +} + +/** + * ntfs_rl_realloc - Reallocate memory for runlists + * @rl: original runlist + * @old_size: number of runlist elements in the original runlist @rl + * @new_size: number of runlist elements we need space for + * + * As the runlists grow, more memory will be required. To prevent the + * kernel having to allocate and reallocate large numbers of small bits of + * memory, this function returns an entire page of memory. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * N.B. If the new allocation doesn't require a different number of pages in + * memory, the function will return the original pointer. + * + * On success, return a pointer to the newly allocated, or recycled, memory. + * On error, return -errno. The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_realloc(runlist_element *rl, + int old_size, int new_size) +{ + runlist_element *new_rl; + + old_size = PAGE_ALIGN(old_size * sizeof(*rl)); + new_size = PAGE_ALIGN(new_size * sizeof(*rl)); + if (old_size == new_size) + return rl; + + new_rl = ntfs_malloc_nofs(new_size); + if (unlikely(!new_rl)) + return ERR_PTR(-ENOMEM); + + if (likely(rl != NULL)) { + if (unlikely(old_size > new_size)) + old_size = new_size; + memcpy(new_rl, rl, old_size); + ntfs_free(rl); + } + return new_rl; +} + +/** + * ntfs_rl_realloc_nofail - Reallocate memory for runlists + * @rl: original runlist + * @old_size: number of runlist elements in the original runlist @rl + * @new_size: number of runlist elements we need space for + * + * As the runlists grow, more memory will be required. To prevent the + * kernel having to allocate and reallocate large numbers of small bits of + * memory, this function returns an entire page of memory. + * + * This function guarantees that the allocation will succeed. It will sleep + * for as long as it takes to complete the allocation. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * N.B. If the new allocation doesn't require a different number of pages in + * memory, the function will return the original pointer. + * + * On success, return a pointer to the newly allocated, or recycled, memory. + * On error, return -errno. The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl, + int old_size, int new_size) +{ + runlist_element *new_rl; + + old_size = PAGE_ALIGN(old_size * sizeof(*rl)); + new_size = PAGE_ALIGN(new_size * sizeof(*rl)); + if (old_size == new_size) + return rl; + + new_rl = ntfs_malloc_nofs_nofail(new_size); + BUG_ON(!new_rl); + + if (likely(rl != NULL)) { + if (unlikely(old_size > new_size)) + old_size = new_size; + memcpy(new_rl, rl, old_size); + ntfs_free(rl); + } + return new_rl; +} + +/** + * ntfs_are_rl_mergeable - test if two runlists can be joined together + * @dst: original runlist + * @src: new runlist to test for mergeability with @dst + * + * Test if two runlists can be joined together. For this, their VCNs and LCNs + * must be adjacent. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * Return: true Success, the runlists can be merged. + * false Failure, the runlists cannot be merged. + */ +static inline bool ntfs_are_rl_mergeable(runlist_element *dst, + runlist_element *src) +{ + BUG_ON(!dst); + BUG_ON(!src); + + /* We can merge unmapped regions even if they are misaligned. */ + if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED)) + return true; + /* If the runs are misaligned, we cannot merge them. */ + if ((dst->vcn + dst->length) != src->vcn) + return false; + /* If both runs are non-sparse and contiguous, we can merge them. */ + if ((dst->lcn >= 0) && (src->lcn >= 0) && + ((dst->lcn + dst->length) == src->lcn)) + return true; + /* If we are merging two holes, we can merge them. */ + if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE)) + return true; + /* Cannot merge. */ + return false; +} + +/** + * __ntfs_rl_merge - merge two runlists without testing if they can be merged + * @dst: original, destination runlist + * @src: new runlist to merge with @dst + * + * Merge the two runlists, writing into the destination runlist @dst. The + * caller must make sure the runlists can be merged or this will corrupt the + * destination runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + */ +static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src) +{ + dst->length += src->length; +} + +/** + * ntfs_rl_append - append a runlist after a given element + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: runlist to be inserted into @dst + * @ssize: number of elements in @src (excluding end marker) + * @loc: append the new runlist @src after this element in @dst + * + * Append the runlist @src after element @loc in @dst. Merge the right end of + * the new runlist, if necessary. Adjust the size of the hole before the + * appended runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_append(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + bool right = false; /* Right end of @src needs merging. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* First, check if the right hand end needs merging. */ + if ((loc + 1) < dsize) + right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); + + /* Space required: @dst size + @src size, less one if we merged. */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* First, merge the right hand end, if necessary. */ + if (right) + __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); + + /* First run after the @src runs that have been inserted. */ + marker = loc + ssize + 1; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right)); + ntfs_rl_mc(dst, loc + 1, src, 0, ssize); + + /* Adjust the size of the preceding hole. */ + dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; + + /* We may have changed the length of the file, so fix the end marker */ + if (dst[marker].lcn == LCN_ENOENT) + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + + return dst; +} + +/** + * ntfs_rl_insert - insert a runlist into another + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: insert the new runlist @src before this element in @dst + * + * Insert the runlist @src before element @loc in the runlist @dst. Merge the + * left end of the new runlist, if necessary. Adjust the size of the hole + * after the inserted runlist. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_insert(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + bool left = false; /* Left end of @src needs merging. */ + bool disc = false; /* Discontinuity between @dst and @src. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* + * disc => Discontinuity between the end of @dst and the start of @src. + * This means we might need to insert a "not mapped" run. + */ + if (loc == 0) + disc = (src[0].vcn > 0); + else { + s64 merged_length; + + left = ntfs_are_rl_mergeable(dst + loc - 1, src); + + merged_length = dst[loc - 1].length; + if (left) + merged_length += src->length; + + disc = (src[0].vcn > dst[loc - 1].vcn + merged_length); + } + /* + * Space required: @dst size + @src size, less one if we merged, plus + * one if there was a discontinuity. + */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlist. + */ + if (left) + __ntfs_rl_merge(dst + loc - 1, src); + /* + * First run after the @src runs that have been inserted. + * Nominally, @marker equals @loc + @ssize, i.e. location + number of + * runs in @src. However, if @left, then the first run in @src has + * been merged with one in @dst. And if @disc, then @dst and @src do + * not meet and we need an extra run to fill the gap. + */ + marker = loc + ssize - left + disc; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, loc, dsize - loc); + ntfs_rl_mc(dst, loc + disc, src, left, ssize - left); + + /* Adjust the VCN of the first run after the insertion... */ + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + /* ... and the length. */ + if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED) + dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn; + + /* Writing beyond the end of the file and there is a discontinuity. */ + if (disc) { + if (loc > 0) { + dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length; + dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn; + } else { + dst[loc].vcn = 0; + dst[loc].length = dst[loc + 1].vcn; + } + dst[loc].lcn = LCN_RL_NOT_MAPPED; + } + return dst; +} + +/** + * ntfs_rl_replace - overwrite a runlist element with another runlist + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: index in runlist @dst to overwrite with @src + * + * Replace the runlist element @dst at @loc with @src. Merge the left and + * right ends of the inserted runlist, if necessary. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_replace(runlist_element *dst, + int dsize, runlist_element *src, int ssize, int loc) +{ + signed delta; + bool left = false; /* Left end of @src needs merging. */ + bool right = false; /* Right end of @src needs merging. */ + int tail; /* Start of tail of @dst. */ + int marker; /* End of the inserted runs. */ + + BUG_ON(!dst); + BUG_ON(!src); + + /* First, see if the left and right ends need merging. */ + if ((loc + 1) < dsize) + right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1); + if (loc > 0) + left = ntfs_are_rl_mergeable(dst + loc - 1, src); + /* + * Allocate some space. We will need less if the left, right, or both + * ends get merged. The -1 accounts for the run being replaced. + */ + delta = ssize - 1 - left - right; + if (delta > 0) { + dst = ntfs_rl_realloc(dst, dsize, dsize + delta); + if (IS_ERR(dst)) + return dst; + } + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* First, merge the left and right ends, if necessary. */ + if (right) + __ntfs_rl_merge(src + ssize - 1, dst + loc + 1); + if (left) + __ntfs_rl_merge(dst + loc - 1, src); + /* + * Offset of the tail of @dst. This needs to be moved out of the way + * to make space for the runs to be copied from @src, i.e. the first + * run of the tail of @dst. + * Nominally, @tail equals @loc + 1, i.e. location, skipping the + * replaced run. However, if @right, then one of @dst's runs is + * already merged into @src. + */ + tail = loc + right + 1; + /* + * First run after the @src runs that have been inserted, i.e. where + * the tail of @dst needs to be moved to. + * Nominally, @marker equals @loc + @ssize, i.e. location + number of + * runs in @src. However, if @left, then the first run in @src has + * been merged with one in @dst. + */ + marker = loc + ssize - left; + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, marker, tail, dsize - tail); + ntfs_rl_mc(dst, loc, src, left, ssize - left); + + /* We may have changed the length of the file, so fix the end marker. */ + if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT) + dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length; + return dst; +} + +/** + * ntfs_rl_split - insert a runlist into the centre of a hole + * @dst: original runlist to be worked on + * @dsize: number of elements in @dst (including end marker) + * @src: new runlist to be inserted + * @ssize: number of elements in @src (excluding end marker) + * @loc: index in runlist @dst at which to split and insert @src + * + * Split the runlist @dst at @loc into two and insert @new in between the two + * fragments. No merging of runlists is necessary. Adjust the size of the + * holes either side. + * + * It is up to the caller to serialize access to the runlists @dst and @src. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @dst and @src are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + */ +static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize, + runlist_element *src, int ssize, int loc) +{ + BUG_ON(!dst); + BUG_ON(!src); + + /* Space required: @dst size + @src size + one new hole. */ + dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1); + if (IS_ERR(dst)) + return dst; + /* + * We are guaranteed to succeed from here so can start modifying the + * original runlists. + */ + + /* Move the tail of @dst out of the way, then copy in @src. */ + ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc); + ntfs_rl_mc(dst, loc + 1, src, 0, ssize); + + /* Adjust the size of the holes either size of @src. */ + dst[loc].length = dst[loc+1].vcn - dst[loc].vcn; + dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length; + dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn; + + return dst; +} + +/** + * ntfs_runlists_merge - merge two runlists into one + * @drl: original runlist to be worked on + * @srl: new runlist to be merged into @drl + * + * First we sanity check the two runlists @srl and @drl to make sure that they + * are sensible and can be merged. The runlist @srl must be either after the + * runlist @drl or completely within a hole (or unmapped region) in @drl. + * + * It is up to the caller to serialize access to the runlists @drl and @srl. + * + * Merging of runlists is necessary in two cases: + * 1. When attribute lists are used and a further extent is being mapped. + * 2. When new clusters are allocated to fill a hole or extend a file. + * + * There are four possible ways @srl can be merged. It can: + * - be inserted at the beginning of a hole, + * - split the hole in two and be inserted between the two fragments, + * - be appended at the end of a hole, or it can + * - replace the whole hole. + * It can also be appended to the end of the runlist, which is just a variant + * of the insert case. + * + * On success, return a pointer to the new, combined, runlist. Note, both + * runlists @drl and @srl are deallocated before returning so you cannot use + * the pointers for anything any more. (Strictly speaking the returned runlist + * may be the same as @dst but this is irrelevant.) + * + * On error, return -errno. Both runlists are left unmodified. The following + * error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EINVAL - Invalid parameters were passed in. + * -ERANGE - The runlists overlap and cannot be merged. + */ +runlist_element *ntfs_runlists_merge(runlist_element *drl, + runlist_element *srl) +{ + int di, si; /* Current index into @[ds]rl. */ + int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */ + int dins; /* Index into @drl at which to insert @srl. */ + int dend, send; /* Last index into @[ds]rl. */ + int dfinal, sfinal; /* The last index into @[ds]rl with + lcn >= LCN_HOLE. */ + int marker = 0; + VCN marker_vcn = 0; + +#ifdef DEBUG + ntfs_debug("dst:"); + ntfs_debug_dump_runlist(drl); + ntfs_debug("src:"); + ntfs_debug_dump_runlist(srl); +#endif + + /* Check for silly calling... */ + if (unlikely(!srl)) + return drl; + if (IS_ERR(srl) || IS_ERR(drl)) + return ERR_PTR(-EINVAL); + + /* Check for the case where the first mapping is being done now. */ + if (unlikely(!drl)) { + drl = srl; + /* Complete the source runlist if necessary. */ + if (unlikely(drl[0].vcn)) { + /* Scan to the end of the source runlist. */ + for (dend = 0; likely(drl[dend].length); dend++) + ; + dend++; + drl = ntfs_rl_realloc(drl, dend, dend + 1); + if (IS_ERR(drl)) + return drl; + /* Insert start element at the front of the runlist. */ + ntfs_rl_mm(drl, 1, 0, dend); + drl[0].vcn = 0; + drl[0].lcn = LCN_RL_NOT_MAPPED; + drl[0].length = drl[1].vcn; + } + goto finished; + } + + si = di = 0; + + /* Skip any unmapped start element(s) in the source runlist. */ + while (srl[si].length && srl[si].lcn < LCN_HOLE) + si++; + + /* Can't have an entirely unmapped source runlist. */ + BUG_ON(!srl[si].length); + + /* Record the starting points. */ + sstart = si; + + /* + * Skip forward in @drl until we reach the position where @srl needs to + * be inserted. If we reach the end of @drl, @srl just needs to be + * appended to @drl. + */ + for (; drl[di].length; di++) { + if (drl[di].vcn + drl[di].length > srl[sstart].vcn) + break; + } + dins = di; + + /* Sanity check for illegal overlaps. */ + if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) && + (srl[si].lcn >= 0)) { + ntfs_error(NULL, "Run lists overlap. Cannot merge!"); + return ERR_PTR(-ERANGE); + } + + /* Scan to the end of both runlists in order to know their sizes. */ + for (send = si; srl[send].length; send++) + ; + for (dend = di; drl[dend].length; dend++) + ; + + if (srl[send].lcn == LCN_ENOENT) + marker_vcn = srl[marker = send].vcn; + + /* Scan to the last element with lcn >= LCN_HOLE. */ + for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--) + ; + for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--) + ; + + { + bool start; + bool finish; + int ds = dend + 1; /* Number of elements in drl & srl */ + int ss = sfinal - sstart + 1; + + start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */ + (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */ + finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */ + ((drl[dins].vcn + drl[dins].length) <= /* End of hole */ + (srl[send - 1].vcn + srl[send - 1].length))); + + /* Or we will lose an end marker. */ + if (finish && !drl[dins].length) + ss++; + if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn)) + finish = false; +#if 0 + ntfs_debug("dfinal = %i, dend = %i", dfinal, dend); + ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send); + ntfs_debug("start = %i, finish = %i", start, finish); + ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins); +#endif + if (start) { + if (finish) + drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins); + else + drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins); + } else { + if (finish) + drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins); + else + drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins); + } + if (IS_ERR(drl)) { + ntfs_error(NULL, "Merge failed."); + return drl; + } + ntfs_free(srl); + if (marker) { + ntfs_debug("Triggering marker code."); + for (ds = dend; drl[ds].length; ds++) + ; + /* We only need to care if @srl ended after @drl. */ + if (drl[ds].vcn <= marker_vcn) { + int slots = 0; + + if (drl[ds].vcn == marker_vcn) { + ntfs_debug("Old marker = 0x%llx, replacing " + "with LCN_ENOENT.", + (unsigned long long) + drl[ds].lcn); + drl[ds].lcn = LCN_ENOENT; + goto finished; + } + /* + * We need to create an unmapped runlist element in + * @drl or extend an existing one before adding the + * ENOENT terminator. + */ + if (drl[ds].lcn == LCN_ENOENT) { + ds--; + slots = 1; + } + if (drl[ds].lcn != LCN_RL_NOT_MAPPED) { + /* Add an unmapped runlist element. */ + if (!slots) { + drl = ntfs_rl_realloc_nofail(drl, ds, + ds + 2); + slots = 2; + } + ds++; + /* Need to set vcn if it isn't set already. */ + if (slots != 1) + drl[ds].vcn = drl[ds - 1].vcn + + drl[ds - 1].length; + drl[ds].lcn = LCN_RL_NOT_MAPPED; + /* We now used up a slot. */ + slots--; + } + drl[ds].length = marker_vcn - drl[ds].vcn; + /* Finally add the ENOENT terminator. */ + ds++; + if (!slots) + drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1); + drl[ds].vcn = marker_vcn; + drl[ds].lcn = LCN_ENOENT; + drl[ds].length = (s64)0; + } + } + } + +finished: + /* The merge was completed successfully. */ + ntfs_debug("Merged runlist:"); + ntfs_debug_dump_runlist(drl); + return drl; +} + +/** + * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist + * @vol: ntfs volume on which the attribute resides + * @attr: attribute record whose mapping pairs array to decompress + * @old_rl: optional runlist in which to insert @attr's runlist + * + * It is up to the caller to serialize access to the runlist @old_rl. + * + * Decompress the attribute @attr's mapping pairs array into a runlist. On + * success, return the decompressed runlist. + * + * If @old_rl is not NULL, decompressed runlist is inserted into the + * appropriate place in @old_rl and the resultant, combined runlist is + * returned. The original @old_rl is deallocated. + * + * On error, return -errno. @old_rl is left unmodified in that case. + * + * The following error codes are defined: + * -ENOMEM - Not enough memory to allocate runlist array. + * -EIO - Corrupt runlist. + * -EINVAL - Invalid parameters were passed in. + * -ERANGE - The two runlists overlap. + * + * FIXME: For now we take the conceptionally simplest approach of creating the + * new runlist disregarding the already existing one and then splicing the + * two into one, if that is possible (we check for overlap and discard the new + * runlist if overlap present before returning ERR_PTR(-ERANGE)). + */ +runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, + const ATTR_RECORD *attr, runlist_element *old_rl) +{ + VCN vcn; /* Current vcn. */ + LCN lcn; /* Current lcn. */ + s64 deltaxcn; /* Change in [vl]cn. */ + runlist_element *rl; /* The output runlist. */ + u8 *buf; /* Current position in mapping pairs array. */ + u8 *attr_end; /* End of attribute. */ + int rlsize; /* Size of runlist buffer. */ + u16 rlpos; /* Current runlist position in units of + runlist_elements. */ + u8 b; /* Current byte offset in buf. */ + +#ifdef DEBUG + /* Make sure attr exists and is non-resident. */ + if (!attr || !attr->non_resident || sle64_to_cpu( + attr->data.non_resident.lowest_vcn) < (VCN)0) { + ntfs_error(vol->sb, "Invalid arguments."); + return ERR_PTR(-EINVAL); + } +#endif + /* Start at vcn = lowest_vcn and lcn 0. */ + vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn); + lcn = 0; + /* Get start of the mapping pairs array. */ + buf = (u8*)attr + le16_to_cpu( + attr->data.non_resident.mapping_pairs_offset); + attr_end = (u8*)attr + le32_to_cpu(attr->length); + if (unlikely(buf < (u8*)attr || buf > attr_end)) { + ntfs_error(vol->sb, "Corrupt attribute."); + return ERR_PTR(-EIO); + } + /* If the mapping pairs array is valid but empty, nothing to do. */ + if (!vcn && !*buf) + return old_rl; + /* Current position in runlist array. */ + rlpos = 0; + /* Allocate first page and set current runlist size to one page. */ + rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE); + if (unlikely(!rl)) + return ERR_PTR(-ENOMEM); + /* Insert unmapped starting element if necessary. */ + if (vcn) { + rl->vcn = 0; + rl->lcn = LCN_RL_NOT_MAPPED; + rl->length = vcn; + rlpos++; + } + while (buf < attr_end && *buf) { + /* + * Allocate more memory if needed, including space for the + * not-mapped and terminator elements. ntfs_malloc_nofs() + * operates on whole pages only. + */ + if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) { + runlist_element *rl2; + + rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE); + if (unlikely(!rl2)) { + ntfs_free(rl); + return ERR_PTR(-ENOMEM); + } + memcpy(rl2, rl, rlsize); + ntfs_free(rl); + rl = rl2; + rlsize += PAGE_SIZE; + } + /* Enter the current vcn into the current runlist element. */ + rl[rlpos].vcn = vcn; + /* + * Get the change in vcn, i.e. the run length in clusters. + * Doing it this way ensures that we signextend negative values. + * A negative run length doesn't make any sense, but hey, I + * didn't make up the NTFS specs and Windows NT4 treats the run + * length as a signed value so that's how it is... + */ + b = *buf & 0xf; + if (b) { + if (unlikely(buf + b > attr_end)) + goto io_error; + for (deltaxcn = (s8)buf[b--]; b; b--) + deltaxcn = (deltaxcn << 8) + buf[b]; + } else { /* The length entry is compulsory. */ + ntfs_error(vol->sb, "Missing length entry in mapping " + "pairs array."); + deltaxcn = (s64)-1; + } + /* + * Assume a negative length to indicate data corruption and + * hence clean-up and return NULL. + */ + if (unlikely(deltaxcn < 0)) { + ntfs_error(vol->sb, "Invalid length in mapping pairs " + "array."); + goto err_out; + } + /* + * Enter the current run length into the current runlist + * element. + */ + rl[rlpos].length = deltaxcn; + /* Increment the current vcn by the current run length. */ + vcn += deltaxcn; + /* + * There might be no lcn change at all, as is the case for + * sparse clusters on NTFS 3.0+, in which case we set the lcn + * to LCN_HOLE. + */ + if (!(*buf & 0xf0)) + rl[rlpos].lcn = LCN_HOLE; + else { + /* Get the lcn change which really can be negative. */ + u8 b2 = *buf & 0xf; + b = b2 + ((*buf >> 4) & 0xf); + if (buf + b > attr_end) + goto io_error; + for (deltaxcn = (s8)buf[b--]; b > b2; b--) + deltaxcn = (deltaxcn << 8) + buf[b]; + /* Change the current lcn to its new value. */ + lcn += deltaxcn; +#ifdef DEBUG + /* + * On NTFS 1.2-, apparently can have lcn == -1 to + * indicate a hole. But we haven't verified ourselves + * whether it is really the lcn or the deltaxcn that is + * -1. So if either is found give us a message so we + * can investigate it further! + */ + if (vol->major_ver < 3) { + if (unlikely(deltaxcn == (LCN)-1)) + ntfs_error(vol->sb, "lcn delta == -1"); + if (unlikely(lcn == (LCN)-1)) + ntfs_error(vol->sb, "lcn == -1"); + } +#endif + /* Check lcn is not below -1. */ + if (unlikely(lcn < (LCN)-1)) { + ntfs_error(vol->sb, "Invalid LCN < -1 in " + "mapping pairs array."); + goto err_out; + } + /* Enter the current lcn into the runlist element. */ + rl[rlpos].lcn = lcn; + } + /* Get to the next runlist element. */ + rlpos++; + /* Increment the buffer position to the next mapping pair. */ + buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1; + } + if (unlikely(buf >= attr_end)) + goto io_error; + /* + * If there is a highest_vcn specified, it must be equal to the final + * vcn in the runlist - 1, or something has gone badly wrong. + */ + deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn); + if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) { +mpa_err: + ntfs_error(vol->sb, "Corrupt mapping pairs array in " + "non-resident attribute."); + goto err_out; + } + /* Setup not mapped runlist element if this is the base extent. */ + if (!attr->data.non_resident.lowest_vcn) { + VCN max_cluster; + + max_cluster = ((sle64_to_cpu( + attr->data.non_resident.allocated_size) + + vol->cluster_size - 1) >> + vol->cluster_size_bits) - 1; + /* + * A highest_vcn of zero means this is a single extent + * attribute so simply terminate the runlist with LCN_ENOENT). + */ + if (deltaxcn) { + /* + * If there is a difference between the highest_vcn and + * the highest cluster, the runlist is either corrupt + * or, more likely, there are more extents following + * this one. + */ + if (deltaxcn < max_cluster) { + ntfs_debug("More extents to follow; deltaxcn " + "= 0x%llx, max_cluster = " + "0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + rl[rlpos].vcn = vcn; + vcn += rl[rlpos].length = max_cluster - + deltaxcn; + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + rlpos++; + } else if (unlikely(deltaxcn > max_cluster)) { + ntfs_error(vol->sb, "Corrupt attribute. " + "deltaxcn = 0x%llx, " + "max_cluster = 0x%llx", + (unsigned long long)deltaxcn, + (unsigned long long) + max_cluster); + goto mpa_err; + } + } + rl[rlpos].lcn = LCN_ENOENT; + } else /* Not the base extent. There may be more extents to follow. */ + rl[rlpos].lcn = LCN_RL_NOT_MAPPED; + + /* Setup terminating runlist element. */ + rl[rlpos].vcn = vcn; + rl[rlpos].length = (s64)0; + /* If no existing runlist was specified, we are done. */ + if (!old_rl) { + ntfs_debug("Mapping pairs array successfully decompressed:"); + ntfs_debug_dump_runlist(rl); + return rl; + } + /* Now combine the new and old runlists checking for overlaps. */ + old_rl = ntfs_runlists_merge(old_rl, rl); + if (!IS_ERR(old_rl)) + return old_rl; + ntfs_free(rl); + ntfs_error(vol->sb, "Failed to merge runlists."); + return old_rl; +io_error: + ntfs_error(vol->sb, "Corrupt attribute."); +err_out: + ntfs_free(rl); + return ERR_PTR(-EIO); +} + +/** + * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist + * @rl: runlist to use for conversion + * @vcn: vcn to convert + * + * Convert the virtual cluster number @vcn of an attribute into a logical + * cluster number (lcn) of a device using the runlist @rl to map vcns to their + * corresponding lcns. + * + * It is up to the caller to serialize access to the runlist @rl. + * + * Since lcns must be >= 0, we use negative return codes with special meaning: + * + * Return code Meaning / Description + * ================================================== + * LCN_HOLE Hole / not allocated on disk. + * LCN_RL_NOT_MAPPED This is part of the runlist which has not been + * inserted into the runlist yet. + * LCN_ENOENT There is no such vcn in the attribute. + * + * Locking: - The caller must have locked the runlist (for reading or writing). + * - This function does not touch the lock, nor does it modify the + * runlist. + */ +LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn) +{ + int i; + + BUG_ON(vcn < 0); + /* + * If rl is NULL, assume that we have found an unmapped runlist. The + * caller can then attempt to map it and fail appropriately if + * necessary. + */ + if (unlikely(!rl)) + return LCN_RL_NOT_MAPPED; + + /* Catch out of lower bounds vcn. */ + if (unlikely(vcn < rl[0].vcn)) + return LCN_ENOENT; + + for (i = 0; likely(rl[i].length); i++) { + if (unlikely(vcn < rl[i+1].vcn)) { + if (likely(rl[i].lcn >= (LCN)0)) + return rl[i].lcn + (vcn - rl[i].vcn); + return rl[i].lcn; + } + } + /* + * The terminator element is setup to the correct value, i.e. one of + * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT. + */ + if (likely(rl[i].lcn < (LCN)0)) + return rl[i].lcn; + /* Just in case... We could replace this with BUG() some day. */ + return LCN_ENOENT; +} + +#ifdef NTFS_RW + +/** + * ntfs_rl_find_vcn_nolock - find a vcn in a runlist + * @rl: runlist to search + * @vcn: vcn to find + * + * Find the virtual cluster number @vcn in the runlist @rl and return the + * address of the runlist element containing the @vcn on success. + * + * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of + * the runlist. + * + * Locking: The runlist must be locked on entry. + */ +runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn) +{ + BUG_ON(vcn < 0); + if (unlikely(!rl || vcn < rl[0].vcn)) + return NULL; + while (likely(rl->length)) { + if (unlikely(vcn < rl[1].vcn)) { + if (likely(rl->lcn >= LCN_HOLE)) + return rl; + return NULL; + } + rl++; + } + if (likely(rl->lcn == LCN_ENOENT)) + return rl; + return NULL; +} + +/** + * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number + * @n: number for which to get the number of bytes for + * + * Return the number of bytes required to store @n unambiguously as + * a signed number. + * + * This is used in the context of the mapping pairs array to determine how + * many bytes will be needed in the array to store a given logical cluster + * number (lcn) or a specific run length. + * + * Return the number of bytes written. This function cannot fail. + */ +static inline int ntfs_get_nr_significant_bytes(const s64 n) +{ + s64 l = n; + int i; + s8 j; + + i = 0; + do { + l >>= 8; + i++; + } while (l != 0 && l != -1); + j = (n >> 8 * (i - 1)) & 0xff; + /* If the sign bit is wrong, we need an extra byte. */ + if ((n < 0 && j >= 0) || (n > 0 && j < 0)) + i++; + return i; +} + +/** + * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array + * @vol: ntfs volume (needed for the ntfs version) + * @rl: locked runlist to determine the size of the mapping pairs of + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array + * + * Walk the locked runlist @rl and calculate the size in bytes of the mapping + * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and + * finishing with vcn @last_vcn. + * + * A @last_vcn of -1 means end of runlist and in that case the size of the + * mapping pairs array corresponding to the runlist starting at vcn @first_vcn + * and finishing at the end of the runlist is determined. + * + * This for example allows us to allocate a buffer of the right size when + * building the mapping pairs array. + * + * If @rl is NULL, just return 1 (for the single terminator byte). + * + * Return the calculated size in bytes on success. On error, return -errno. + * The following error codes are defined: + * -EINVAL - Run list contains unmapped elements. Make sure to only pass + * fully mapped runlists to this function. + * -EIO - The runlist is corrupt. + * + * Locking: @rl must be locked on entry (either for reading or writing), it + * remains locked throughout, and is left locked upon return. + */ +int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn) +{ + LCN prev_lcn; + int rls; + bool the_end = false; + + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + if (!rl) { + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); + return 1; + } + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) + rl++; + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) + return -EINVAL; + prev_lcn = 0; + /* Always need the termining zero byte. */ + rls = 1; + /* Do the first partial run if present. */ + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; + + /* We know rl->length != 0 already. */ + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + delta = first_vcn - rl->vcn; + /* Header byte + length. */ + rls += 1 + ntfs_get_nr_significant_bytes(length - delta); + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just store the lcn. + * Note: this assumes that on NTFS 1.2-, holes are stored with + * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + prev_lcn = rl->lcn; + if (likely(rl->lcn >= 0)) + prev_lcn += delta; + /* Change in lcn. */ + rls += ntfs_get_nr_significant_bytes(prev_lcn); + } + /* Go to next runlist element. */ + rl++; + } + /* Do the full runs. */ + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + /* Header byte + length. */ + rls += 1 + ntfs_get_nr_significant_bytes(length); + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just store the lcn. + * Note: this assumes that on NTFS 1.2-, holes are stored with + * an lcn of -1 and not a delta_lcn of -1 (unless both are -1). + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + /* Change in lcn. */ + rls += ntfs_get_nr_significant_bytes(rl->lcn - + prev_lcn); + prev_lcn = rl->lcn; + } + } + return rls; +err_out: + if (rl->lcn == LCN_RL_NOT_MAPPED) + rls = -EINVAL; + else + rls = -EIO; + return rls; +} + +/** + * ntfs_write_significant_bytes - write the significant bytes of a number + * @dst: destination buffer to write to + * @dst_max: pointer to last byte of destination buffer for bounds checking + * @n: number whose significant bytes to write + * + * Store in @dst, the minimum bytes of the number @n which are required to + * identify @n unambiguously as a signed number, taking care not to exceed + * @dest_max, the maximum position within @dst to which we are allowed to + * write. + * + * This is used when building the mapping pairs array of a runlist to compress + * a given logical cluster number (lcn) or a specific run length to the minimum + * size possible. + * + * Return the number of bytes written on success. On error, i.e. the + * destination buffer @dst is too small, return -ENOSPC. + */ +static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max, + const s64 n) +{ + s64 l = n; + int i; + s8 j; + + i = 0; + do { + if (unlikely(dst > dst_max)) + goto err_out; + *dst++ = l & 0xffll; + l >>= 8; + i++; + } while (l != 0 && l != -1); + j = (n >> 8 * (i - 1)) & 0xff; + /* If the sign bit is wrong, we need an extra byte. */ + if (n < 0 && j >= 0) { + if (unlikely(dst > dst_max)) + goto err_out; + i++; + *dst = (s8)-1; + } else if (n > 0 && j < 0) { + if (unlikely(dst > dst_max)) + goto err_out; + i++; + *dst = (s8)0; + } + return i; +err_out: + return -ENOSPC; +} + +/** + * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist + * @vol: ntfs volume (needed for the ntfs version) + * @dst: destination buffer to which to write the mapping pairs array + * @dst_len: size of destination buffer @dst in bytes + * @rl: locked runlist for which to build the mapping pairs array + * @first_vcn: first vcn which to include in the mapping pairs array + * @last_vcn: last vcn which to include in the mapping pairs array + * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC + * + * Create the mapping pairs array from the locked runlist @rl, starting at vcn + * @first_vcn and finishing with vcn @last_vcn and save the array in @dst. + * @dst_len is the size of @dst in bytes and it should be at least equal to the + * value obtained by calling ntfs_get_size_for_mapping_pairs(). + * + * A @last_vcn of -1 means end of runlist and in that case the mapping pairs + * array corresponding to the runlist starting at vcn @first_vcn and finishing + * at the end of the runlist is created. + * + * If @rl is NULL, just write a single terminator byte to @dst. + * + * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to + * the first vcn outside the destination buffer. Note that on error, @dst has + * been filled with all the mapping pairs that will fit, thus it can be treated + * as partial success, in that a new attribute extent needs to be created or + * the next extent has to be used and the mapping pairs build has to be + * continued with @first_vcn set to *@stop_vcn. + * + * Return 0 on success and -errno on error. The following error codes are + * defined: + * -EINVAL - Run list contains unmapped elements. Make sure to only pass + * fully mapped runlists to this function. + * -EIO - The runlist is corrupt. + * -ENOSPC - The destination buffer is too small. + * + * Locking: @rl must be locked on entry (either for reading or writing), it + * remains locked throughout, and is left locked upon return. + */ +int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, + const int dst_len, const runlist_element *rl, + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn) +{ + LCN prev_lcn; + s8 *dst_max, *dst_next; + int err = -ENOSPC; + bool the_end = false; + s8 len_len, lcn_len; + + BUG_ON(first_vcn < 0); + BUG_ON(last_vcn < -1); + BUG_ON(last_vcn >= 0 && first_vcn > last_vcn); + BUG_ON(dst_len < 1); + if (!rl) { + BUG_ON(first_vcn); + BUG_ON(last_vcn > 0); + if (stop_vcn) + *stop_vcn = 0; + /* Terminator byte. */ + *dst = 0; + return 0; + } + /* Skip to runlist element containing @first_vcn. */ + while (rl->length && first_vcn >= rl[1].vcn) + rl++; + if (unlikely((!rl->length && first_vcn > rl->vcn) || + first_vcn < rl->vcn)) + return -EINVAL; + /* + * @dst_max is used for bounds checking in + * ntfs_write_significant_bytes(). + */ + dst_max = dst + dst_len - 1; + prev_lcn = 0; + /* Do the first partial run if present. */ + if (first_vcn > rl->vcn) { + s64 delta, length = rl->length; + + /* We know rl->length != 0 already. */ + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + delta = first_vcn - rl->vcn; + /* Write length. */ + len_len = ntfs_write_significant_bytes(dst + 1, dst_max, + length - delta); + if (unlikely(len_len < 0)) + goto size_err; + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just write the lcn + * change. FIXME: Do we need to write the lcn change or just + * the lcn in that case? Not sure as I have never seen this + * case on NT4. - We assume that we just need to write the lcn + * change until someone tells us otherwise... (AIA) + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + prev_lcn = rl->lcn; + if (likely(rl->lcn >= 0)) + prev_lcn += delta; + /* Write change in lcn. */ + lcn_len = ntfs_write_significant_bytes(dst + 1 + + len_len, dst_max, prev_lcn); + if (unlikely(lcn_len < 0)) + goto size_err; + } else + lcn_len = 0; + dst_next = dst + len_len + lcn_len + 1; + if (unlikely(dst_next > dst_max)) + goto size_err; + /* Update header byte. */ + *dst = lcn_len << 4 | len_len; + /* Position at next mapping pairs array element. */ + dst = dst_next; + /* Go to next runlist element. */ + rl++; + } + /* Do the full runs. */ + for (; rl->length && !the_end; rl++) { + s64 length = rl->length; + + if (unlikely(length < 0 || rl->lcn < LCN_HOLE)) + goto err_out; + /* + * If @stop_vcn is given and finishes inside this run, cap the + * run length. + */ + if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) { + s64 s1 = last_vcn + 1; + if (unlikely(rl[1].vcn > s1)) + length = s1 - rl->vcn; + the_end = true; + } + /* Write length. */ + len_len = ntfs_write_significant_bytes(dst + 1, dst_max, + length); + if (unlikely(len_len < 0)) + goto size_err; + /* + * If the logical cluster number (lcn) denotes a hole and we + * are on NTFS 3.0+, we don't store it at all, i.e. we need + * zero space. On earlier NTFS versions we just write the lcn + * change. FIXME: Do we need to write the lcn change or just + * the lcn in that case? Not sure as I have never seen this + * case on NT4. - We assume that we just need to write the lcn + * change until someone tells us otherwise... (AIA) + */ + if (likely(rl->lcn >= 0 || vol->major_ver < 3)) { + /* Write change in lcn. */ + lcn_len = ntfs_write_significant_bytes(dst + 1 + + len_len, dst_max, rl->lcn - prev_lcn); + if (unlikely(lcn_len < 0)) + goto size_err; + prev_lcn = rl->lcn; + } else + lcn_len = 0; + dst_next = dst + len_len + lcn_len + 1; + if (unlikely(dst_next > dst_max)) + goto size_err; + /* Update header byte. */ + *dst = lcn_len << 4 | len_len; + /* Position at next mapping pairs array element. */ + dst = dst_next; + } + /* Success. */ + err = 0; +size_err: + /* Set stop vcn. */ + if (stop_vcn) + *stop_vcn = rl->vcn; + /* Add terminator byte. */ + *dst = 0; + return err; +err_out: + if (rl->lcn == LCN_RL_NOT_MAPPED) + err = -EINVAL; + else + err = -EIO; + return err; +} + +/** + * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn + * @vol: ntfs volume (needed for error output) + * @runlist: runlist to truncate + * @new_length: the new length of the runlist in VCNs + * + * Truncate the runlist described by @runlist as well as the memory buffer + * holding the runlist elements to a length of @new_length VCNs. + * + * If @new_length lies within the runlist, the runlist elements with VCNs of + * @new_length and above are discarded. As a special case if @new_length is + * zero, the runlist is discarded and set to NULL. + * + * If @new_length lies beyond the runlist, a sparse runlist element is added to + * the end of the runlist @runlist or if the last runlist element is a sparse + * one already, this is extended. + * + * Note, no checking is done for unmapped runlist elements. It is assumed that + * the caller has mapped any elements that need to be mapped already. + * + * Return 0 on success and -errno on error. + * + * Locking: The caller must hold @runlist->lock for writing. + */ +int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist, + const s64 new_length) +{ + runlist_element *rl; + int old_size; + + ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length); + BUG_ON(!runlist); + BUG_ON(new_length < 0); + rl = runlist->rl; + if (!new_length) { + ntfs_debug("Freeing runlist."); + runlist->rl = NULL; + if (rl) + ntfs_free(rl); + return 0; + } + if (unlikely(!rl)) { + /* + * Create a runlist consisting of a sparse runlist element of + * length @new_length followed by a terminator runlist element. + */ + rl = ntfs_malloc_nofs(PAGE_SIZE); + if (unlikely(!rl)) { + ntfs_error(vol->sb, "Not enough memory to allocate " + "runlist element buffer."); + return -ENOMEM; + } + runlist->rl = rl; + rl[1].length = rl->vcn = 0; + rl->lcn = LCN_HOLE; + rl[1].vcn = rl->length = new_length; + rl[1].lcn = LCN_ENOENT; + return 0; + } + BUG_ON(new_length < rl->vcn); + /* Find @new_length in the runlist. */ + while (likely(rl->length && new_length >= rl[1].vcn)) + rl++; + /* + * If not at the end of the runlist we need to shrink it. + * If at the end of the runlist we need to expand it. + */ + if (rl->length) { + runlist_element *trl; + bool is_end; + + ntfs_debug("Shrinking runlist."); + /* Determine the runlist size. */ + trl = rl + 1; + while (likely(trl->length)) + trl++; + old_size = trl - runlist->rl + 1; + /* Truncate the run. */ + rl->length = new_length - rl->vcn; + /* + * If a run was partially truncated, make the following runlist + * element a terminator. + */ + is_end = false; + if (rl->length) { + rl++; + if (!rl->length) + is_end = true; + rl->vcn = new_length; + rl->length = 0; + } + rl->lcn = LCN_ENOENT; + /* Reallocate memory if necessary. */ + if (!is_end) { + int new_size = rl - runlist->rl + 1; + rl = ntfs_rl_realloc(runlist->rl, old_size, new_size); + if (IS_ERR(rl)) + ntfs_warning(vol->sb, "Failed to shrink " + "runlist buffer. This just " + "wastes a bit of memory " + "temporarily so we ignore it " + "and return success."); + else + runlist->rl = rl; + } + } else if (likely(/* !rl->length && */ new_length > rl->vcn)) { + ntfs_debug("Expanding runlist."); + /* + * If there is a previous runlist element and it is a sparse + * one, extend it. Otherwise need to add a new, sparse runlist + * element. + */ + if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE)) + (rl - 1)->length = new_length - (rl - 1)->vcn; + else { + /* Determine the runlist size. */ + old_size = rl - runlist->rl + 1; + /* Reallocate memory if necessary. */ + rl = ntfs_rl_realloc(runlist->rl, old_size, + old_size + 1); + if (IS_ERR(rl)) { + ntfs_error(vol->sb, "Failed to expand runlist " + "buffer, aborting."); + return PTR_ERR(rl); + } + runlist->rl = rl; + /* + * Set @rl to the same runlist element in the new + * runlist as before in the old runlist. + */ + rl += old_size - 1; + /* Add a new, sparse runlist element. */ + rl->lcn = LCN_HOLE; + rl->length = new_length - rl->vcn; + /* Add a new terminator runlist element. */ + rl++; + rl->length = 0; + } + rl->vcn = new_length; + rl->lcn = LCN_ENOENT; + } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ { + /* Runlist already has same size as requested. */ + rl->lcn = LCN_ENOENT; + } + ntfs_debug("Done."); + return 0; +} + +/** + * ntfs_rl_punch_nolock - punch a hole into a runlist + * @vol: ntfs volume (needed for error output) + * @runlist: runlist to punch a hole into + * @start: starting VCN of the hole to be created + * @length: size of the hole to be created in units of clusters + * + * Punch a hole into the runlist @runlist starting at VCN @start and of size + * @length clusters. + * + * Return 0 on success and -errno on error, in which case @runlist has not been + * modified. + * + * If @start and/or @start + @length are outside the runlist return error code + * -ENOENT. + * + * If the runlist contains unmapped or error elements between @start and @start + * + @length return error code -EINVAL. + * + * Locking: The caller must hold @runlist->lock for writing. + */ +int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, + const VCN start, const s64 length) +{ + const VCN end = start + length; + s64 delta; + runlist_element *rl, *rl_end, *rl_real_end, *trl; + int old_size; + bool lcn_fixup = false; + + ntfs_debug("Entering for start 0x%llx, length 0x%llx.", + (long long)start, (long long)length); + BUG_ON(!runlist); + BUG_ON(start < 0); + BUG_ON(length < 0); + BUG_ON(end < 0); + rl = runlist->rl; + if (unlikely(!rl)) { + if (likely(!start && !length)) + return 0; + return -ENOENT; + } + /* Find @start in the runlist. */ + while (likely(rl->length && start >= rl[1].vcn)) + rl++; + rl_end = rl; + /* Find @end in the runlist. */ + while (likely(rl_end->length && end >= rl_end[1].vcn)) { + /* Verify there are no unmapped or error elements. */ + if (unlikely(rl_end->lcn < LCN_HOLE)) + return -EINVAL; + rl_end++; + } + /* Check the last element. */ + if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE)) + return -EINVAL; + /* This covers @start being out of bounds, too. */ + if (!rl_end->length && end > rl_end->vcn) + return -ENOENT; + if (!length) + return 0; + if (!rl->length) + return -ENOENT; + rl_real_end = rl_end; + /* Determine the runlist size. */ + while (likely(rl_real_end->length)) + rl_real_end++; + old_size = rl_real_end - runlist->rl + 1; + /* If @start is in a hole simply extend the hole. */ + if (rl->lcn == LCN_HOLE) { + /* + * If both @start and @end are in the same sparse run, we are + * done. + */ + if (end <= rl[1].vcn) { + ntfs_debug("Done (requested hole is already sparse)."); + return 0; + } +extend_hole: + /* Extend the hole. */ + rl->length = end - rl->vcn; + /* If @end is in a hole, merge it with the current one. */ + if (rl_end->lcn == LCN_HOLE) { + rl_end++; + rl->length = rl_end->vcn - rl->vcn; + } + /* We have done the hole. Now deal with the remaining tail. */ + rl++; + /* Cut out all runlist elements up to @end. */ + if (rl < rl_end) + memmove(rl, rl_end, (rl_real_end - rl_end + 1) * + sizeof(*rl)); + /* Adjust the beginning of the tail if necessary. */ + if (end > rl->vcn) { + delta = end - rl->vcn; + rl->vcn = end; + rl->length -= delta; + /* Only adjust the lcn if it is real. */ + if (rl->lcn >= 0) + rl->lcn += delta; + } +shrink_allocation: + /* Reallocate memory if the allocation changed. */ + if (rl < rl_end) { + rl = ntfs_rl_realloc(runlist->rl, old_size, + old_size - (rl_end - rl)); + if (IS_ERR(rl)) + ntfs_warning(vol->sb, "Failed to shrink " + "runlist buffer. This just " + "wastes a bit of memory " + "temporarily so we ignore it " + "and return success."); + else + runlist->rl = rl; + } + ntfs_debug("Done (extend hole)."); + return 0; + } + /* + * If @start is at the beginning of a run things are easier as there is + * no need to split the first run. + */ + if (start == rl->vcn) { + /* + * @start is at the beginning of a run. + * + * If the previous run is sparse, extend its hole. + * + * If @end is not in the same run, switch the run to be sparse + * and extend the newly created hole. + * + * Thus both of these cases reduce the problem to the above + * case of "@start is in a hole". + */ + if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) { + rl--; + goto extend_hole; + } + if (end >= rl[1].vcn) { + rl->lcn = LCN_HOLE; + goto extend_hole; + } + /* + * The final case is when @end is in the same run as @start. + * For this need to split the run into two. One run for the + * sparse region between the beginning of the old run, i.e. + * @start, and @end and one for the remaining non-sparse + * region, i.e. between @end and the end of the old run. + */ + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); + if (IS_ERR(trl)) + goto enomem_out; + old_size++; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } +split_end: + /* Shift all the runs up by one. */ + memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl)); + /* Finally, setup the two split runs. */ + rl->lcn = LCN_HOLE; + rl->length = length; + rl++; + rl->vcn += length; + /* Only adjust the lcn if it is real. */ + if (rl->lcn >= 0 || lcn_fixup) + rl->lcn += length; + rl->length -= length; + ntfs_debug("Done (split one)."); + return 0; + } + /* + * @start is neither in a hole nor at the beginning of a run. + * + * If @end is in a hole, things are easier as simply truncating the run + * @start is in to end at @start - 1, deleting all runs after that up + * to @end, and finally extending the beginning of the run @end is in + * to be @start is all that is needed. + */ + if (rl_end->lcn == LCN_HOLE) { + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + /* Cut out all runlist elements up to @end. */ + if (rl < rl_end) + memmove(rl, rl_end, (rl_real_end - rl_end + 1) * + sizeof(*rl)); + /* Extend the beginning of the run @end is in to be @start. */ + rl->vcn = start; + rl->length = rl[1].vcn - start; + goto shrink_allocation; + } + /* + * If @end is not in a hole there are still two cases to distinguish. + * Either @end is or is not in the same run as @start. + * + * The second case is easier as it can be reduced to an already solved + * problem by truncating the run @start is in to end at @start - 1. + * Then, if @end is in the next run need to split the run into a sparse + * run followed by a non-sparse run (already covered above) and if @end + * is not in the next run switching it to be sparse, again reduces the + * problem to the already covered case of "@start is in a hole". + */ + if (end >= rl[1].vcn) { + /* + * If @end is not in the next run, reduce the problem to the + * case of "@start is in a hole". + */ + if (rl[1].length && end >= rl[2].vcn) { + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + rl->vcn = start; + rl->lcn = LCN_HOLE; + goto extend_hole; + } + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1); + if (IS_ERR(trl)) + goto enomem_out; + old_size++; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } + /* Truncate the run containing @start. */ + rl->length = start - rl->vcn; + rl++; + /* + * @end is in the next run, reduce the problem to the case + * where "@start is at the beginning of a run and @end is in + * the same run as @start". + */ + delta = rl->vcn - start; + rl->vcn = start; + if (rl->lcn >= 0) { + rl->lcn -= delta; + /* Need this in case the lcn just became negative. */ + lcn_fixup = true; + } + rl->length += delta; + goto split_end; + } + /* + * The first case from above, i.e. @end is in the same run as @start. + * We need to split the run into three. One run for the non-sparse + * region between the beginning of the old run and @start, one for the + * sparse region between @start and @end, and one for the remaining + * non-sparse region, i.e. between @end and the end of the old run. + */ + trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2); + if (IS_ERR(trl)) + goto enomem_out; + old_size += 2; + if (runlist->rl != trl) { + rl = trl + (rl - runlist->rl); + rl_end = trl + (rl_end - runlist->rl); + rl_real_end = trl + (rl_real_end - runlist->rl); + runlist->rl = trl; + } + /* Shift all the runs up by two. */ + memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl)); + /* Finally, setup the three split runs. */ + rl->length = start - rl->vcn; + rl++; + rl->vcn = start; + rl->lcn = LCN_HOLE; + rl->length = length; + rl++; + delta = end - rl->vcn; + rl->vcn = end; + rl->lcn += delta; + rl->length -= delta; + ntfs_debug("Done (split both)."); + return 0; +enomem_out: + ntfs_error(vol->sb, "Not enough memory to extend runlist buffer."); + return -ENOMEM; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h new file mode 100644 index 000000000..38de0a375 --- /dev/null +++ b/fs/ntfs/runlist.h @@ -0,0 +1,88 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * runlist.h - Defines for runlist handling in NTFS Linux kernel driver. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_RUNLIST_H +#define _LINUX_NTFS_RUNLIST_H + +#include "types.h" +#include "layout.h" +#include "volume.h" + +/** + * runlist_element - in memory vcn to lcn mapping array element + * @vcn: starting vcn of the current array element + * @lcn: starting lcn of the current array element + * @length: length in clusters of the current array element + * + * The last vcn (in fact the last vcn + 1) is reached when length == 0. + * + * When lcn == -1 this means that the count vcns starting at vcn are not + * physically allocated (i.e. this is a hole / data is sparse). + */ +typedef struct { /* In memory vcn to lcn mapping structure element. */ + VCN vcn; /* vcn = Starting virtual cluster number. */ + LCN lcn; /* lcn = Starting logical cluster number. */ + s64 length; /* Run length in clusters. */ +} runlist_element; + +/** + * runlist - in memory vcn to lcn mapping array including a read/write lock + * @rl: pointer to an array of runlist elements + * @lock: read/write spinlock for serializing access to @rl + * + */ +typedef struct { + runlist_element *rl; + struct rw_semaphore lock; +} runlist; + +static inline void ntfs_init_runlist(runlist *rl) +{ + rl->rl = NULL; + init_rwsem(&rl->lock); +} + +typedef enum { + LCN_HOLE = -1, /* Keep this as highest value or die! */ + LCN_RL_NOT_MAPPED = -2, + LCN_ENOENT = -3, + LCN_ENOMEM = -4, + LCN_EIO = -5, +} LCN_SPECIAL_VALUES; + +extern runlist_element *ntfs_runlists_merge(runlist_element *drl, + runlist_element *srl); + +extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol, + const ATTR_RECORD *attr, runlist_element *old_rl); + +extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn); + +#ifdef NTFS_RW + +extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, + const VCN vcn); + +extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol, + const runlist_element *rl, const VCN first_vcn, + const VCN last_vcn); + +extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst, + const int dst_len, const runlist_element *rl, + const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn); + +extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol, + runlist *const runlist, const s64 new_length); + +int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist, + const VCN start, const s64 length); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_RUNLIST_H */ diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c new file mode 100644 index 000000000..001f4e053 --- /dev/null +++ b/fs/ntfs/super.c @@ -0,0 +1,3194 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001,2002 Richard Russon + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/stddef.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> /* For bdev_logical_block_size(). */ +#include <linux/backing-dev.h> +#include <linux/buffer_head.h> +#include <linux/vfs.h> +#include <linux/moduleparam.h> +#include <linux/bitmap.h> + +#include "sysctl.h" +#include "logfile.h" +#include "quota.h" +#include "usnjrnl.h" +#include "dir.h" +#include "debug.h" +#include "index.h" +#include "inode.h" +#include "aops.h" +#include "layout.h" +#include "malloc.h" +#include "ntfs.h" + +/* Number of mounted filesystems which have compression enabled. */ +static unsigned long ntfs_nr_compression_users; + +/* A global default upcase table and a corresponding reference count. */ +static ntfschar *default_upcase; +static unsigned long ntfs_nr_upcase_users; + +/* Error constants/strings used in inode.c::ntfs_show_options(). */ +typedef enum { + /* One of these must be present, default is ON_ERRORS_CONTINUE. */ + ON_ERRORS_PANIC = 0x01, + ON_ERRORS_REMOUNT_RO = 0x02, + ON_ERRORS_CONTINUE = 0x04, + /* Optional, can be combined with any of the above. */ + ON_ERRORS_RECOVER = 0x10, +} ON_ERRORS_ACTIONS; + +const option_t on_errors_arr[] = { + { ON_ERRORS_PANIC, "panic" }, + { ON_ERRORS_REMOUNT_RO, "remount-ro", }, + { ON_ERRORS_CONTINUE, "continue", }, + { ON_ERRORS_RECOVER, "recover" }, + { 0, NULL } +}; + +/** + * simple_getbool - + * + * Copied from old ntfs driver (which copied from vfat driver). + */ +static int simple_getbool(char *s, bool *setval) +{ + if (s) { + if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true")) + *setval = true; + else if (!strcmp(s, "0") || !strcmp(s, "no") || + !strcmp(s, "false")) + *setval = false; + else + return 0; + } else + *setval = true; + return 1; +} + +/** + * parse_options - parse the (re)mount options + * @vol: ntfs volume + * @opt: string containing the (re)mount options + * + * Parse the recognized options in @opt for the ntfs volume described by @vol. + */ +static bool parse_options(ntfs_volume *vol, char *opt) +{ + char *p, *v, *ov; + static char *utf8 = "utf8"; + int errors = 0, sloppy = 0; + kuid_t uid = INVALID_UID; + kgid_t gid = INVALID_GID; + umode_t fmask = (umode_t)-1, dmask = (umode_t)-1; + int mft_zone_multiplier = -1, on_errors = -1; + int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1; + struct nls_table *nls_map = NULL, *old_nls; + + /* I am lazy... (-8 */ +#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + variable = default_value; \ + else { \ + variable = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + } \ + } +#define NTFS_GETOPT(option, variable) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + goto needs_arg; \ + variable = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + } +#define NTFS_GETOPT_UID(option, variable) \ + if (!strcmp(p, option)) { \ + uid_t uid_value; \ + if (!v || !*v) \ + goto needs_arg; \ + uid_value = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + variable = make_kuid(current_user_ns(), uid_value); \ + if (!uid_valid(variable)) \ + goto needs_val; \ + } +#define NTFS_GETOPT_GID(option, variable) \ + if (!strcmp(p, option)) { \ + gid_t gid_value; \ + if (!v || !*v) \ + goto needs_arg; \ + gid_value = simple_strtoul(ov = v, &v, 0); \ + if (*v) \ + goto needs_val; \ + variable = make_kgid(current_user_ns(), gid_value); \ + if (!gid_valid(variable)) \ + goto needs_val; \ + } +#define NTFS_GETOPT_OCTAL(option, variable) \ + if (!strcmp(p, option)) { \ + if (!v || !*v) \ + goto needs_arg; \ + variable = simple_strtoul(ov = v, &v, 8); \ + if (*v) \ + goto needs_val; \ + } +#define NTFS_GETOPT_BOOL(option, variable) \ + if (!strcmp(p, option)) { \ + bool val; \ + if (!simple_getbool(v, &val)) \ + goto needs_bool; \ + variable = val; \ + } +#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \ + if (!strcmp(p, option)) { \ + int _i; \ + if (!v || !*v) \ + goto needs_arg; \ + ov = v; \ + if (variable == -1) \ + variable = 0; \ + for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \ + if (!strcmp(opt_array[_i].str, v)) { \ + variable |= opt_array[_i].val; \ + break; \ + } \ + if (!opt_array[_i].str || !*opt_array[_i].str) \ + goto needs_val; \ + } + if (!opt || !*opt) + goto no_mount_options; + ntfs_debug("Entering with mount options string: %s", opt); + while ((p = strsep(&opt, ","))) { + if ((v = strchr(p, '='))) + *v++ = 0; + NTFS_GETOPT_UID("uid", uid) + else NTFS_GETOPT_GID("gid", gid) + else NTFS_GETOPT_OCTAL("umask", fmask = dmask) + else NTFS_GETOPT_OCTAL("fmask", fmask) + else NTFS_GETOPT_OCTAL("dmask", dmask) + else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier) + else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true) + else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files) + else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive) + else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse) + else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors, + on_errors_arr) + else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes")) + ntfs_warning(vol->sb, "Ignoring obsolete option %s.", + p); + else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) { + if (!strcmp(p, "iocharset")) + ntfs_warning(vol->sb, "Option iocharset is " + "deprecated. Please use " + "option nls=<charsetname> in " + "the future."); + if (!v || !*v) + goto needs_arg; +use_utf8: + old_nls = nls_map; + nls_map = load_nls(v); + if (!nls_map) { + if (!old_nls) { + ntfs_error(vol->sb, "NLS character set " + "%s not found.", v); + return false; + } + ntfs_error(vol->sb, "NLS character set %s not " + "found. Using previous one %s.", + v, old_nls->charset); + nls_map = old_nls; + } else /* nls_map */ { + unload_nls(old_nls); + } + } else if (!strcmp(p, "utf8")) { + bool val = false; + ntfs_warning(vol->sb, "Option utf8 is no longer " + "supported, using option nls=utf8. Please " + "use option nls=utf8 in the future and " + "make sure utf8 is compiled either as a " + "module or into the kernel."); + if (!v || !*v) + val = true; + else if (!simple_getbool(v, &val)) + goto needs_bool; + if (val) { + v = utf8; + goto use_utf8; + } + } else { + ntfs_error(vol->sb, "Unrecognized mount option %s.", p); + if (errors < INT_MAX) + errors++; + } +#undef NTFS_GETOPT_OPTIONS_ARRAY +#undef NTFS_GETOPT_BOOL +#undef NTFS_GETOPT +#undef NTFS_GETOPT_WITH_DEFAULT + } +no_mount_options: + if (errors && !sloppy) + return false; + if (sloppy) + ntfs_warning(vol->sb, "Sloppy option given. Ignoring " + "unrecognized mount option(s) and continuing."); + /* Keep this first! */ + if (on_errors != -1) { + if (!on_errors) { + ntfs_error(vol->sb, "Invalid errors option argument " + "or bug in options parser."); + return false; + } + } + if (nls_map) { + if (vol->nls_map && vol->nls_map != nls_map) { + ntfs_error(vol->sb, "Cannot change NLS character set " + "on remount."); + return false; + } /* else (!vol->nls_map) */ + ntfs_debug("Using NLS character set %s.", nls_map->charset); + vol->nls_map = nls_map; + } else /* (!nls_map) */ { + if (!vol->nls_map) { + vol->nls_map = load_nls_default(); + if (!vol->nls_map) { + ntfs_error(vol->sb, "Failed to load default " + "NLS character set."); + return false; + } + ntfs_debug("Using default NLS character set (%s).", + vol->nls_map->charset); + } + } + if (mft_zone_multiplier != -1) { + if (vol->mft_zone_multiplier && vol->mft_zone_multiplier != + mft_zone_multiplier) { + ntfs_error(vol->sb, "Cannot change mft_zone_multiplier " + "on remount."); + return false; + } + if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) { + ntfs_error(vol->sb, "Invalid mft_zone_multiplier. " + "Using default value, i.e. 1."); + mft_zone_multiplier = 1; + } + vol->mft_zone_multiplier = mft_zone_multiplier; + } + if (!vol->mft_zone_multiplier) + vol->mft_zone_multiplier = 1; + if (on_errors != -1) + vol->on_errors = on_errors; + if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER) + vol->on_errors |= ON_ERRORS_CONTINUE; + if (uid_valid(uid)) + vol->uid = uid; + if (gid_valid(gid)) + vol->gid = gid; + if (fmask != (umode_t)-1) + vol->fmask = fmask; + if (dmask != (umode_t)-1) + vol->dmask = dmask; + if (show_sys_files != -1) { + if (show_sys_files) + NVolSetShowSystemFiles(vol); + else + NVolClearShowSystemFiles(vol); + } + if (case_sensitive != -1) { + if (case_sensitive) + NVolSetCaseSensitive(vol); + else + NVolClearCaseSensitive(vol); + } + if (disable_sparse != -1) { + if (disable_sparse) + NVolClearSparseEnabled(vol); + else { + if (!NVolSparseEnabled(vol) && + vol->major_ver && vol->major_ver < 3) + ntfs_warning(vol->sb, "Not enabling sparse " + "support due to NTFS volume " + "version %i.%i (need at least " + "version 3.0).", vol->major_ver, + vol->minor_ver); + else + NVolSetSparseEnabled(vol); + } + } + return true; +needs_arg: + ntfs_error(vol->sb, "The %s option requires an argument.", p); + return false; +needs_bool: + ntfs_error(vol->sb, "The %s option requires a boolean argument.", p); + return false; +needs_val: + ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov); + return false; +} + +#ifdef NTFS_RW + +/** + * ntfs_write_volume_flags - write new flags to the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: new flags value for the volume information flags + * + * Internal function. You probably want to use ntfs_{set,clear}_volume_flags() + * instead (see below). + * + * Replace the volume information flags on the volume @vol with the value + * supplied in @flags. Note, this overwrites the volume information flags, so + * make sure to combine the flags you want to modify with the old flags and use + * the result when calling ntfs_write_volume_flags(). + * + * Return 0 on success and -errno on error. + */ +static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags) +{ + ntfs_inode *ni = NTFS_I(vol->vol_ino); + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + ntfs_attr_search_ctx *ctx; + int err; + + ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.", + le16_to_cpu(vol->vol_flags), le16_to_cpu(flags)); + if (vol->vol_flags == flags) + goto done; + BUG_ON(!ni); + m = map_mft_record(ni); + if (IS_ERR(m)) { + err = PTR_ERR(m); + goto err_out; + } + ctx = ntfs_attr_get_search_ctx(ni, m); + if (!ctx) { + err = -ENOMEM; + goto put_unm_err_out; + } + err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx); + if (err) + goto put_unm_err_out; + vi = (VOLUME_INFORMATION*)((u8*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + vol->vol_flags = vi->flags = flags; + flush_dcache_mft_record_page(ctx->ntfs_ino); + mark_mft_record_dirty(ctx->ntfs_ino); + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); +done: + ntfs_debug("Done."); + return 0; +put_unm_err_out: + if (ctx) + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(ni); +err_out: + ntfs_error(vol->sb, "Failed with error code %i.", -err); + return err; +} + +/** + * ntfs_set_volume_flags - set bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to set on the volume + * + * Set the bits in @flags in the volume information flags on the volume @vol. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + return ntfs_write_volume_flags(vol, vol->vol_flags | flags); +} + +/** + * ntfs_clear_volume_flags - clear bits in the volume information flags + * @vol: ntfs volume on which to modify the flags + * @flags: flags to clear on the volume + * + * Clear the bits in @flags in the volume information flags on the volume @vol. + * + * Return 0 on success and -errno on error. + */ +static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags) +{ + flags &= VOLUME_FLAGS_MASK; + flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags)); + return ntfs_write_volume_flags(vol, flags); +} + +#endif /* NTFS_RW */ + +/** + * ntfs_remount - change the mount options of a mounted ntfs filesystem + * @sb: superblock of mounted ntfs filesystem + * @flags: remount flags + * @opt: remount options string + * + * Change the mount options of an already mounted ntfs filesystem. + * + * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after + * ntfs_remount() returns successfully (i.e. returns 0). Otherwise, + * @sb->s_flags are not changed. + */ +static int ntfs_remount(struct super_block *sb, int *flags, char *opt) +{ + ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering with remount options string: %s", opt); + + sync_filesystem(sb); + +#ifndef NTFS_RW + /* For read-only compiled driver, enforce read-only flag. */ + *flags |= SB_RDONLY; +#else /* NTFS_RW */ + /* + * For the read-write compiled driver, if we are remounting read-write, + * make sure there are no volume errors and that no unsupported volume + * flags are set. Also, empty the logfile journal as it would become + * stale as soon as something is written to the volume and mark the + * volume dirty so that chkdsk is run if the volume is not umounted + * cleanly. Finally, mark the quotas out of date so Windows rescans + * the volume on boot and updates them. + * + * When remounting read-only, mark the volume clean if no volume errors + * have occurred. + */ + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { + static const char *es = ". Cannot remount read-write."; + + /* Remounting read-write. */ + if (NVolErrors(vol)) { + ntfs_error(sb, "Volume has errors and is read-only%s", + es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_IS_DIRTY) { + ntfs_error(sb, "Volume is dirty and read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + ntfs_error(sb, "Volume has been modified by chkdsk " + "and is read-only%s", es); + return -EROFS; + } + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + ntfs_error(sb, "Volume has unsupported flags set " + "(0x%x) and is read-only%s", + (unsigned)le16_to_cpu(vol->vol_flags), + es); + return -EROFS; + } + if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + ntfs_error(sb, "Failed to set dirty bit in volume " + "information flags%s", es); + return -EROFS; + } +#if 0 + // TODO: Enable this code once we start modifying anything that + // is different between NTFS 1.2 and 3.x... + /* Set NT4 compatibility flag on newer NTFS version volumes. */ + if ((vol->major_ver > 1)) { + if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + ntfs_error(sb, "Failed to set NT4 " + "compatibility flag%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } +#endif + if (!ntfs_empty_logfile(vol->logfile_ino)) { + ntfs_error(sb, "Failed to empty journal $LogFile%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_mark_quotas_out_of_date(vol)) { + ntfs_error(sb, "Failed to mark quotas out of date%s", + es); + NVolSetErrors(vol); + return -EROFS; + } + if (!ntfs_stamp_usnjrnl(vol)) { + ntfs_error(sb, "Failed to stamp transaction log " + "($UsnJrnl)%s", es); + NVolSetErrors(vol); + return -EROFS; + } + } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { + /* Remounting read-only. */ + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + } + } +#endif /* NTFS_RW */ + + // TODO: Deal with *flags. + + if (!parse_options(vol, opt)) + return -EINVAL; + + ntfs_debug("Done."); + return 0; +} + +/** + * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector + * @sb: Super block of the device to which @b belongs. + * @b: Boot sector of device @sb to check. + * @silent: If 'true', all output will be silenced. + * + * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot + * sector. Returns 'true' if it is valid and 'false' if not. + * + * @sb is only needed for warning/error output, i.e. it can be NULL when silent + * is 'true'. + */ +static bool is_boot_sector_ntfs(const struct super_block *sb, + const NTFS_BOOT_SECTOR *b, const bool silent) +{ + /* + * Check that checksum == sum of u32 values from b to the checksum + * field. If checksum is zero, no checking is done. We will work when + * the checksum test fails, since some utilities update the boot sector + * ignoring the checksum which leaves the checksum out-of-date. We + * report a warning if this is the case. + */ + if ((void*)b < (void*)&b->checksum && b->checksum && !silent) { + le32 *u; + u32 i; + + for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u) + i += le32_to_cpup(u); + if (le32_to_cpu(b->checksum) != i) + ntfs_warning(sb, "Invalid boot sector checksum."); + } + /* Check OEMidentifier is "NTFS " */ + if (b->oem_id != magicNTFS) + goto not_ntfs; + /* Check bytes per sector value is between 256 and 4096. */ + if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 || + le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000) + goto not_ntfs; + /* Check sectors per cluster value is valid. */ + switch (b->bpb.sectors_per_cluster) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128: + break; + default: + goto not_ntfs; + } + /* Check the cluster size is not above the maximum (64kiB). */ + if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) * + b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE) + goto not_ntfs; + /* Check reserved/unused fields are really zero. */ + if (le16_to_cpu(b->bpb.reserved_sectors) || + le16_to_cpu(b->bpb.root_entries) || + le16_to_cpu(b->bpb.sectors) || + le16_to_cpu(b->bpb.sectors_per_fat) || + le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats) + goto not_ntfs; + /* Check clusters per file mft record value is valid. */ + if ((u8)b->clusters_per_mft_record < 0xe1 || + (u8)b->clusters_per_mft_record > 0xf7) + switch (b->clusters_per_mft_record) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: + break; + default: + goto not_ntfs; + } + /* Check clusters per index block value is valid. */ + if ((u8)b->clusters_per_index_record < 0xe1 || + (u8)b->clusters_per_index_record > 0xf7) + switch (b->clusters_per_index_record) { + case 1: case 2: case 4: case 8: case 16: case 32: case 64: + break; + default: + goto not_ntfs; + } + /* + * Check for valid end of sector marker. We will work without it, but + * many BIOSes will refuse to boot from a bootsector if the magic is + * incorrect, so we emit a warning. + */ + if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55)) + ntfs_warning(sb, "Invalid end of sector marker."); + return true; +not_ntfs: + return false; +} + +/** + * read_ntfs_boot_sector - read the NTFS boot sector of a device + * @sb: super block of device to read the boot sector from + * @silent: if true, suppress all output + * + * Reads the boot sector from the device and validates it. If that fails, tries + * to read the backup boot sector, first from the end of the device a-la NT4 and + * later and then from the middle of the device a-la NT3.51 and before. + * + * If a valid boot sector is found but it is not the primary boot sector, we + * repair the primary boot sector silently (unless the device is read-only or + * the primary boot sector is not accessible). + * + * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super + * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized + * to their respective values. + * + * Return the unlocked buffer head containing the boot sector or NULL on error. + */ +static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, + const int silent) +{ + const char *read_err_str = "Unable to read %s boot sector."; + struct buffer_head *bh_primary, *bh_backup; + sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; + + /* Try to read primary boot sector. */ + if ((bh_primary = sb_bread(sb, 0))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_primary->b_data, silent)) + return bh_primary; + if (!silent) + ntfs_error(sb, "Primary boot sector is invalid."); + } else if (!silent) + ntfs_error(sb, read_err_str, "primary"); + if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) { + if (bh_primary) + brelse(bh_primary); + if (!silent) + ntfs_error(sb, "Mount option errors=recover not used. " + "Aborting without trying to recover."); + return NULL; + } + /* Try to read NT4+ backup boot sector. */ + if ((bh_backup = sb_bread(sb, nr_blocks - 1))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_backup->b_data, silent)) + goto hotfix_primary_boot_sector; + brelse(bh_backup); + } else if (!silent) + ntfs_error(sb, read_err_str, "backup"); + /* Try to read NT3.51- backup boot sector. */ + if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) { + if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*) + bh_backup->b_data, silent)) + goto hotfix_primary_boot_sector; + if (!silent) + ntfs_error(sb, "Could not find a valid backup boot " + "sector."); + brelse(bh_backup); + } else if (!silent) + ntfs_error(sb, read_err_str, "backup"); + /* We failed. Cleanup and return. */ + if (bh_primary) + brelse(bh_primary); + return NULL; +hotfix_primary_boot_sector: + if (bh_primary) { + /* + * If we managed to read sector zero and the volume is not + * read-only, copy the found, valid backup boot sector to the + * primary boot sector. Note we only copy the actual boot + * sector structure, not the actual whole device sector as that + * may be bigger and would potentially damage the $Boot system + * file (FIXME: Would be nice to know if the backup boot sector + * on a large sector device contains the whole boot loader or + * just the first 512 bytes). + */ + if (!sb_rdonly(sb)) { + ntfs_warning(sb, "Hot-fix: Recovering invalid primary " + "boot sector from backup copy."); + memcpy(bh_primary->b_data, bh_backup->b_data, + NTFS_BLOCK_SIZE); + mark_buffer_dirty(bh_primary); + sync_dirty_buffer(bh_primary); + if (buffer_uptodate(bh_primary)) { + brelse(bh_backup); + return bh_primary; + } + ntfs_error(sb, "Hot-fix: Device write error while " + "recovering primary boot sector."); + } else { + ntfs_warning(sb, "Hot-fix: Recovery of primary boot " + "sector failed: Read-only mount."); + } + brelse(bh_primary); + } + ntfs_warning(sb, "Using backup boot sector."); + return bh_backup; +} + +/** + * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol + * @vol: volume structure to initialise with data from boot sector + * @b: boot sector to parse + * + * Parse the ntfs boot sector @b and store all imporant information therein in + * the ntfs super block @vol. Return 'true' on success and 'false' on error. + */ +static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) +{ + unsigned int sectors_per_cluster_bits, nr_hidden_sects; + int clusters_per_mft_record, clusters_per_index_record; + s64 ll; + + vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector); + vol->sector_size_bits = ffs(vol->sector_size) - 1; + ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size, + vol->sector_size); + ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, + vol->sector_size_bits); + if (vol->sector_size < vol->sb->s_blocksize) { + ntfs_error(vol->sb, "Sector size (%i) is smaller than the " + "device block size (%lu). This is not " + "supported. Sorry.", vol->sector_size, + vol->sb->s_blocksize); + return false; + } + ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); + sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; + ntfs_debug("sectors_per_cluster_bits = 0x%x", + sectors_per_cluster_bits); + nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors); + ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects); + vol->cluster_size = vol->sector_size << sectors_per_cluster_bits; + vol->cluster_size_mask = vol->cluster_size - 1; + vol->cluster_size_bits = ffs(vol->cluster_size) - 1; + ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, + vol->cluster_size); + ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); + ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); + if (vol->cluster_size < vol->sector_size) { + ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->cluster_size, vol->sector_size); + return false; + } + clusters_per_mft_record = b->clusters_per_mft_record; + ntfs_debug("clusters_per_mft_record = %i (0x%x)", + clusters_per_mft_record, clusters_per_mft_record); + if (clusters_per_mft_record > 0) + vol->mft_record_size = vol->cluster_size << + (ffs(clusters_per_mft_record) - 1); + else + /* + * When mft_record_size < cluster_size, clusters_per_mft_record + * = -log2(mft_record_size) bytes. mft_record_size normaly is + * 1024 bytes, which is encoded as 0xF6 (-10 in decimal). + */ + vol->mft_record_size = 1 << -clusters_per_mft_record; + vol->mft_record_size_mask = vol->mft_record_size - 1; + vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1; + ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size, + vol->mft_record_size); + ntfs_debug("vol->mft_record_size_mask = 0x%x", + vol->mft_record_size_mask); + ntfs_debug("vol->mft_record_size_bits = %i (0x%x)", + vol->mft_record_size_bits, vol->mft_record_size_bits); + /* + * We cannot support mft record sizes above the PAGE_SIZE since + * we store $MFT/$DATA, the table of mft records in the page cache. + */ + if (vol->mft_record_size > PAGE_SIZE) { + ntfs_error(vol->sb, "Mft record size (%i) exceeds the " + "PAGE_SIZE on your system (%lu). " + "This is not supported. Sorry.", + vol->mft_record_size, PAGE_SIZE); + return false; + } + /* We cannot support mft record sizes below the sector size. */ + if (vol->mft_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->mft_record_size, + vol->sector_size); + return false; + } + clusters_per_index_record = b->clusters_per_index_record; + ntfs_debug("clusters_per_index_record = %i (0x%x)", + clusters_per_index_record, clusters_per_index_record); + if (clusters_per_index_record > 0) + vol->index_record_size = vol->cluster_size << + (ffs(clusters_per_index_record) - 1); + else + /* + * When index_record_size < cluster_size, + * clusters_per_index_record = -log2(index_record_size) bytes. + * index_record_size normaly equals 4096 bytes, which is + * encoded as 0xF4 (-12 in decimal). + */ + vol->index_record_size = 1 << -clusters_per_index_record; + vol->index_record_size_mask = vol->index_record_size - 1; + vol->index_record_size_bits = ffs(vol->index_record_size) - 1; + ntfs_debug("vol->index_record_size = %i (0x%x)", + vol->index_record_size, vol->index_record_size); + ntfs_debug("vol->index_record_size_mask = 0x%x", + vol->index_record_size_mask); + ntfs_debug("vol->index_record_size_bits = %i (0x%x)", + vol->index_record_size_bits, + vol->index_record_size_bits); + /* We cannot support index record sizes below the sector size. */ + if (vol->index_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Index record size (%i) is smaller than " + "the sector size (%i). This is not " + "supported. Sorry.", vol->index_record_size, + vol->sector_size); + return false; + } + /* + * Get the size of the volume in clusters and check for 64-bit-ness. + * Windows currently only uses 32 bits to save the clusters so we do + * the same as it is much faster on 32-bit CPUs. + */ + ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits; + if ((u64)ll >= 1ULL << 32) { + ntfs_error(vol->sb, "Cannot handle 64-bit clusters. Sorry."); + return false; + } + vol->nr_clusters = ll; + ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters); + /* + * On an architecture where unsigned long is 32-bits, we restrict the + * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler + * will hopefully optimize the whole check away. + */ + if (sizeof(unsigned long) < 8) { + if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) { + ntfs_error(vol->sb, "Volume size (%lluTiB) is too " + "large for this architecture. " + "Maximum supported is 2TiB. Sorry.", + (unsigned long long)ll >> (40 - + vol->cluster_size_bits)); + return false; + } + } + ll = sle64_to_cpu(b->mft_lcn); + if (ll >= vol->nr_clusters) { + ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " + "volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); + return false; + } + vol->mft_lcn = ll; + ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); + ll = sle64_to_cpu(b->mftmirr_lcn); + if (ll >= vol->nr_clusters) { + ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " + "of volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); + return false; + } + vol->mftmirr_lcn = ll; + ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn); +#ifdef NTFS_RW + /* + * Work out the size of the mft mirror in number of mft records. If the + * cluster size is less than or equal to the size taken by four mft + * records, the mft mirror stores the first four mft records. If the + * cluster size is bigger than the size taken by four mft records, the + * mft mirror contains as many mft records as will fit into one + * cluster. + */ + if (vol->cluster_size <= (4 << vol->mft_record_size_bits)) + vol->mftmirr_size = 4; + else + vol->mftmirr_size = vol->cluster_size >> + vol->mft_record_size_bits; + ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size); +#endif /* NTFS_RW */ + vol->serial_no = le64_to_cpu(b->volume_serial_number); + ntfs_debug("vol->serial_no = 0x%llx", + (unsigned long long)vol->serial_no); + return true; +} + +/** + * ntfs_setup_allocators - initialize the cluster and mft allocators + * @vol: volume structure for which to setup the allocators + * + * Setup the cluster (lcn) and mft allocators to the starting values. + */ +static void ntfs_setup_allocators(ntfs_volume *vol) +{ +#ifdef NTFS_RW + LCN mft_zone_size, mft_lcn; +#endif /* NTFS_RW */ + + ntfs_debug("vol->mft_zone_multiplier = 0x%x", + vol->mft_zone_multiplier); +#ifdef NTFS_RW + /* Determine the size of the MFT zone. */ + mft_zone_size = vol->nr_clusters; + switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */ + case 4: + mft_zone_size >>= 1; /* 50% */ + break; + case 3: + mft_zone_size = (mft_zone_size + + (mft_zone_size >> 1)) >> 2; /* 37.5% */ + break; + case 2: + mft_zone_size >>= 2; /* 25% */ + break; + /* case 1: */ + default: + mft_zone_size >>= 3; /* 12.5% */ + break; + } + /* Setup the mft zone. */ + vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn; + ntfs_debug("vol->mft_zone_pos = 0x%llx", + (unsigned long long)vol->mft_zone_pos); + /* + * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs + * source) and if the actual mft_lcn is in the expected place or even + * further to the front of the volume, extend the mft_zone to cover the + * beginning of the volume as well. This is in order to protect the + * area reserved for the mft bitmap as well within the mft_zone itself. + * On non-standard volumes we do not protect it as the overhead would + * be higher than the speed increase we would get by doing it. + */ + mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size; + if (mft_lcn * vol->cluster_size < 16 * 1024) + mft_lcn = (16 * 1024 + vol->cluster_size - 1) / + vol->cluster_size; + if (vol->mft_zone_start <= mft_lcn) + vol->mft_zone_start = 0; + ntfs_debug("vol->mft_zone_start = 0x%llx", + (unsigned long long)vol->mft_zone_start); + /* + * Need to cap the mft zone on non-standard volumes so that it does + * not point outside the boundaries of the volume. We do this by + * halving the zone size until we are inside the volume. + */ + vol->mft_zone_end = vol->mft_lcn + mft_zone_size; + while (vol->mft_zone_end >= vol->nr_clusters) { + mft_zone_size >>= 1; + vol->mft_zone_end = vol->mft_lcn + mft_zone_size; + } + ntfs_debug("vol->mft_zone_end = 0x%llx", + (unsigned long long)vol->mft_zone_end); + /* + * Set the current position within each data zone to the start of the + * respective zone. + */ + vol->data1_zone_pos = vol->mft_zone_end; + ntfs_debug("vol->data1_zone_pos = 0x%llx", + (unsigned long long)vol->data1_zone_pos); + vol->data2_zone_pos = 0; + ntfs_debug("vol->data2_zone_pos = 0x%llx", + (unsigned long long)vol->data2_zone_pos); + + /* Set the mft data allocation position to mft record 24. */ + vol->mft_data_pos = 24; + ntfs_debug("vol->mft_data_pos = 0x%llx", + (unsigned long long)vol->mft_data_pos); +#endif /* NTFS_RW */ +} + +#ifdef NTFS_RW + +/** + * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume + * @vol: ntfs super block describing device whose mft mirror to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_mft_mirror(ntfs_volume *vol) +{ + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + + ntfs_debug("Entering."); + /* Get mft mirror inode. */ + tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + /* Caller will display error message. */ + return false; + } + /* + * Re-initialize some specifics about $MFTMirr's inode as + * ntfs_read_inode() will have set up the default ones. + */ + /* Set uid and gid to root. */ + tmp_ino->i_uid = GLOBAL_ROOT_UID; + tmp_ino->i_gid = GLOBAL_ROOT_GID; + /* Regular file. No access for anyone. */ + tmp_ino->i_mode = S_IFREG; + /* No VFS initiated operations allowed for $MFTMirr. */ + tmp_ino->i_op = &ntfs_empty_inode_ops; + tmp_ino->i_fop = &ntfs_empty_file_ops; + /* Put in our special address space operations. */ + tmp_ino->i_mapping->a_ops = &ntfs_mst_aops; + tmp_ni = NTFS_I(tmp_ino); + /* The $MFTMirr, like the $MFT is multi sector transfer protected. */ + NInoSetMstProtected(tmp_ni); + NInoSetSparseDisabled(tmp_ni); + /* + * Set up our little cheat allowing us to reuse the async read io + * completion handler for directories. + */ + tmp_ni->itype.index.block_size = vol->mft_record_size; + tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits; + vol->mftmirr_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +/** + * check_mft_mirror - compare contents of the mft mirror with the mft + * @vol: ntfs super block describing device whose mft mirror to check + * + * Return 'true' on success or 'false' on error. + * + * Note, this function also results in the mft mirror runlist being completely + * mapped into memory. The mft mirror write code requires this and will BUG() + * should it find an unmapped runlist element. + */ +static bool check_mft_mirror(ntfs_volume *vol) +{ + struct super_block *sb = vol->sb; + ntfs_inode *mirr_ni; + struct page *mft_page, *mirr_page; + u8 *kmft, *kmirr; + runlist_element *rl, rl2[2]; + pgoff_t index; + int mrecs_per_page, i; + + ntfs_debug("Entering."); + /* Compare contents of $MFT and $MFTMirr. */ + mrecs_per_page = PAGE_SIZE / vol->mft_record_size; + BUG_ON(!mrecs_per_page); + BUG_ON(!vol->mftmirr_size); + mft_page = mirr_page = NULL; + kmft = kmirr = NULL; + index = i = 0; + do { + u32 bytes; + + /* Switch pages if necessary. */ + if (!(i % mrecs_per_page)) { + if (index) { + ntfs_unmap_page(mft_page); + ntfs_unmap_page(mirr_page); + } + /* Get the $MFT page. */ + mft_page = ntfs_map_page(vol->mft_ino->i_mapping, + index); + if (IS_ERR(mft_page)) { + ntfs_error(sb, "Failed to read $MFT."); + return false; + } + kmft = page_address(mft_page); + /* Get the $MFTMirr page. */ + mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping, + index); + if (IS_ERR(mirr_page)) { + ntfs_error(sb, "Failed to read $MFTMirr."); + goto mft_unmap_out; + } + kmirr = page_address(mirr_page); + ++index; + } + /* Do not check the record if it is not in use. */ + if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) { + /* Make sure the record is ok. */ + if (ntfs_is_baad_recordp((le32*)kmft)) { + ntfs_error(sb, "Incomplete multi sector " + "transfer detected in mft " + "record %i.", i); +mm_unmap_out: + ntfs_unmap_page(mirr_page); +mft_unmap_out: + ntfs_unmap_page(mft_page); + return false; + } + } + /* Do not check the mirror record if it is not in use. */ + if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) { + if (ntfs_is_baad_recordp((le32*)kmirr)) { + ntfs_error(sb, "Incomplete multi sector " + "transfer detected in mft " + "mirror record %i.", i); + goto mm_unmap_out; + } + } + /* Get the amount of data in the current record. */ + bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use); + if (bytes < sizeof(MFT_RECORD_OLD) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((le32*)kmft)) { + bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use); + if (bytes < sizeof(MFT_RECORD_OLD) || + bytes > vol->mft_record_size || + ntfs_is_baad_recordp((le32*)kmirr)) + bytes = vol->mft_record_size; + } + /* Compare the two records. */ + if (memcmp(kmft, kmirr, bytes)) { + ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not " + "match. Run ntfsfix or chkdsk.", i); + goto mm_unmap_out; + } + kmft += vol->mft_record_size; + kmirr += vol->mft_record_size; + } while (++i < vol->mftmirr_size); + /* Release the last pages. */ + ntfs_unmap_page(mft_page); + ntfs_unmap_page(mirr_page); + + /* Construct the mft mirror runlist by hand. */ + rl2[0].vcn = 0; + rl2[0].lcn = vol->mftmirr_lcn; + rl2[0].length = (vol->mftmirr_size * vol->mft_record_size + + vol->cluster_size - 1) / vol->cluster_size; + rl2[1].vcn = rl2[0].length; + rl2[1].lcn = LCN_ENOENT; + rl2[1].length = 0; + /* + * Because we have just read all of the mft mirror, we know we have + * mapped the full runlist for it. + */ + mirr_ni = NTFS_I(vol->mftmirr_ino); + down_read(&mirr_ni->runlist.lock); + rl = mirr_ni->runlist.rl; + /* Compare the two runlists. They must be identical. */ + i = 0; + do { + if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn || + rl2[i].length != rl[i].length) { + ntfs_error(sb, "$MFTMirr location mismatch. " + "Run chkdsk."); + up_read(&mirr_ni->runlist.lock); + return false; + } + } while (rl2[i++].length); + up_read(&mirr_ni->runlist.lock); + ntfs_debug("Done."); + return true; +} + +/** + * load_and_check_logfile - load and check the logfile inode for a volume + * @vol: ntfs super block describing device whose logfile to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_check_logfile(ntfs_volume *vol, + RESTART_PAGE_HEADER **rp) +{ + struct inode *tmp_ino; + + ntfs_debug("Entering."); + tmp_ino = ntfs_iget(vol->sb, FILE_LogFile); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + /* Caller will display error message. */ + return false; + } + if (!ntfs_check_logfile(tmp_ino, rp)) { + iput(tmp_ino); + /* ntfs_check_logfile() will have displayed error output. */ + return false; + } + NInoSetSparseDisabled(NTFS_I(tmp_ino)); + vol->logfile_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +#define NTFS_HIBERFIL_HEADER_SIZE 4096 + +/** + * check_windows_hibernation_status - check if Windows is suspended on a volume + * @vol: ntfs super block of device to check + * + * Check if Windows is hibernated on the ntfs volume @vol. This is done by + * looking for the file hiberfil.sys in the root directory of the volume. If + * the file is not present Windows is definitely not suspended. + * + * If hiberfil.sys exists and is less than 4kiB in size it means Windows is + * definitely suspended (this volume is not the system volume). Caveat: on a + * system with many volumes it is possible that the < 4kiB check is bogus but + * for now this should do fine. + * + * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the + * hiberfil header (which is the first 4kiB). If this begins with "hibr", + * Windows is definitely suspended. If it is completely full of zeroes, + * Windows is definitely not hibernated. Any other case is treated as if + * Windows is suspended. This caters for the above mentioned caveat of a + * system with many volumes where no "hibr" magic would be present and there is + * no zero header. + * + * Return 0 if Windows is not hibernated on the volume, >0 if Windows is + * hibernated on the volume, and -errno on error. + */ +static int check_windows_hibernation_status(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *vi; + struct page *page; + u32 *kaddr, *kend; + ntfs_name *name = NULL; + int ret = 1; + static const ntfschar hiberfil[13] = { cpu_to_le16('h'), + cpu_to_le16('i'), cpu_to_le16('b'), + cpu_to_le16('e'), cpu_to_le16('r'), + cpu_to_le16('f'), cpu_to_le16('i'), + cpu_to_le16('l'), cpu_to_le16('.'), + cpu_to_le16('s'), cpu_to_le16('y'), + cpu_to_le16('s'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the hibernation file by looking up the + * filename hiberfil.sys in the root directory. + */ + inode_lock(vol->root_ino); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, + &name); + inode_unlock(vol->root_ino); + if (IS_ERR_MREF(mref)) { + ret = MREF_ERR(mref); + /* If the file does not exist, Windows is not hibernated. */ + if (ret == -ENOENT) { + ntfs_debug("hiberfil.sys not present. Windows is not " + "hibernated on the volume."); + return 0; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "hiberfil.sys."); + return ret; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + vi = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(vi) || is_bad_inode(vi)) { + if (!IS_ERR(vi)) + iput(vi); + ntfs_error(vol->sb, "Failed to load hiberfil.sys."); + return IS_ERR(vi) ? PTR_ERR(vi) : -EIO; + } + if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) { + ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). " + "Windows is hibernated on the volume. This " + "is not the system volume.", i_size_read(vi)); + goto iput_out; + } + page = ntfs_map_page(vi->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from hiberfil.sys."); + ret = PTR_ERR(page); + goto iput_out; + } + kaddr = (u32*)page_address(page); + if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) { + ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is " + "hibernated on the volume. This is the " + "system volume."); + goto unm_iput_out; + } + kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr); + do { + if (unlikely(*kaddr)) { + ntfs_debug("hiberfil.sys is larger than 4kiB " + "(0x%llx), does not contain the " + "\"hibr\" magic, and does not have a " + "zero header. Windows is hibernated " + "on the volume. This is not the " + "system volume.", i_size_read(vi)); + goto unm_iput_out; + } + } while (++kaddr < kend); + ntfs_debug("hiberfil.sys contains a zero header. Windows is not " + "hibernated on the volume. This is the system " + "volume."); + ret = 0; +unm_iput_out: + ntfs_unmap_page(page); +iput_out: + iput(vi); + return ret; +} + +/** + * load_and_init_quota - load and setup the quota file for a volume if present + * @vol: ntfs super block describing device whose quota file to load + * + * Return 'true' on success or 'false' on error. If $Quota is not present, we + * leave vol->quota_ino as NULL and return success. + */ +static bool load_and_init_quota(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_name *name = NULL; + static const ntfschar Quota[7] = { cpu_to_le16('$'), + cpu_to_le16('Q'), cpu_to_le16('u'), + cpu_to_le16('o'), cpu_to_le16('t'), + cpu_to_le16('a'), 0 }; + static ntfschar Q[3] = { cpu_to_le16('$'), + cpu_to_le16('Q'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the quota file by looking up the filename + * $Quota in the extended system files directory $Extend. + */ + inode_lock(vol->extend_ino); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, + &name); + inode_unlock(vol->extend_ino); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, quotas are disabled and have + * never been enabled on this volume, just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$Quota not present. Volume does not have " + "quotas enabled."); + /* + * No need to try to set quotas out of date if they are + * not enabled. + */ + NVolSetQuotaOutOfDate(vol); + return true; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for $Quota."); + return false; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $Quota."); + return false; + } + vol->quota_ino = tmp_ino; + /* Get the $Q index allocation attribute. */ + tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $Quota/$Q index."); + return false; + } + vol->quota_q_ino = tmp_ino; + ntfs_debug("Done."); + return true; +} + +/** + * load_and_init_usnjrnl - load and setup the transaction log if present + * @vol: ntfs super block describing device whose usnjrnl file to load + * + * Return 'true' on success or 'false' on error. + * + * If $UsnJrnl is not present or in the process of being disabled, we set + * NVolUsnJrnlStamped() and return success. + * + * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn, + * i.e. transaction logging has only just been enabled or the journal has been + * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped() + * and return success. + */ +static bool load_and_init_usnjrnl(ntfs_volume *vol) +{ + MFT_REF mref; + struct inode *tmp_ino; + ntfs_inode *tmp_ni; + struct page *page; + ntfs_name *name = NULL; + USN_HEADER *uh; + static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'), + cpu_to_le16('U'), cpu_to_le16('s'), + cpu_to_le16('n'), cpu_to_le16('J'), + cpu_to_le16('r'), cpu_to_le16('n'), + cpu_to_le16('l'), 0 }; + static ntfschar Max[5] = { cpu_to_le16('$'), + cpu_to_le16('M'), cpu_to_le16('a'), + cpu_to_le16('x'), 0 }; + static ntfschar J[3] = { cpu_to_le16('$'), + cpu_to_le16('J'), 0 }; + + ntfs_debug("Entering."); + /* + * Find the inode number for the transaction log file by looking up the + * filename $UsnJrnl in the extended system files directory $Extend. + */ + inode_lock(vol->extend_ino); + mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, + &name); + inode_unlock(vol->extend_ino); + if (IS_ERR_MREF(mref)) { + /* + * If the file does not exist, transaction logging is disabled, + * just return success. + */ + if (MREF_ERR(mref) == -ENOENT) { + ntfs_debug("$UsnJrnl not present. Volume does not " + "have transaction logging enabled."); +not_enabled: + /* + * No need to try to stamp the transaction log if + * transaction logging is not enabled. + */ + NVolSetUsnJrnlStamped(vol); + return true; + } + /* A real error occurred. */ + ntfs_error(vol->sb, "Failed to find inode number for " + "$UsnJrnl."); + return false; + } + /* We do not care for the type of match that was found. */ + kfree(name); + /* Get the inode. */ + tmp_ino = ntfs_iget(vol->sb, MREF(mref)); + if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { + if (!IS_ERR(tmp_ino)) + iput(tmp_ino); + ntfs_error(vol->sb, "Failed to load $UsnJrnl."); + return false; + } + vol->usnjrnl_ino = tmp_ino; + /* + * If the transaction log is in the process of being deleted, we can + * ignore it. + */ + if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) { + ntfs_debug("$UsnJrnl in the process of being disabled. " + "Volume does not have transaction logging " + "enabled."); + goto not_enabled; + } + /* Get the $DATA/$Max attribute. */ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max " + "attribute."); + return false; + } + vol->usnjrnl_max_ino = tmp_ino; + if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) { + ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max " + "attribute (size is 0x%llx but should be at " + "least 0x%zx bytes).", i_size_read(tmp_ino), + sizeof(USN_HEADER)); + return false; + } + /* Get the $DATA/$J attribute. */ + tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2); + if (IS_ERR(tmp_ino)) { + ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J " + "attribute."); + return false; + } + vol->usnjrnl_j_ino = tmp_ino; + /* Verify $J is non-resident and sparse. */ + tmp_ni = NTFS_I(vol->usnjrnl_j_ino); + if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) { + ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident " + "and/or not sparse."); + return false; + } + /* Read the USN_HEADER from $DATA/$Max. */ + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max " + "attribute."); + return false; + } + uh = (USN_HEADER*)page_address(page); + /* Sanity check the $Max. */ + if (unlikely(sle64_to_cpu(uh->allocation_delta) > + sle64_to_cpu(uh->maximum_size))) { + ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds " + "maximum size (0x%llx). $UsnJrnl is corrupt.", + (long long)sle64_to_cpu(uh->allocation_delta), + (long long)sle64_to_cpu(uh->maximum_size)); + ntfs_unmap_page(page); + return false; + } + /* + * If the transaction log has been stamped and nothing has been written + * to it since, we do not need to stamp it. + */ + if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >= + i_size_read(vol->usnjrnl_j_ino))) { + if (likely(sle64_to_cpu(uh->lowest_valid_usn) == + i_size_read(vol->usnjrnl_j_ino))) { + ntfs_unmap_page(page); + ntfs_debug("$UsnJrnl is enabled but nothing has been " + "logged since it was last stamped. " + "Treating this as if the volume does " + "not have transaction logging " + "enabled."); + goto not_enabled; + } + ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) " + "which is out of bounds (0x%llx). $UsnJrnl " + "is corrupt.", + (long long)sle64_to_cpu(uh->lowest_valid_usn), + i_size_read(vol->usnjrnl_j_ino)); + ntfs_unmap_page(page); + return false; + } + ntfs_unmap_page(page); + ntfs_debug("Done."); + return true; +} + +/** + * load_and_init_attrdef - load the attribute definitions table for a volume + * @vol: ntfs super block describing device whose attrdef to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_attrdef(ntfs_volume *vol) +{ + loff_t i_size; + struct super_block *sb = vol->sb; + struct inode *ino; + struct page *page; + pgoff_t index, max_index; + unsigned int size; + + ntfs_debug("Entering."); + /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */ + ino = ntfs_iget(sb, FILE_AttrDef); + if (IS_ERR(ino) || is_bad_inode(ino)) { + if (!IS_ERR(ino)) + iput(ino); + goto failed; + } + NInoSetSparseDisabled(NTFS_I(ino)); + /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */ + i_size = i_size_read(ino); + if (i_size <= 0 || i_size > 0x7fffffff) + goto iput_failed; + vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size); + if (!vol->attrdef) + goto iput_failed; + index = 0; + max_index = i_size >> PAGE_SHIFT; + size = PAGE_SIZE; + while (index < max_index) { + /* Read the attrdef table and copy it into the linear buffer. */ +read_partial_attrdef_page: + page = ntfs_map_page(ino->i_mapping, index); + if (IS_ERR(page)) + goto free_iput_failed; + memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT), + page_address(page), size); + ntfs_unmap_page(page); + }; + if (size == PAGE_SIZE) { + size = i_size & ~PAGE_MASK; + if (size) + goto read_partial_attrdef_page; + } + vol->attrdef_size = i_size; + ntfs_debug("Read %llu bytes from $AttrDef.", i_size); + iput(ino); + return true; +free_iput_failed: + ntfs_free(vol->attrdef); + vol->attrdef = NULL; +iput_failed: + iput(ino); +failed: + ntfs_error(sb, "Failed to initialize attribute definition table."); + return false; +} + +#endif /* NTFS_RW */ + +/** + * load_and_init_upcase - load the upcase table for an ntfs volume + * @vol: ntfs super block describing device whose upcase to load + * + * Return 'true' on success or 'false' on error. + */ +static bool load_and_init_upcase(ntfs_volume *vol) +{ + loff_t i_size; + struct super_block *sb = vol->sb; + struct inode *ino; + struct page *page; + pgoff_t index, max_index; + unsigned int size; + int i, max; + + ntfs_debug("Entering."); + /* Read upcase table and setup vol->upcase and vol->upcase_len. */ + ino = ntfs_iget(sb, FILE_UpCase); + if (IS_ERR(ino) || is_bad_inode(ino)) { + if (!IS_ERR(ino)) + iput(ino); + goto upcase_failed; + } + /* + * The upcase size must not be above 64k Unicode characters, must not + * be zero and must be a multiple of sizeof(ntfschar). + */ + i_size = i_size_read(ino); + if (!i_size || i_size & (sizeof(ntfschar) - 1) || + i_size > 64ULL * 1024 * sizeof(ntfschar)) + goto iput_upcase_failed; + vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size); + if (!vol->upcase) + goto iput_upcase_failed; + index = 0; + max_index = i_size >> PAGE_SHIFT; + size = PAGE_SIZE; + while (index < max_index) { + /* Read the upcase table and copy it into the linear buffer. */ +read_partial_upcase_page: + page = ntfs_map_page(ino->i_mapping, index); + if (IS_ERR(page)) + goto iput_upcase_failed; + memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT), + page_address(page), size); + ntfs_unmap_page(page); + }; + if (size == PAGE_SIZE) { + size = i_size & ~PAGE_MASK; + if (size) + goto read_partial_upcase_page; + } + vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS; + ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).", + i_size, 64 * 1024 * sizeof(ntfschar)); + iput(ino); + mutex_lock(&ntfs_lock); + if (!default_upcase) { + ntfs_debug("Using volume specified $UpCase since default is " + "not present."); + mutex_unlock(&ntfs_lock); + return true; + } + max = default_upcase_len; + if (max > vol->upcase_len) + max = vol->upcase_len; + for (i = 0; i < max; i++) + if (vol->upcase[i] != default_upcase[i]) + break; + if (i == max) { + ntfs_free(vol->upcase); + vol->upcase = default_upcase; + vol->upcase_len = max; + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + ntfs_debug("Volume specified $UpCase matches default. Using " + "default."); + return true; + } + mutex_unlock(&ntfs_lock); + ntfs_debug("Using volume specified $UpCase since it does not match " + "the default."); + return true; +iput_upcase_failed: + iput(ino); + ntfs_free(vol->upcase); + vol->upcase = NULL; +upcase_failed: + mutex_lock(&ntfs_lock); + if (default_upcase) { + vol->upcase = default_upcase; + vol->upcase_len = default_upcase_len; + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + ntfs_error(sb, "Failed to load $UpCase from the volume. Using " + "default."); + return true; + } + mutex_unlock(&ntfs_lock); + ntfs_error(sb, "Failed to initialize upcase table."); + return false; +} + +/* + * The lcn and mft bitmap inodes are NTFS-internal inodes with + * their own special locking rules: + */ +static struct lock_class_key + lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key, + mftbmp_runlist_lock_key, mftbmp_mrec_lock_key; + +/** + * load_system_files - open the system files using normal functions + * @vol: ntfs super block describing device whose system files to load + * + * Open the system files with normal access functions and complete setting up + * the ntfs super block @vol. + * + * Return 'true' on success or 'false' on error. + */ +static bool load_system_files(ntfs_volume *vol) +{ + struct super_block *sb = vol->sb; + MFT_RECORD *m; + VOLUME_INFORMATION *vi; + ntfs_attr_search_ctx *ctx; +#ifdef NTFS_RW + RESTART_PAGE_HEADER *rp; + int err; +#endif /* NTFS_RW */ + + ntfs_debug("Entering."); +#ifdef NTFS_RW + /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */ + if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) { + static const char *es1 = "Failed to load $MFTMirr"; + static const char *es2 = "$MFTMirr does not match $MFT"; + static const char *es3 = ". Run ntfsfix and/or chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + !vol->mftmirr_ino ? es1 : es2, + es3); + goto iput_mirr_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", + !vol->mftmirr_ino ? es1 : es2, es3); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", + !vol->mftmirr_ino ? es1 : es2, es3); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + /* Get mft bitmap attribute inode. */ + vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0); + if (IS_ERR(vol->mftbmp_ino)) { + ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute."); + goto iput_mirr_err_out; + } + lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock, + &mftbmp_runlist_lock_key); + lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock, + &mftbmp_mrec_lock_key); + /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */ + if (!load_and_init_upcase(vol)) + goto iput_mftbmp_err_out; +#ifdef NTFS_RW + /* + * Read attribute definitions table and setup @vol->attrdef and + * @vol->attrdef_size. + */ + if (!load_and_init_attrdef(vol)) + goto iput_upcase_err_out; +#endif /* NTFS_RW */ + /* + * Get the cluster allocation bitmap inode and verify the size, no + * need for any locking at this stage as we are already running + * exclusively as we are mount in progress task. + */ + vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap); + if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) { + if (!IS_ERR(vol->lcnbmp_ino)) + iput(vol->lcnbmp_ino); + goto bitmap_failed; + } + lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock, + &lcnbmp_runlist_lock_key); + lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock, + &lcnbmp_mrec_lock_key); + + NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino)); + if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) { + iput(vol->lcnbmp_ino); +bitmap_failed: + ntfs_error(sb, "Failed to load $Bitmap."); + goto iput_attrdef_err_out; + } + /* + * Get the volume inode and setup our cache of the volume flags and + * version. + */ + vol->vol_ino = ntfs_iget(sb, FILE_Volume); + if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) { + if (!IS_ERR(vol->vol_ino)) + iput(vol->vol_ino); +volume_failed: + ntfs_error(sb, "Failed to load $Volume."); + goto iput_lcnbmp_err_out; + } + m = map_mft_record(NTFS_I(vol->vol_ino)); + if (IS_ERR(m)) { +iput_volume_failed: + iput(vol->vol_ino); + goto volume_failed; + } + if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) { + ntfs_error(sb, "Failed to get attribute search context."); + goto get_ctx_vol_failed; + } + if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0, + ctx) || ctx->attr->non_resident || ctx->attr->flags) { +err_put_vol: + ntfs_attr_put_search_ctx(ctx); +get_ctx_vol_failed: + unmap_mft_record(NTFS_I(vol->vol_ino)); + goto iput_volume_failed; + } + vi = (VOLUME_INFORMATION*)((char*)ctx->attr + + le16_to_cpu(ctx->attr->data.resident.value_offset)); + /* Some bounds checks. */ + if ((u8*)vi < (u8*)ctx->attr || (u8*)vi + + le32_to_cpu(ctx->attr->data.resident.value_length) > + (u8*)ctx->attr + le32_to_cpu(ctx->attr->length)) + goto err_put_vol; + /* Copy the volume flags and version to the ntfs_volume structure. */ + vol->vol_flags = vi->flags; + vol->major_ver = vi->major_ver; + vol->minor_ver = vi->minor_ver; + ntfs_attr_put_search_ctx(ctx); + unmap_mft_record(NTFS_I(vol->vol_ino)); + pr_info("volume version %i.%i.\n", vol->major_ver, + vol->minor_ver); + if (vol->major_ver < 3 && NVolSparseEnabled(vol)) { + ntfs_warning(vol->sb, "Disabling sparse support due to NTFS " + "volume version %i.%i (need at least version " + "3.0).", vol->major_ver, vol->minor_ver); + NVolClearSparseEnabled(vol); + } +#ifdef NTFS_RW + /* Make sure that no unsupported volume flags are set. */ + if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { + static const char *es1a = "Volume is dirty"; + static const char *es1b = "Volume has been modified by chkdsk"; + static const char *es1c = "Volume has unsupported flags set"; + static const char *es2a = ". Run chkdsk and mount in Windows."; + static const char *es2b = ". Mount in Windows."; + const char *es1, *es2; + + es2 = es2a; + if (vol->vol_flags & VOLUME_IS_DIRTY) + es1 = es1a; + else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + es1 = es1b; + es2 = es2b; + } else { + es1 = es1c; + ntfs_warning(sb, "Unsupported volume flags 0x%x " + "encountered.", + (unsigned)le16_to_cpu(vol->vol_flags)); + } + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_vol_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* + * Do not set NVolErrors() because ntfs_remount() re-checks the + * flags which we need to do in case any flags have changed. + */ + } + /* + * Get the inode for the logfile, check it and determine if the volume + * was shutdown cleanly. + */ + rp = NULL; + if (!load_and_check_logfile(vol, &rp) || + !ntfs_is_logfile_clean(vol->logfile_ino, rp)) { + static const char *es1a = "Failed to load $LogFile"; + static const char *es1b = "$LogFile is not clean"; + static const char *es2 = ". Mount in Windows."; + const char *es1; + + es1 = !vol->logfile_ino ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + if (vol->logfile_ino) { + BUG_ON(!rp); + ntfs_free(rp); + } + goto iput_logfile_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + ntfs_free(rp); +#endif /* NTFS_RW */ + /* Get the root directory inode so we can do path lookups. */ + vol->root_ino = ntfs_iget(sb, FILE_root); + if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) { + if (!IS_ERR(vol->root_ino)) + iput(vol->root_ino); + ntfs_error(sb, "Failed to load root directory."); + goto iput_logfile_err_out; + } +#ifdef NTFS_RW + /* + * Check if Windows is suspended to disk on the target volume. If it + * is hibernated, we must not write *anything* to the disk so set + * NVolErrors() without setting the dirty volume flag and mount + * read-only. This will prevent read-write remounting and it will also + * prevent all writes. + */ + err = check_windows_hibernation_status(vol); + if (unlikely(err)) { + static const char *es1a = "Failed to determine if Windows is " + "hibernated"; + static const char *es1b = "Windows is hibernated"; + static const char *es2 = ". Run chkdsk."; + const char *es1; + + es1 = err < 0 ? es1a : es1b; + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, mark the volume dirty. */ + if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { + static const char *es1 = "Failed to set dirty bit in volume " + "information flags"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= SB_RDONLY; + /* + * Do not set NVolErrors() because ntfs_remount() might manage + * to set the dirty flag in which case all would be well. + */ + } +#if 0 + // TODO: Enable this code once we start modifying anything that is + // different between NTFS 1.2 and 3.x... + /* + * If (still) a read-write mount, set the NT4 compatibility flag on + * newer NTFS version volumes. + */ + if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) && + ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { + static const char *es1 = "Failed to set NT4 compatibility flag"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= SB_RDONLY; + NVolSetErrors(vol); + } +#endif + /* If (still) a read-write mount, empty the logfile. */ + if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) { + static const char *es1 = "Failed to empty $LogFile"; + static const char *es2 = ". Mount in Windows."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_root_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= SB_RDONLY; + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + /* If on NTFS versions before 3.0, we are done. */ + if (unlikely(vol->major_ver < 3)) + return true; + /* NTFS 3.0+ specific initialization. */ + /* Get the security descriptors inode. */ + vol->secure_ino = ntfs_iget(sb, FILE_Secure); + if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) { + if (!IS_ERR(vol->secure_ino)) + iput(vol->secure_ino); + ntfs_error(sb, "Failed to load $Secure."); + goto iput_root_err_out; + } + // TODO: Initialize security. + /* Get the extended system files' directory inode. */ + vol->extend_ino = ntfs_iget(sb, FILE_Extend); + if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) || + !S_ISDIR(vol->extend_ino->i_mode)) { + if (!IS_ERR(vol->extend_ino)) + iput(vol->extend_ino); + ntfs_error(sb, "Failed to load $Extend."); + goto iput_sec_err_out; + } +#ifdef NTFS_RW + /* Find the quota file, load it if present, and set it up. */ + if (!load_and_init_quota(vol)) { + static const char *es1 = "Failed to load $Quota"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_quota_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, mark the quotas out of date. */ + if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) { + static const char *es1 = "Failed to mark quotas out of date"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_quota_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= SB_RDONLY; + NVolSetErrors(vol); + } + /* + * Find the transaction log file ($UsnJrnl), load it if present, check + * it, and set it up. + */ + if (!load_and_init_usnjrnl(vol)) { + static const char *es1 = "Failed to load $UsnJrnl"; + static const char *es2 = ". Run chkdsk."; + + /* If a read-write mount, convert it to a read-only mount. */ + if (!sb_rdonly(sb)) { + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=" + "continue nor on_errors=" + "remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + sb->s_flags |= SB_RDONLY; + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + } else + ntfs_warning(sb, "%s. Will not be able to remount " + "read-write%s", es1, es2); + /* This will prevent a read-write remount. */ + NVolSetErrors(vol); + } + /* If (still) a read-write mount, stamp the transaction log. */ + if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) { + static const char *es1 = "Failed to stamp transaction log " + "($UsnJrnl)"; + static const char *es2 = ". Run chkdsk."; + + /* Convert to a read-only mount. */ + if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | + ON_ERRORS_CONTINUE))) { + ntfs_error(sb, "%s and neither on_errors=continue nor " + "on_errors=remount-ro was specified%s", + es1, es2); + goto iput_usnjrnl_err_out; + } + ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); + sb->s_flags |= SB_RDONLY; + NVolSetErrors(vol); + } +#endif /* NTFS_RW */ + return true; +#ifdef NTFS_RW +iput_usnjrnl_err_out: + iput(vol->usnjrnl_j_ino); + iput(vol->usnjrnl_max_ino); + iput(vol->usnjrnl_ino); +iput_quota_err_out: + iput(vol->quota_q_ino); + iput(vol->quota_ino); + iput(vol->extend_ino); +#endif /* NTFS_RW */ +iput_sec_err_out: + iput(vol->secure_ino); +iput_root_err_out: + iput(vol->root_ino); +iput_logfile_err_out: +#ifdef NTFS_RW + iput(vol->logfile_ino); +iput_vol_err_out: +#endif /* NTFS_RW */ + iput(vol->vol_ino); +iput_lcnbmp_err_out: + iput(vol->lcnbmp_ino); +iput_attrdef_err_out: + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } +#ifdef NTFS_RW +iput_upcase_err_out: +#endif /* NTFS_RW */ + vol->upcase_len = 0; + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } +iput_mftbmp_err_out: + iput(vol->mftbmp_ino); +iput_mirr_err_out: +#ifdef NTFS_RW + iput(vol->mftmirr_ino); +#endif /* NTFS_RW */ + return false; +} + +/** + * ntfs_put_super - called by the vfs to unmount a volume + * @sb: vfs superblock of volume to unmount + * + * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when + * the volume is being unmounted (umount system call has been invoked) and it + * releases all inodes and memory belonging to the NTFS specific part of the + * super block. + */ +static void ntfs_put_super(struct super_block *sb) +{ + ntfs_volume *vol = NTFS_SB(sb); + + ntfs_debug("Entering."); + +#ifdef NTFS_RW + /* + * Commit all inodes while they are still open in case some of them + * cause others to be dirtied. + */ + ntfs_commit_inode(vol->vol_ino); + + /* NTFS 3.0+ specific. */ + if (vol->major_ver >= 3) { + if (vol->usnjrnl_j_ino) + ntfs_commit_inode(vol->usnjrnl_j_ino); + if (vol->usnjrnl_max_ino) + ntfs_commit_inode(vol->usnjrnl_max_ino); + if (vol->usnjrnl_ino) + ntfs_commit_inode(vol->usnjrnl_ino); + if (vol->quota_q_ino) + ntfs_commit_inode(vol->quota_q_ino); + if (vol->quota_ino) + ntfs_commit_inode(vol->quota_ino); + if (vol->extend_ino) + ntfs_commit_inode(vol->extend_ino); + if (vol->secure_ino) + ntfs_commit_inode(vol->secure_ino); + } + + ntfs_commit_inode(vol->root_ino); + + down_write(&vol->lcnbmp_lock); + ntfs_commit_inode(vol->lcnbmp_ino); + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + ntfs_commit_inode(vol->mftbmp_ino); + up_write(&vol->mftbmp_lock); + + if (vol->logfile_ino) + ntfs_commit_inode(vol->logfile_ino); + + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + + /* + * If a read-write mount and no volume errors have occurred, mark the + * volume clean. Also, re-commit all affected inodes. + */ + if (!sb_rdonly(sb)) { + if (!NVolErrors(vol)) { + if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) + ntfs_warning(sb, "Failed to clear dirty bit " + "in volume information " + "flags. Run chkdsk."); + ntfs_commit_inode(vol->vol_ino); + ntfs_commit_inode(vol->root_ino); + if (vol->mftmirr_ino) + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + } else { + ntfs_warning(sb, "Volume has errors. Leaving volume " + "marked dirty. Run chkdsk."); + } + } +#endif /* NTFS_RW */ + + iput(vol->vol_ino); + vol->vol_ino = NULL; + + /* NTFS 3.0+ specific clean up. */ + if (vol->major_ver >= 3) { +#ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } + if (vol->quota_q_ino) { + iput(vol->quota_q_ino); + vol->quota_q_ino = NULL; + } + if (vol->quota_ino) { + iput(vol->quota_ino); + vol->quota_ino = NULL; + } +#endif /* NTFS_RW */ + if (vol->extend_ino) { + iput(vol->extend_ino); + vol->extend_ino = NULL; + } + if (vol->secure_ino) { + iput(vol->secure_ino); + vol->secure_ino = NULL; + } + } + + iput(vol->root_ino); + vol->root_ino = NULL; + + down_write(&vol->lcnbmp_lock); + iput(vol->lcnbmp_ino); + vol->lcnbmp_ino = NULL; + up_write(&vol->lcnbmp_lock); + + down_write(&vol->mftbmp_lock); + iput(vol->mftbmp_ino); + vol->mftbmp_ino = NULL; + up_write(&vol->mftbmp_lock); + +#ifdef NTFS_RW + if (vol->logfile_ino) { + iput(vol->logfile_ino); + vol->logfile_ino = NULL; + } + if (vol->mftmirr_ino) { + /* Re-commit the mft mirror and mft just in case. */ + ntfs_commit_inode(vol->mftmirr_ino); + ntfs_commit_inode(vol->mft_ino); + iput(vol->mftmirr_ino); + vol->mftmirr_ino = NULL; + } + /* + * We should have no dirty inodes left, due to + * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as + * the underlying mft records are written out and cleaned. + */ + ntfs_commit_inode(vol->mft_ino); + write_inode_now(vol->mft_ino, 1); +#endif /* NTFS_RW */ + + iput(vol->mft_ino); + vol->mft_ino = NULL; + + /* Throw away the table of attribute definitions. */ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + /* + * Destroy the global default upcase table if necessary. Also decrease + * the number of upcase users if we are a user. + */ + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + if (!ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) + free_compression_buffers(); + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + + unload_nls(vol->nls_map); + + sb->s_fs_info = NULL; + kfree(vol); +} + +/** + * get_nr_free_clusters - return the number of free clusters on a volume + * @vol: ntfs volume for which to obtain free cluster count + * + * Calculate the number of free clusters on the mounted NTFS volume @vol. We + * actually calculate the number of clusters in use instead because this + * allows us to not care about partial pages as these will be just zero filled + * and hence not be counted as allocated clusters. + * + * The only particularity is that clusters beyond the end of the logical ntfs + * volume will be marked as allocated to prevent errors which means we have to + * discount those at the end. This is important as the cluster bitmap always + * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside + * the logical volume and marked in use when they are not as they do not exist. + * + * If any pages cannot be read we assume all clusters in the erroring pages are + * in use. This means we return an underestimate on errors which is better than + * an overestimate. + */ +static s64 get_nr_free_clusters(ntfs_volume *vol) +{ + s64 nr_free = vol->nr_clusters; + struct address_space *mapping = vol->lcnbmp_ino->i_mapping; + struct page *page; + pgoff_t index, max_index; + + ntfs_debug("Entering."); + /* Serialize accesses to the cluster bitmap. */ + down_read(&vol->lcnbmp_lock); + /* + * Convert the number of bits into bytes rounded up, then convert into + * multiples of PAGE_SIZE, rounding up so that if we have one + * full and one partial page max_index = 2. + */ + max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >> + PAGE_SHIFT; + /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ + ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", + max_index, PAGE_SIZE / 4); + for (index = 0; index < max_index; index++) { + unsigned long *kaddr; + + /* + * Read the page from page cache, getting it from backing store + * if necessary, and increment the use count. + */ + page = read_mapping_page(mapping, index, NULL); + /* Ignore pages which errored synchronously. */ + if (IS_ERR(page)) { + ntfs_debug("read_mapping_page() error. Skipping " + "page (index 0x%lx).", index); + nr_free -= PAGE_SIZE * 8; + continue; + } + kaddr = kmap_atomic(page); + /* + * Subtract the number of set bits. If this + * is the last page and it is partial we don't really care as + * it just means we do a little extra work but it won't affect + * the result as all out of range bytes are set to zero by + * ntfs_readpage(). + */ + nr_free -= bitmap_weight(kaddr, + PAGE_SIZE * BITS_PER_BYTE); + kunmap_atomic(kaddr); + put_page(page); + } + ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1); + /* + * Fixup for eventual bits outside logical ntfs volume (see function + * description above). + */ + if (vol->nr_clusters & 63) + nr_free += 64 - (vol->nr_clusters & 63); + up_read(&vol->lcnbmp_lock); + /* If errors occurred we may well have gone below zero, fix this. */ + if (nr_free < 0) + nr_free = 0; + ntfs_debug("Exiting."); + return nr_free; +} + +/** + * __get_nr_free_mft_records - return the number of free inodes on a volume + * @vol: ntfs volume for which to obtain free inode count + * @nr_free: number of mft records in filesystem + * @max_index: maximum number of pages containing set bits + * + * Calculate the number of free mft records (inodes) on the mounted NTFS + * volume @vol. We actually calculate the number of mft records in use instead + * because this allows us to not care about partial pages as these will be just + * zero filled and hence not be counted as allocated mft record. + * + * If any pages cannot be read we assume all mft records in the erroring pages + * are in use. This means we return an underestimate on errors which is better + * than an overestimate. + * + * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing. + */ +static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, + s64 nr_free, const pgoff_t max_index) +{ + struct address_space *mapping = vol->mftbmp_ino->i_mapping; + struct page *page; + pgoff_t index; + + ntfs_debug("Entering."); + /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */ + ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " + "0x%lx.", max_index, PAGE_SIZE / 4); + for (index = 0; index < max_index; index++) { + unsigned long *kaddr; + + /* + * Read the page from page cache, getting it from backing store + * if necessary, and increment the use count. + */ + page = read_mapping_page(mapping, index, NULL); + /* Ignore pages which errored synchronously. */ + if (IS_ERR(page)) { + ntfs_debug("read_mapping_page() error. Skipping " + "page (index 0x%lx).", index); + nr_free -= PAGE_SIZE * 8; + continue; + } + kaddr = kmap_atomic(page); + /* + * Subtract the number of set bits. If this + * is the last page and it is partial we don't really care as + * it just means we do a little extra work but it won't affect + * the result as all out of range bytes are set to zero by + * ntfs_readpage(). + */ + nr_free -= bitmap_weight(kaddr, + PAGE_SIZE * BITS_PER_BYTE); + kunmap_atomic(kaddr); + put_page(page); + } + ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.", + index - 1); + /* If errors occurred we may well have gone below zero, fix this. */ + if (nr_free < 0) + nr_free = 0; + ntfs_debug("Exiting."); + return nr_free; +} + +/** + * ntfs_statfs - return information about mounted NTFS volume + * @dentry: dentry from mounted volume + * @sfs: statfs structure in which to return the information + * + * Return information about the mounted NTFS volume @dentry in the statfs structure + * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is + * called). We interpret the values to be correct of the moment in time at + * which we are called. Most values are variable otherwise and this isn't just + * the free values but the totals as well. For example we can increase the + * total number of file nodes if we run out and we can keep doing this until + * there is no more space on the volume left at all. + * + * Called from vfs_statfs which is used to handle the statfs, fstatfs, and + * ustat system calls. + * + * Return 0 on success or -errno on error. + */ +static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) +{ + struct super_block *sb = dentry->d_sb; + s64 size; + ntfs_volume *vol = NTFS_SB(sb); + ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); + pgoff_t max_index; + unsigned long flags; + + ntfs_debug("Entering."); + /* Type of filesystem. */ + sfs->f_type = NTFS_SB_MAGIC; + /* Optimal transfer block size. */ + sfs->f_bsize = PAGE_SIZE; + /* + * Total data blocks in filesystem in units of f_bsize and since + * inodes are also stored in data blocs ($MFT is a file) this is just + * the total clusters. + */ + sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >> + PAGE_SHIFT; + /* Free data blocks in filesystem in units of f_bsize. */ + size = get_nr_free_clusters(vol) << vol->cluster_size_bits >> + PAGE_SHIFT; + if (size < 0LL) + size = 0LL; + /* Free blocks avail to non-superuser, same as above on NTFS. */ + sfs->f_bavail = sfs->f_bfree = size; + /* Serialize accesses to the inode bitmap. */ + down_read(&vol->mftbmp_lock); + read_lock_irqsave(&mft_ni->size_lock, flags); + size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits; + /* + * Convert the maximum number of set bits into bytes rounded up, then + * convert into multiples of PAGE_SIZE, rounding up so that if we + * have one full and one partial page max_index = 2. + */ + max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits) + + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT; + read_unlock_irqrestore(&mft_ni->size_lock, flags); + /* Number of inodes in filesystem (at this point in time). */ + sfs->f_files = size; + /* Free inodes in fs (based on current total count). */ + sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index); + up_read(&vol->mftbmp_lock); + /* + * File system id. This is extremely *nix flavour dependent and even + * within Linux itself all fs do their own thing. I interpret this to + * mean a unique id associated with the mounted fs and not the id + * associated with the filesystem driver, the latter is already given + * by the filesystem type in sfs->f_type. Thus we use the 64-bit + * volume serial number splitting it into two 32-bit parts. We enter + * the least significant 32-bits in f_fsid[0] and the most significant + * 32-bits in f_fsid[1]. + */ + sfs->f_fsid = u64_to_fsid(vol->serial_no); + /* Maximum length of filenames. */ + sfs->f_namelen = NTFS_MAX_NAME_LEN; + return 0; +} + +#ifdef NTFS_RW +static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc) +{ + return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL); +} +#endif + +/** + * The complete super operations. + */ +static const struct super_operations ntfs_sops = { + .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */ + .free_inode = ntfs_free_big_inode, /* VFS: Deallocate inode. */ +#ifdef NTFS_RW + .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to + disk. */ +#endif /* NTFS_RW */ + .put_super = ntfs_put_super, /* Syscall: umount. */ + .statfs = ntfs_statfs, /* Syscall: statfs */ + .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */ + .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is + removed from memory. */ + .show_options = ntfs_show_options, /* Show mount options in + proc. */ +}; + +/** + * ntfs_fill_super - mount an ntfs filesystem + * @sb: super block of ntfs filesystem to mount + * @opt: string containing the mount options + * @silent: silence error output + * + * ntfs_fill_super() is called by the VFS to mount the device described by @sb + * with the mount otions in @data with the NTFS filesystem. + * + * If @silent is true, remain silent even if errors are detected. This is used + * during bootup, when the kernel tries to mount the root filesystem with all + * registered filesystems one after the other until one succeeds. This implies + * that all filesystems except the correct one will quite correctly and + * expectedly return an error, but nobody wants to see error messages when in + * fact this is what is supposed to happen. + * + * NOTE: @sb->s_flags contains the mount options flags. + */ +static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) +{ + ntfs_volume *vol; + struct buffer_head *bh; + struct inode *tmp_ino; + int blocksize, result; + + /* + * We do a pretty difficult piece of bootstrap by reading the + * MFT (and other metadata) from disk into memory. We'll only + * release this metadata during umount, so the locking patterns + * observed during bootstrap do not count. So turn off the + * observation of locking patterns (strictly for this context + * only) while mounting NTFS. [The validator is still active + * otherwise, even for this context: it will for example record + * lock class registrations.] + */ + lockdep_off(); + ntfs_debug("Entering."); +#ifndef NTFS_RW + sb->s_flags |= SB_RDONLY; +#endif /* ! NTFS_RW */ + /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ + sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); + vol = NTFS_SB(sb); + if (!vol) { + if (!silent) + ntfs_error(sb, "Allocation of NTFS volume structure " + "failed. Aborting mount..."); + lockdep_on(); + return -ENOMEM; + } + /* Initialize ntfs_volume structure. */ + *vol = (ntfs_volume) { + .sb = sb, + /* + * Default is group and other don't have any access to files or + * directories while owner has full access. Further, files by + * default are not executable but directories are of course + * browseable. + */ + .fmask = 0177, + .dmask = 0077, + }; + init_rwsem(&vol->mftbmp_lock); + init_rwsem(&vol->lcnbmp_lock); + + /* By default, enable sparse support. */ + NVolSetSparseEnabled(vol); + + /* Important to get the mount options dealt with now. */ + if (!parse_options(vol, (char*)opt)) + goto err_out_now; + + /* We support sector sizes up to the PAGE_SIZE. */ + if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) { + if (!silent) + ntfs_error(sb, "Device has unsupported sector size " + "(%i). The maximum supported sector " + "size on this architecture is %lu " + "bytes.", + bdev_logical_block_size(sb->s_bdev), + PAGE_SIZE); + goto err_out_now; + } + /* + * Setup the device access block size to NTFS_BLOCK_SIZE or the hard + * sector size, whichever is bigger. + */ + blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE); + if (blocksize < NTFS_BLOCK_SIZE) { + if (!silent) + ntfs_error(sb, "Unable to set device block size."); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + ntfs_debug("Set device block size to %i bytes (block size bits %i).", + blocksize, sb->s_blocksize_bits); + /* Determine the size of the device in units of block_size bytes. */ + vol->nr_blocks = sb_bdev_nr_blocks(sb); + if (!vol->nr_blocks) { + if (!silent) + ntfs_error(sb, "Unable to determine device size."); + goto err_out_now; + } + /* Read the boot sector and return unlocked buffer head to it. */ + if (!(bh = read_ntfs_boot_sector(sb, silent))) { + if (!silent) + ntfs_error(sb, "Not an NTFS volume."); + goto err_out_now; + } + /* + * Extract the data from the boot sector and setup the ntfs volume + * using it. + */ + result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); + brelse(bh); + if (!result) { + if (!silent) + ntfs_error(sb, "Unsupported NTFS filesystem."); + goto err_out_now; + } + /* + * If the boot sector indicates a sector size bigger than the current + * device block size, switch the device block size to the sector size. + * TODO: It may be possible to support this case even when the set + * below fails, we would just be breaking up the i/o for each sector + * into multiple blocks for i/o purposes but otherwise it should just + * work. However it is safer to leave disabled until someone hits this + * error message and then we can get them to try it without the setting + * so we know for sure that it works. + */ + if (vol->sector_size > blocksize) { + blocksize = sb_set_blocksize(sb, vol->sector_size); + if (blocksize != vol->sector_size) { + if (!silent) + ntfs_error(sb, "Unable to set device block " + "size to sector size (%i).", + vol->sector_size); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + vol->nr_blocks = sb_bdev_nr_blocks(sb); + ntfs_debug("Changed device block size to %i bytes (block size " + "bits %i) to match volume sector size.", + blocksize, sb->s_blocksize_bits); + } + /* Initialize the cluster and mft allocators. */ + ntfs_setup_allocators(vol); + /* Setup remaining fields in the super block. */ + sb->s_magic = NTFS_SB_MAGIC; + /* + * Ntfs allows 63 bits for the file size, i.e. correct would be: + * sb->s_maxbytes = ~0ULL >> 1; + * But the kernel uses a long as the page cache page index which on + * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel + * defined to the maximum the page cache page index can cope with + * without overflowing the index or to 2^63 - 1, whichever is smaller. + */ + sb->s_maxbytes = MAX_LFS_FILESIZE; + /* Ntfs measures time in 100ns intervals. */ + sb->s_time_gran = 100; + /* + * Now load the metadata required for the page cache and our address + * space operations to function. We do this by setting up a specialised + * read_inode method and then just calling the normal iget() to obtain + * the inode for $MFT which is sufficient to allow our normal inode + * operations and associated address space operations to function. + */ + sb->s_op = &ntfs_sops; + tmp_ino = new_inode(sb); + if (!tmp_ino) { + if (!silent) + ntfs_error(sb, "Failed to load essential metadata."); + goto err_out_now; + } + tmp_ino->i_ino = FILE_MFT; + insert_inode_hash(tmp_ino); + if (ntfs_read_inode_mount(tmp_ino) < 0) { + if (!silent) + ntfs_error(sb, "Failed to load essential metadata."); + goto iput_tmp_ino_err_out_now; + } + mutex_lock(&ntfs_lock); + /* + * The current mount is a compression user if the cluster size is + * less than or equal 4kiB. + */ + if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) { + result = allocate_compression_buffers(); + if (result) { + ntfs_error(NULL, "Failed to allocate buffers " + "for compression engine."); + ntfs_nr_compression_users--; + mutex_unlock(&ntfs_lock); + goto iput_tmp_ino_err_out_now; + } + } + /* + * Generate the global default upcase table if necessary. Also + * temporarily increment the number of upcase users to avoid race + * conditions with concurrent (u)mounts. + */ + if (!default_upcase) + default_upcase = generate_default_upcase(); + ntfs_nr_upcase_users++; + mutex_unlock(&ntfs_lock); + /* + * From now on, ignore @silent parameter. If we fail below this line, + * it will be due to a corrupt fs or a system error, so we report it. + */ + /* + * Open the system files with normal access functions and complete + * setting up the ntfs super block. + */ + if (!load_system_files(vol)) { + ntfs_error(sb, "Failed to load system files."); + goto unl_upcase_iput_tmp_ino_err_out_now; + } + + /* We grab a reference, simulating an ntfs_iget(). */ + ihold(vol->root_ino); + if ((sb->s_root = d_make_root(vol->root_ino))) { + ntfs_debug("Exiting, status successful."); + /* Release the default upcase if it has no users. */ + mutex_lock(&ntfs_lock); + if (!--ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + mutex_unlock(&ntfs_lock); + sb->s_export_op = &ntfs_export_ops; + lockdep_on(); + return 0; + } + ntfs_error(sb, "Failed to allocate root directory."); + /* Clean up after the successful load_system_files() call from above. */ + // TODO: Use ntfs_put_super() instead of repeating all this code... + // FIXME: Should mark the volume clean as the error is most likely + // -ENOMEM. + iput(vol->vol_ino); + vol->vol_ino = NULL; + /* NTFS 3.0+ specific clean up. */ + if (vol->major_ver >= 3) { +#ifdef NTFS_RW + if (vol->usnjrnl_j_ino) { + iput(vol->usnjrnl_j_ino); + vol->usnjrnl_j_ino = NULL; + } + if (vol->usnjrnl_max_ino) { + iput(vol->usnjrnl_max_ino); + vol->usnjrnl_max_ino = NULL; + } + if (vol->usnjrnl_ino) { + iput(vol->usnjrnl_ino); + vol->usnjrnl_ino = NULL; + } + if (vol->quota_q_ino) { + iput(vol->quota_q_ino); + vol->quota_q_ino = NULL; + } + if (vol->quota_ino) { + iput(vol->quota_ino); + vol->quota_ino = NULL; + } +#endif /* NTFS_RW */ + if (vol->extend_ino) { + iput(vol->extend_ino); + vol->extend_ino = NULL; + } + if (vol->secure_ino) { + iput(vol->secure_ino); + vol->secure_ino = NULL; + } + } + iput(vol->root_ino); + vol->root_ino = NULL; + iput(vol->lcnbmp_ino); + vol->lcnbmp_ino = NULL; + iput(vol->mftbmp_ino); + vol->mftbmp_ino = NULL; +#ifdef NTFS_RW + if (vol->logfile_ino) { + iput(vol->logfile_ino); + vol->logfile_ino = NULL; + } + if (vol->mftmirr_ino) { + iput(vol->mftmirr_ino); + vol->mftmirr_ino = NULL; + } +#endif /* NTFS_RW */ + /* Throw away the table of attribute definitions. */ + vol->attrdef_size = 0; + if (vol->attrdef) { + ntfs_free(vol->attrdef); + vol->attrdef = NULL; + } + vol->upcase_len = 0; + mutex_lock(&ntfs_lock); + if (vol->upcase == default_upcase) { + ntfs_nr_upcase_users--; + vol->upcase = NULL; + } + mutex_unlock(&ntfs_lock); + if (vol->upcase) { + ntfs_free(vol->upcase); + vol->upcase = NULL; + } + if (vol->nls_map) { + unload_nls(vol->nls_map); + vol->nls_map = NULL; + } + /* Error exit code path. */ +unl_upcase_iput_tmp_ino_err_out_now: + /* + * Decrease the number of upcase users and destroy the global default + * upcase table if necessary. + */ + mutex_lock(&ntfs_lock); + if (!--ntfs_nr_upcase_users && default_upcase) { + ntfs_free(default_upcase); + default_upcase = NULL; + } + if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users) + free_compression_buffers(); + mutex_unlock(&ntfs_lock); +iput_tmp_ino_err_out_now: + iput(tmp_ino); + if (vol->mft_ino && vol->mft_ino != tmp_ino) + iput(vol->mft_ino); + vol->mft_ino = NULL; + /* Errors at this stage are irrelevant. */ +err_out_now: + sb->s_fs_info = NULL; + kfree(vol); + ntfs_debug("Failed, returning -EINVAL."); + lockdep_on(); + return -EINVAL; +} + +/* + * This is a slab cache to optimize allocations and deallocations of Unicode + * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN + * (255) Unicode characters + a terminating NULL Unicode character. + */ +struct kmem_cache *ntfs_name_cache; + +/* Slab caches for efficient allocation/deallocation of inodes. */ +struct kmem_cache *ntfs_inode_cache; +struct kmem_cache *ntfs_big_inode_cache; + +/* Init once constructor for the inode slab cache. */ +static void ntfs_big_inode_init_once(void *foo) +{ + ntfs_inode *ni = (ntfs_inode *)foo; + + inode_init_once(VFS_I(ni)); +} + +/* + * Slab caches to optimize allocations and deallocations of attribute search + * contexts and index contexts, respectively. + */ +struct kmem_cache *ntfs_attr_ctx_cache; +struct kmem_cache *ntfs_index_ctx_cache; + +/* Driver wide mutex. */ +DEFINE_MUTEX(ntfs_lock); + +static struct dentry *ntfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); +} + +static struct file_system_type ntfs_fs_type = { + .owner = THIS_MODULE, + .name = "ntfs", + .mount = ntfs_mount, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; +MODULE_ALIAS_FS("ntfs"); + +/* Stable names for the slab caches. */ +static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache"; +static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache"; +static const char ntfs_name_cache_name[] = "ntfs_name_cache"; +static const char ntfs_inode_cache_name[] = "ntfs_inode_cache"; +static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache"; + +static int __init init_ntfs_fs(void) +{ + int err = 0; + + /* This may be ugly but it results in pretty output so who cares. (-8 */ + pr_info("driver " NTFS_VERSION " [Flags: R/" +#ifdef NTFS_RW + "W" +#else + "O" +#endif +#ifdef DEBUG + " DEBUG" +#endif +#ifdef MODULE + " MODULE" +#endif + "].\n"); + + ntfs_debug("Debug messages are enabled."); + + ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name, + sizeof(ntfs_index_context), 0 /* offset */, + SLAB_HWCACHE_ALIGN, NULL /* ctor */); + if (!ntfs_index_ctx_cache) { + pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name); + goto ictx_err_out; + } + ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name, + sizeof(ntfs_attr_search_ctx), 0 /* offset */, + SLAB_HWCACHE_ALIGN, NULL /* ctor */); + if (!ntfs_attr_ctx_cache) { + pr_crit("NTFS: Failed to create %s!\n", + ntfs_attr_ctx_cache_name); + goto actx_err_out; + } + + ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name, + (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ntfs_name_cache) { + pr_crit("Failed to create %s!\n", ntfs_name_cache_name); + goto name_err_out; + } + + ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name, + sizeof(ntfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); + if (!ntfs_inode_cache) { + pr_crit("Failed to create %s!\n", ntfs_inode_cache_name); + goto inode_err_out; + } + + ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, + sizeof(big_ntfs_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ntfs_big_inode_init_once); + if (!ntfs_big_inode_cache) { + pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); + goto big_inode_err_out; + } + + /* Register the ntfs sysctls. */ + err = ntfs_sysctl(1); + if (err) { + pr_crit("Failed to register NTFS sysctls!\n"); + goto sysctl_err_out; + } + + err = register_filesystem(&ntfs_fs_type); + if (!err) { + ntfs_debug("NTFS driver registered successfully."); + return 0; /* Success! */ + } + pr_crit("Failed to register NTFS filesystem driver!\n"); + + /* Unregister the ntfs sysctls. */ + ntfs_sysctl(0); +sysctl_err_out: + kmem_cache_destroy(ntfs_big_inode_cache); +big_inode_err_out: + kmem_cache_destroy(ntfs_inode_cache); +inode_err_out: + kmem_cache_destroy(ntfs_name_cache); +name_err_out: + kmem_cache_destroy(ntfs_attr_ctx_cache); +actx_err_out: + kmem_cache_destroy(ntfs_index_ctx_cache); +ictx_err_out: + if (!err) { + pr_crit("Aborting NTFS filesystem driver registration...\n"); + err = -ENOMEM; + } + return err; +} + +static void __exit exit_ntfs_fs(void) +{ + ntfs_debug("Unregistering NTFS driver."); + + unregister_filesystem(&ntfs_fs_type); + + /* + * Make sure all delayed rcu free inodes are flushed before we + * destroy cache. + */ + rcu_barrier(); + kmem_cache_destroy(ntfs_big_inode_cache); + kmem_cache_destroy(ntfs_inode_cache); + kmem_cache_destroy(ntfs_name_cache); + kmem_cache_destroy(ntfs_attr_ctx_cache); + kmem_cache_destroy(ntfs_index_ctx_cache); + /* Unregister the ntfs sysctls. */ + ntfs_sysctl(0); +} + +MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); +MODULE_VERSION(NTFS_VERSION); +MODULE_LICENSE("GPL"); +#ifdef DEBUG +module_param(debug_msgs, bint, 0); +MODULE_PARM_DESC(debug_msgs, "Enable debug messages."); +#endif + +module_init(init_ntfs_fs) +module_exit(exit_ntfs_fs) diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c new file mode 100644 index 000000000..a030d00af --- /dev/null +++ b/fs/ntfs/sysctl.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. Adapted from the old NTFS driver, + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * + * Copyright (c) 2002-2005 Anton Altaparmakov + */ + +#ifdef DEBUG + +#include <linux/module.h> + +#ifdef CONFIG_SYSCTL + +#include <linux/proc_fs.h> +#include <linux/sysctl.h> + +#include "sysctl.h" +#include "debug.h" + +/* Definition of the ntfs sysctl. */ +static struct ctl_table ntfs_sysctls[] = { + { + .procname = "ntfs-debug", + .data = &debug_msgs, /* Data pointer and size. */ + .maxlen = sizeof(debug_msgs), + .mode = 0644, /* Mode, proc handler. */ + .proc_handler = proc_dointvec + }, + {} +}; + +/* Define the parent directory /proc/sys/fs. */ +static struct ctl_table sysctls_root[] = { + { + .procname = "fs", + .mode = 0555, + .child = ntfs_sysctls + }, + {} +}; + +/* Storage for the sysctls header. */ +static struct ctl_table_header *sysctls_root_table; + +/** + * ntfs_sysctl - add or remove the debug sysctl + * @add: add (1) or remove (0) the sysctl + * + * Add or remove the debug sysctl. Return 0 on success or -errno on error. + */ +int ntfs_sysctl(int add) +{ + if (add) { + BUG_ON(sysctls_root_table); + sysctls_root_table = register_sysctl_table(sysctls_root); + if (!sysctls_root_table) + return -ENOMEM; + } else { + BUG_ON(!sysctls_root_table); + unregister_sysctl_table(sysctls_root_table); + sysctls_root_table = NULL; + } + return 0; +} + +#endif /* CONFIG_SYSCTL */ +#endif /* DEBUG */ diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h new file mode 100644 index 000000000..96bb2299d --- /dev/null +++ b/fs/ntfs/sysctl.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of + * the Linux-NTFS project. Adapted from the old NTFS driver, + * Copyright (C) 1997 Martin von Löwis, Régis Duchesne + * + * Copyright (c) 2002-2004 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_SYSCTL_H +#define _LINUX_NTFS_SYSCTL_H + + +#if defined(DEBUG) && defined(CONFIG_SYSCTL) + +extern int ntfs_sysctl(int add); + +#else + +/* Just return success. */ +static inline int ntfs_sysctl(int add) +{ + return 0; +} + +#endif /* DEBUG && CONFIG_SYSCTL */ +#endif /* _LINUX_NTFS_SYSCTL_H */ diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h new file mode 100644 index 000000000..6b6326130 --- /dev/null +++ b/fs/ntfs/time.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * time.h - NTFS time conversion functions. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_TIME_H +#define _LINUX_NTFS_TIME_H + +#include <linux/time.h> /* For current_kernel_time(). */ +#include <asm/div64.h> /* For do_div(). */ + +#include "endian.h" + +#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000) + +/** + * utc2ntfs - convert Linux UTC time to NTFS time + * @ts: Linux UTC time to convert to NTFS time + * + * Convert the Linux UTC time @ts to its corresponding NTFS time and return + * that in little endian format. + * + * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec + * and a long tv_nsec where tv_sec is the number of 1-second intervals since + * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second + * intervals since the value of tv_sec. + * + * NTFS uses Microsoft's standard time format which is stored in a s64 and is + * measured as the number of 100-nano-second intervals since 1st January 1601, + * 00:00:00 UTC. + */ +static inline sle64 utc2ntfs(const struct timespec64 ts) +{ + /* + * Convert the seconds to 100ns intervals, add the nano-seconds + * converted to 100ns intervals, and then add the NTFS time offset. + */ + return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 + + NTFS_TIME_OFFSET); +} + +/** + * get_current_ntfs_time - get the current time in little endian NTFS format + * + * Get the current time from the Linux kernel, convert it to its corresponding + * NTFS time and return that in little endian format. + */ +static inline sle64 get_current_ntfs_time(void) +{ + struct timespec64 ts; + + ktime_get_coarse_real_ts64(&ts); + return utc2ntfs(ts); +} + +/** + * ntfs2utc - convert NTFS time to Linux time + * @time: NTFS time (little endian) to convert to Linux UTC + * + * Convert the little endian NTFS time @time to its corresponding Linux UTC + * time and return that in cpu format. + * + * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec + * and a long tv_nsec where tv_sec is the number of 1-second intervals since + * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second + * intervals since the value of tv_sec. + * + * NTFS uses Microsoft's standard time format which is stored in a s64 and is + * measured as the number of 100 nano-second intervals since 1st January 1601, + * 00:00:00 UTC. + */ +static inline struct timespec64 ntfs2utc(const sle64 time) +{ + struct timespec64 ts; + + /* Subtract the NTFS time offset. */ + u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET); + /* + * Convert the time to 1-second intervals and the remainder to + * 1-nano-second intervals. + */ + ts.tv_nsec = do_div(t, 10000000) * 100; + ts.tv_sec = t; + return ts; +} + +#endif /* _LINUX_NTFS_TIME_H */ diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h new file mode 100644 index 000000000..9a47859e7 --- /dev/null +++ b/fs/ntfs/types.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * types.h - Defines for NTFS Linux kernel driver specific types. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_TYPES_H +#define _LINUX_NTFS_TYPES_H + +#include <linux/types.h> + +typedef __le16 le16; +typedef __le32 le32; +typedef __le64 le64; +typedef __u16 __bitwise sle16; +typedef __u32 __bitwise sle32; +typedef __u64 __bitwise sle64; + +/* 2-byte Unicode character type. */ +typedef le16 ntfschar; +#define UCHAR_T_SIZE_BITS 1 + +/* + * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN + * and VCN, to allow for type checking and better code readability. + */ +typedef s64 VCN; +typedef sle64 leVCN; +typedef s64 LCN; +typedef sle64 leLCN; + +/* + * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit + * values. We define our own type LSN, to allow for type checking and better + * code readability. + */ +typedef s64 LSN; +typedef sle64 leLSN; + +/* + * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values. + * We define our own type USN, to allow for type checking and better code + * readability. + */ +typedef s64 USN; +typedef sle64 leUSN; + +typedef enum { + CASE_SENSITIVE = 0, + IGNORE_CASE = 1, +} IGNORE_CASE_BOOL; + +#endif /* _LINUX_NTFS_TYPES_H */ diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c new file mode 100644 index 000000000..a6b6c64f1 --- /dev/null +++ b/fs/ntfs/unistr.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + */ + +#include <linux/slab.h> + +#include "types.h" +#include "debug.h" +#include "ntfs.h" + +/* + * IMPORTANT + * ========= + * + * All these routines assume that the Unicode characters are in little endian + * encoding inside the strings!!! + */ + +/* + * This is used by the name collation functions to quickly determine what + * characters are (in)valid. + */ +static const u8 legal_ansi_char_array[0x40] = { + 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, + + 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17, + 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00, + + 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, + 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18, +}; + +/** + * ntfs_are_names_equal - compare two Unicode names for equality + * @s1: name to compare to @s2 + * @s1_len: length in Unicode characters of @s1 + * @s2: name to compare to @s1 + * @s2_len: length in Unicode characters of @s2 + * @ic: ignore case bool + * @upcase: upcase table (only if @ic == IGNORE_CASE) + * @upcase_size: length in Unicode characters of @upcase (if present) + * + * Compare the names @s1 and @s2 and return 'true' (1) if the names are + * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE, + * the @upcase table is used to performa a case insensitive comparison. + */ +bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len, + const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_size) +{ + if (s1_len != s2_len) + return false; + if (ic == CASE_SENSITIVE) + return !ntfs_ucsncmp(s1, s2, s1_len); + return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size); +} + +/** + * ntfs_collate_names - collate two Unicode names + * @name1: first Unicode name to compare + * @name2: second Unicode name to compare + * @err_val: if @name1 contains an invalid character return this value + * @ic: either CASE_SENSITIVE or IGNORE_CASE + * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE) + * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE) + * + * ntfs_collate_names collates two Unicode names and returns: + * + * -1 if the first name collates before the second one, + * 0 if the names match, + * 1 if the second name collates before the first one, or + * @err_val if an invalid character is found in @name1 during the comparison. + * + * The following characters are considered invalid: '"', '*', '<', '>' and '?'. + */ +int ntfs_collate_names(const ntfschar *name1, const u32 name1_len, + const ntfschar *name2, const u32 name2_len, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len) +{ + u32 cnt, min_len; + u16 c1, c2; + + min_len = name1_len; + if (name1_len > name2_len) + min_len = name2_len; + for (cnt = 0; cnt < min_len; ++cnt) { + c1 = le16_to_cpu(*name1++); + c2 = le16_to_cpu(*name2++); + if (ic) { + if (c1 < upcase_len) + c1 = le16_to_cpu(upcase[c1]); + if (c2 < upcase_len) + c2 = le16_to_cpu(upcase[c2]); + } + if (c1 < 64 && legal_ansi_char_array[c1] & 8) + return err_val; + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + } + if (name1_len < name2_len) + return -1; + if (name1_len == name2_len) + return 0; + /* name1_len > name2_len */ + c1 = le16_to_cpu(*name1); + if (c1 < 64 && legal_ansi_char_array[c1] & 8) + return err_val; + return 1; +} + +/** + * ntfs_ucsncmp - compare two little endian Unicode strings + * @s1: first string + * @s2: second string + * @n: maximum unicode characters to compare + * + * Compare the first @n characters of the Unicode strings @s1 and @s2, + * The strings in little endian format and appropriate le16_to_cpu() + * conversion is performed on non-little endian machines. + * + * The function returns an integer less than, equal to, or greater than zero + * if @s1 (or the first @n Unicode characters thereof) is found, respectively, + * to be less than, to match, or be greater than @s2. + */ +int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n) +{ + u16 c1, c2; + size_t i; + + for (i = 0; i < n; ++i) { + c1 = le16_to_cpu(s1[i]); + c2 = le16_to_cpu(s2[i]); + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + if (!c1) + break; + } + return 0; +} + +/** + * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case + * @s1: first string + * @s2: second string + * @n: maximum unicode characters to compare + * @upcase: upcase table + * @upcase_size: upcase table size in Unicode characters + * + * Compare the first @n characters of the Unicode strings @s1 and @s2, + * ignoring case. The strings in little endian format and appropriate + * le16_to_cpu() conversion is performed on non-little endian machines. + * + * Each character is uppercased using the @upcase table before the comparison. + * + * The function returns an integer less than, equal to, or greater than zero + * if @s1 (or the first @n Unicode characters thereof) is found, respectively, + * to be less than, to match, or be greater than @s2. + */ +int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n, + const ntfschar *upcase, const u32 upcase_size) +{ + size_t i; + u16 c1, c2; + + for (i = 0; i < n; ++i) { + if ((c1 = le16_to_cpu(s1[i])) < upcase_size) + c1 = le16_to_cpu(upcase[c1]); + if ((c2 = le16_to_cpu(s2[i])) < upcase_size) + c2 = le16_to_cpu(upcase[c2]); + if (c1 < c2) + return -1; + if (c1 > c2) + return 1; + if (!c1) + break; + } + return 0; +} + +void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase, + const u32 upcase_len) +{ + u32 i; + u16 u; + + for (i = 0; i < name_len; i++) + if ((u = le16_to_cpu(name[i])) < upcase_len) + name[i] = upcase[u]; +} + +void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr, + const ntfschar *upcase, const u32 upcase_len) +{ + ntfs_upcase_name((ntfschar*)&file_name_attr->file_name, + file_name_attr->file_name_length, upcase, upcase_len); +} + +int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1, + FILE_NAME_ATTR *file_name_attr2, + const int err_val, const IGNORE_CASE_BOOL ic, + const ntfschar *upcase, const u32 upcase_len) +{ + return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name, + file_name_attr1->file_name_length, + (ntfschar*)&file_name_attr2->file_name, + file_name_attr2->file_name_length, + err_val, ic, upcase, upcase_len); +} + +/** + * ntfs_nlstoucs - convert NLS string to little endian Unicode string + * @vol: ntfs volume which we are working with + * @ins: input NLS string buffer + * @ins_len: length of input string in bytes + * @outs: on return contains the allocated output Unicode string buffer + * + * Convert the input string @ins, which is in whatever format the loaded NLS + * map dictates, into a little endian, 2-byte Unicode string. + * + * This function allocates the string and the caller is responsible for + * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it. + * + * On success the function returns the number of Unicode characters written to + * the output string *@outs (>= 0), not counting the terminating Unicode NULL + * character. *@outs is set to the allocated output string buffer. + * + * On error, a negative number corresponding to the error code is returned. In + * that case the output string is not allocated. Both *@outs and *@outs_len + * are then undefined. + * + * This might look a bit odd due to fast path optimization... + */ +int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins, + const int ins_len, ntfschar **outs) +{ + struct nls_table *nls = vol->nls_map; + ntfschar *ucs; + wchar_t wc; + int i, o, wc_len; + + /* We do not trust outside sources. */ + if (likely(ins)) { + ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS); + if (likely(ucs)) { + for (i = o = 0; i < ins_len; i += wc_len) { + wc_len = nls->char2uni(ins + i, ins_len - i, + &wc); + if (likely(wc_len >= 0 && + o < NTFS_MAX_NAME_LEN)) { + if (likely(wc)) { + ucs[o++] = cpu_to_le16(wc); + continue; + } /* else if (!wc) */ + break; + } /* else if (wc_len < 0 || + o >= NTFS_MAX_NAME_LEN) */ + goto name_err; + } + ucs[o] = 0; + *outs = ucs; + return o; + } /* else if (!ucs) */ + ntfs_error(vol->sb, "Failed to allocate buffer for converted " + "name from ntfs_name_cache."); + return -ENOMEM; + } /* else if (!ins) */ + ntfs_error(vol->sb, "Received NULL pointer."); + return -EINVAL; +name_err: + kmem_cache_free(ntfs_name_cache, ucs); + if (wc_len < 0) { + ntfs_error(vol->sb, "Name using character set %s contains " + "characters that cannot be converted to " + "Unicode.", nls->charset); + i = -EILSEQ; + } else /* if (o >= NTFS_MAX_NAME_LEN) */ { + ntfs_error(vol->sb, "Name is too long (maximum length for a " + "name on NTFS is %d Unicode characters.", + NTFS_MAX_NAME_LEN); + i = -ENAMETOOLONG; + } + return i; +} + +/** + * ntfs_ucstonls - convert little endian Unicode string to NLS string + * @vol: ntfs volume which we are working with + * @ins: input Unicode string buffer + * @ins_len: length of input string in Unicode characters + * @outs: on return contains the (allocated) output NLS string buffer + * @outs_len: length of output string buffer in bytes + * + * Convert the input little endian, 2-byte Unicode string @ins, of length + * @ins_len into the string format dictated by the loaded NLS. + * + * If *@outs is NULL, this function allocates the string and the caller is + * responsible for calling kfree(*@outs); when finished with it. In this case + * @outs_len is ignored and can be 0. + * + * On success the function returns the number of bytes written to the output + * string *@outs (>= 0), not counting the terminating NULL byte. If the output + * string buffer was allocated, *@outs is set to it. + * + * On error, a negative number corresponding to the error code is returned. In + * that case the output string is not allocated. The contents of *@outs are + * then undefined. + * + * This might look a bit odd due to fast path optimization... + */ +int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins, + const int ins_len, unsigned char **outs, int outs_len) +{ + struct nls_table *nls = vol->nls_map; + unsigned char *ns; + int i, o, ns_len, wc; + + /* We don't trust outside sources. */ + if (ins) { + ns = *outs; + ns_len = outs_len; + if (ns && !ns_len) { + wc = -ENAMETOOLONG; + goto conversion_err; + } + if (!ns) { + ns_len = ins_len * NLS_MAX_CHARSET_SIZE; + ns = kmalloc(ns_len + 1, GFP_NOFS); + if (!ns) + goto mem_err_out; + } + for (i = o = 0; i < ins_len; i++) { +retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o, + ns_len - o); + if (wc > 0) { + o += wc; + continue; + } else if (!wc) + break; + else if (wc == -ENAMETOOLONG && ns != *outs) { + unsigned char *tc; + /* Grow in multiples of 64 bytes. */ + tc = kmalloc((ns_len + 64) & + ~63, GFP_NOFS); + if (tc) { + memcpy(tc, ns, ns_len); + ns_len = ((ns_len + 64) & ~63) - 1; + kfree(ns); + ns = tc; + goto retry; + } /* No memory so goto conversion_error; */ + } /* wc < 0, real error. */ + goto conversion_err; + } + ns[o] = 0; + *outs = ns; + return o; + } /* else (!ins) */ + ntfs_error(vol->sb, "Received NULL pointer."); + return -EINVAL; +conversion_err: + ntfs_error(vol->sb, "Unicode name contains characters that cannot be " + "converted to character set %s. You might want to " + "try to use the mount option nls=utf8.", nls->charset); + if (ns != *outs) + kfree(ns); + if (wc != -ENAMETOOLONG) + wc = -EILSEQ; + return wc; +mem_err_out: + ntfs_error(vol->sb, "Failed to allocate name!"); + return -ENOMEM; +} diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c new file mode 100644 index 000000000..4ebe84a78 --- /dev/null +++ b/fs/ntfs/upcase.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * upcase.c - Generate the full NTFS Unicode upcase table in little endian. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org> + * Copyright (c) 2001-2006 Anton Altaparmakov + */ + +#include "malloc.h" +#include "ntfs.h" + +ntfschar *generate_default_upcase(void) +{ + static const int uc_run_table[][3] = { /* Start, End, Add */ + {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74}, + {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86}, + {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100}, + {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128}, + {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112}, + {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126}, + {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8}, + {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8}, + {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8}, + {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7}, + {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16}, + {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26}, + {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32}, + {0} + }; + + static const int uc_dup_table[][2] = { /* Start, End */ + {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC}, + {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB}, + {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5}, + {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9}, + {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95}, + {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9}, + {0} + }; + + static const int uc_word_table[][2] = { /* Offset, Value */ + {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196}, + {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C}, + {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D}, + {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F}, + {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9}, + {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE}, + {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7}, + {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197}, + {0} + }; + + int i, r; + ntfschar *uc; + + uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar)); + if (!uc) + return uc; + memset(uc, 0, default_upcase_len * sizeof(ntfschar)); + /* Generate the little endian Unicode upcase table used by ntfs. */ + for (i = 0; i < default_upcase_len; i++) + uc[i] = cpu_to_le16(i); + for (r = 0; uc_run_table[r][0]; r++) + for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) + le16_add_cpu(&uc[i], uc_run_table[r][2]); + for (r = 0; uc_dup_table[r][0]; r++) + for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) + le16_add_cpu(&uc[i + 1], -1); + for (r = 0; uc_word_table[r][0]; r++) + uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]); + return uc; +} diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c new file mode 100644 index 000000000..9097a0b4e --- /dev/null +++ b/fs/ntfs/usnjrnl.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling. Part of the + * Linux-NTFS project. + * + * Copyright (c) 2005 Anton Altaparmakov + */ + +#ifdef NTFS_RW + +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/mm.h> + +#include "aops.h" +#include "debug.h" +#include "endian.h" +#include "time.h" +#include "types.h" +#include "usnjrnl.h" +#include "volume.h" + +/** + * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume + * @vol: ntfs volume on which to stamp the transaction log + * + * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return + * 'true' on success and 'false' on error. + * + * This function assumes that the transaction log has already been loaded and + * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl(). + */ +bool ntfs_stamp_usnjrnl(ntfs_volume *vol) +{ + ntfs_debug("Entering."); + if (likely(!NVolUsnJrnlStamped(vol))) { + sle64 stamp; + struct page *page; + USN_HEADER *uh; + + page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0); + if (IS_ERR(page)) { + ntfs_error(vol->sb, "Failed to read from " + "$UsnJrnl/$DATA/$Max attribute."); + return false; + } + uh = (USN_HEADER*)page_address(page); + stamp = get_current_ntfs_time(); + ntfs_debug("Stamping transaction log ($UsnJrnl): old " + "journal_id 0x%llx, old lowest_valid_usn " + "0x%llx, new journal_id 0x%llx, new " + "lowest_valid_usn 0x%llx.", + (long long)sle64_to_cpu(uh->journal_id), + (long long)sle64_to_cpu(uh->lowest_valid_usn), + (long long)sle64_to_cpu(stamp), + i_size_read(vol->usnjrnl_j_ino)); + uh->lowest_valid_usn = + cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino)); + uh->journal_id = stamp; + flush_dcache_page(page); + set_page_dirty(page); + ntfs_unmap_page(page); + /* Set the flag so we do not have to do it again on remount. */ + NVolSetUsnJrnlStamped(vol); + } + ntfs_debug("Done."); + return true; +} + +#endif /* NTFS_RW */ diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h new file mode 100644 index 000000000..85f531b59 --- /dev/null +++ b/fs/ntfs/usnjrnl.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling. + * Part of the Linux-NTFS project. + * + * Copyright (c) 2005 Anton Altaparmakov + */ + +#ifndef _LINUX_NTFS_USNJRNL_H +#define _LINUX_NTFS_USNJRNL_H + +#ifdef NTFS_RW + +#include "types.h" +#include "endian.h" +#include "layout.h" +#include "volume.h" + +/* + * Transaction log ($UsnJrnl) organization: + * + * The transaction log records whenever a file is modified in any way. So for + * example it will record that file "blah" was written to at a particular time + * but not what was written. If will record that a file was deleted or + * created, that a file was truncated, etc. See below for all the reason + * codes used. + * + * The transaction log is in the $Extend directory which is in the root + * directory of each volume. If it is not present it means transaction + * logging is disabled. If it is present it means transaction logging is + * either enabled or in the process of being disabled in which case we can + * ignore it as it will go away as soon as Windows gets its hands on it. + * + * To determine whether the transaction logging is enabled or in the process + * of being disabled, need to check the volume flags in the + * $VOLUME_INFORMATION attribute in the $Volume system file (which is present + * in the root directory and has a fixed mft record number, see layout.h). + * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log + * is in the process of being disabled and if this flag is clear it means the + * transaction log is enabled. + * + * The transaction log consists of two parts; the $DATA/$Max attribute as well + * as the $DATA/$J attribute. $Max is a header describing the transaction + * log whilst $J is the transaction log data itself as a sequence of variable + * sized USN_RECORDs (see below for all the structures). + * + * We do not care about transaction logging at this point in time but we still + * need to let windows know that the transaction log is out of date. To do + * this we need to stamp the transaction log. This involves setting the + * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used + * for the next added USN_RECORD to the $DATA/$J attribute as well as + * generating a new journal_id in $DATA/$Max. + * + * The journal_id is as of the current version (2.0) of the transaction log + * simply the 64-bit timestamp of when the journal was either created or last + * stamped. + * + * To determine the next usn there are two ways. The first is to parse + * $DATA/$J and to find the last USN_RECORD in it and to add its record_length + * to its usn (which is the byte offset in the $DATA/$J attribute). The + * second is simply to take the data size of the attribute. Since the usns + * are simply byte offsets into $DATA/$J, this is exactly the next usn. For + * obvious reasons we use the second method as it is much simpler and faster. + * + * As an aside, note that to actually disable the transaction log, one would + * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go + * through all the mft records on the volume and set the usn field in their + * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need + * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally, + * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag. + * + * Note that if a volume is unmounted whilst the transaction log is being + * disabled, the process will continue the next time the volume is mounted. + * This is why we can safely mount read-write when we see a transaction log + * in the process of being deleted. + */ + +/* Some $UsnJrnl related constants. */ +#define UsnJrnlMajorVer 2 +#define UsnJrnlMinorVer 0 + +/* + * $DATA/$Max attribute. This is (always?) resident and has a fixed size of + * 32 bytes. It contains the header describing the transaction log. + */ +typedef struct { +/*Ofs*/ +/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J + attribute. */ +/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the + size of the $DATA/$J attribute. */ +/*0x10*/sle64 journal_id; /* Current id of the transaction log. */ +/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the + current journal_id. */ +/* sizeof() = 32 (0x20) bytes */ +} __attribute__ ((__packed__)) USN_HEADER; + +/* + * Reason flags (32-bit). Cumulative flags describing the change(s) to the + * file since it was last opened. I think the names speak for themselves but + * if you disagree check out the descriptions in the Linux NTFS project NTFS + * documentation: http://www.linux-ntfs.org/ + */ +enum { + USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001), + USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002), + USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004), + USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010), + USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020), + USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040), + USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100), + USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200), + USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400), + USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800), + USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000), + USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000), + USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000), + USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000), + USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000), + USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000), + USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000), + USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000), + USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000), + USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000), + USN_REASON_CLOSE = cpu_to_le32(0x80000000), +}; + +typedef le32 USN_REASON_FLAGS; + +/* + * Source info flags (32-bit). Information about the source of the change(s) + * to the file. For detailed descriptions of what these mean, see the Linux + * NTFS project NTFS documentation: + * http://www.linux-ntfs.org/ + */ +enum { + USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001), + USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002), + USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004), +}; + +typedef le32 USN_SOURCE_INFO_FLAGS; + +/* + * $DATA/$J attribute. This is always non-resident, is marked as sparse, and + * is of variabled size. It consists of a sequence of variable size + * USN_RECORDS. The minimum allocated_size is allocation_delta as + * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is + * exceeded by more than allocation_delta bytes, allocation_delta bytes are + * allocated and appended to the $DATA/$J attribute and an equal number of + * bytes at the beginning of the attribute are freed and made sparse. Note the + * making sparse only happens at volume checkpoints and hence the actual + * $DATA/$J size can exceed maximum_size + allocation_delta temporarily. + */ +typedef struct { +/*Ofs*/ +/* 0*/le32 length; /* Byte size of this record (8-byte + aligned). */ +/* 4*/le16 major_ver; /* Major version of the transaction log used + for this record. */ +/* 6*/le16 minor_ver; /* Minor version of the transaction log used + for this record. */ +/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or + directory) described by this record. */ +/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent + directory of the file described by this + record. */ +/*0x18*/leUSN usn; /* The usn of this record. Equals the offset + within the $DATA/$J attribute. */ +/*0x20*/sle64 time; /* Time when this record was created. */ +/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */ +/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */ +/*0x30*/le32 security_id; /* File security_id copied from + $STANDARD_INFORMATION. */ +/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from + $STANDARD_INFORMATION or $FILE_NAME (not + sure which). */ +/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */ +/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the + start of this record. */ +/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use + file_name_offset to determine the location + of the name. */ +/* sizeof() = 60 (0x3c) bytes */ +} __attribute__ ((__packed__)) USN_RECORD; + +extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol); + +#endif /* NTFS_RW */ + +#endif /* _LINUX_NTFS_USNJRNL_H */ diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h new file mode 100644 index 000000000..930a9ae8a --- /dev/null +++ b/fs/ntfs/volume.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part + * of the Linux-NTFS project. + * + * Copyright (c) 2001-2006 Anton Altaparmakov + * Copyright (c) 2002 Richard Russon + */ + +#ifndef _LINUX_NTFS_VOLUME_H +#define _LINUX_NTFS_VOLUME_H + +#include <linux/rwsem.h> +#include <linux/uidgid.h> + +#include "types.h" +#include "layout.h" + +/* + * The NTFS in memory super block structure. + */ +typedef struct { + /* + * FIXME: Reorder to have commonly used together element within the + * same cache line, aiming at a cache line size of 32 bytes. Aim for + * 64 bytes for less commonly used together elements. Put most commonly + * used elements to front of structure. Obviously do this only when the + * structure has stabilized... (AIA) + */ + /* Device specifics. */ + struct super_block *sb; /* Pointer back to the super_block. */ + LCN nr_blocks; /* Number of sb->s_blocksize bytes + sized blocks on the device. */ + /* Configuration provided by user at mount time. */ + unsigned long flags; /* Miscellaneous flags, see below. */ + kuid_t uid; /* uid that files will be mounted as. */ + kgid_t gid; /* gid that files will be mounted as. */ + umode_t fmask; /* The mask for file permissions. */ + umode_t dmask; /* The mask for directory + permissions. */ + u8 mft_zone_multiplier; /* Initial mft zone multiplier. */ + u8 on_errors; /* What to do on filesystem errors. */ + /* NTFS bootsector provided information. */ + u16 sector_size; /* in bytes */ + u8 sector_size_bits; /* log2(sector_size) */ + u32 cluster_size; /* in bytes */ + u32 cluster_size_mask; /* cluster_size - 1 */ + u8 cluster_size_bits; /* log2(cluster_size) */ + u32 mft_record_size; /* in bytes */ + u32 mft_record_size_mask; /* mft_record_size - 1 */ + u8 mft_record_size_bits; /* log2(mft_record_size) */ + u32 index_record_size; /* in bytes */ + u32 index_record_size_mask; /* index_record_size - 1 */ + u8 index_record_size_bits; /* log2(index_record_size) */ + LCN nr_clusters; /* Volume size in clusters == number of + bits in lcn bitmap. */ + LCN mft_lcn; /* Cluster location of mft data. */ + LCN mftmirr_lcn; /* Cluster location of copy of mft. */ + u64 serial_no; /* The volume serial number. */ + /* Mount specific NTFS information. */ + u32 upcase_len; /* Number of entries in upcase[]. */ + ntfschar *upcase; /* The upcase table. */ + + s32 attrdef_size; /* Size of the attribute definition + table in bytes. */ + ATTR_DEF *attrdef; /* Table of attribute definitions. + Obtained from FILE_AttrDef. */ + +#ifdef NTFS_RW + /* Variables used by the cluster and mft allocators. */ + s64 mft_data_pos; /* Mft record number at which to + allocate the next mft record. */ + LCN mft_zone_start; /* First cluster of the mft zone. */ + LCN mft_zone_end; /* First cluster beyond the mft zone. */ + LCN mft_zone_pos; /* Current position in the mft zone. */ + LCN data1_zone_pos; /* Current position in the first data + zone. */ + LCN data2_zone_pos; /* Current position in the second data + zone. */ +#endif /* NTFS_RW */ + + struct inode *mft_ino; /* The VFS inode of $MFT. */ + + struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */ + struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the + mft record bitmap ($MFT/$BITMAP). */ +#ifdef NTFS_RW + struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */ + int mftmirr_size; /* Size of mft mirror in mft records. */ + + struct inode *logfile_ino; /* The VFS inode of $LogFile. */ +#endif /* NTFS_RW */ + + struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */ + struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the + cluster bitmap ($Bitmap/$DATA). */ + + struct inode *vol_ino; /* The VFS inode of $Volume. */ + VOLUME_FLAGS vol_flags; /* Volume flags. */ + u8 major_ver; /* Ntfs major version of volume. */ + u8 minor_ver; /* Ntfs minor version of volume. */ + + struct inode *root_ino; /* The VFS inode of the root + directory. */ + struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+ + only, otherwise NULL). */ + struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+ + only, otherwise NULL). */ +#ifdef NTFS_RW + /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *quota_ino; /* The VFS inode of $Quota. */ + struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */ + /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */ + struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */ + struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */ + struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */ +#endif /* NTFS_RW */ + struct nls_table *nls_map; +} ntfs_volume; + +/* + * Defined bits for the flags field in the ntfs_volume structure. + */ +typedef enum { + NV_Errors, /* 1: Volume has errors, prevent remount rw. */ + NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */ + NV_CaseSensitive, /* 1: Treat file names as case sensitive and + create filenames in the POSIX namespace. + Otherwise be case insensitive but still + create file names in POSIX namespace. */ + NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ + NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ + NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ + NV_SparseEnabled, /* 1: May create sparse files. */ +} ntfs_volume_flags; + +/* + * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() + * functions. + */ +#define DEFINE_NVOL_BIT_OPS(flag) \ +static inline int NVol##flag(ntfs_volume *vol) \ +{ \ + return test_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolSet##flag(ntfs_volume *vol) \ +{ \ + set_bit(NV_##flag, &(vol)->flags); \ +} \ +static inline void NVolClear##flag(ntfs_volume *vol) \ +{ \ + clear_bit(NV_##flag, &(vol)->flags); \ +} + +/* Emit the ntfs volume bitops functions. */ +DEFINE_NVOL_BIT_OPS(Errors) +DEFINE_NVOL_BIT_OPS(ShowSystemFiles) +DEFINE_NVOL_BIT_OPS(CaseSensitive) +DEFINE_NVOL_BIT_OPS(LogFileEmpty) +DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) +DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) +DEFINE_NVOL_BIT_OPS(SparseEnabled) + +#endif /* _LINUX_NTFS_VOLUME_H */ diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig new file mode 100644 index 000000000..6e4cbc48a --- /dev/null +++ b/fs/ntfs3/Kconfig @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: GPL-2.0-only +config NTFS3_FS + tristate "NTFS Read-Write file system support" + select NLS + help + Windows OS native file system (NTFS) support up to NTFS version 3.1. + + Y or M enables the NTFS3 driver with full features enabled (read, + write, journal replaying, sparse/compressed files support). + File system type to use on mount is "ntfs3". Module name (M option) + is also "ntfs3". + + Documentation: <file:Documentation/filesystems/ntfs3.rst> + +config NTFS3_64BIT_CLUSTER + bool "64 bits per NTFS clusters" + depends on NTFS3_FS && 64BIT + help + Windows implementation of ntfs.sys uses 32 bits per clusters. + If activated 64 bits per clusters you will be able to use 4k cluster + for 16T+ volumes. Windows will not be able to mount such volumes. + + It is recommended to say N here. + +config NTFS3_LZX_XPRESS + bool "activate support of external compressions lzx/xpress" + depends on NTFS3_FS + help + In Windows 10 one can use command "compact" to compress any files. + 4 possible variants of compression are: xpress4k, xpress8k, xpress16k and lzx. + If activated you will be able to read such files correctly. + + It is recommended to say Y here. + +config NTFS3_FS_POSIX_ACL + bool "NTFS POSIX Access Control Lists" + depends on NTFS3_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support additional access rights + for users and groups beyond the standard owner/group/world scheme, + and this option selects support for ACLs specifically for ntfs + filesystems. + NOTE: this is linux only feature. Windows will ignore these ACLs. + + If you don't know what Access Control Lists are, say N. diff --git a/fs/ntfs3/Makefile b/fs/ntfs3/Makefile new file mode 100644 index 000000000..279701b62 --- /dev/null +++ b/fs/ntfs3/Makefile @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the ntfs3 filesystem support. +# + +# to check robot warnings +ccflags-y += -Wint-to-pointer-cast \ + $(call cc-option,-Wunused-but-set-variable,-Wunused-const-variable) \ + $(call cc-option,-Wold-style-declaration,-Wout-of-line-declaration) + +obj-$(CONFIG_NTFS3_FS) += ntfs3.o + +ntfs3-y := attrib.o \ + attrlist.o \ + bitfunc.o \ + bitmap.o \ + dir.o \ + fsntfs.o \ + frecord.o \ + file.o \ + fslog.o \ + inode.o \ + index.o \ + lznt.o \ + namei.o \ + record.o \ + run.o \ + super.o \ + upcase.o \ + xattr.o + +ntfs3-$(CONFIG_NTFS3_LZX_XPRESS) += $(addprefix lib/,\ + decompress_common.o \ + lzx_decompress.o \ + xpress_decompress.o \ + )
\ No newline at end of file diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c new file mode 100644 index 000000000..2215179c9 --- /dev/null +++ b/fs/ntfs3/attrib.c @@ -0,0 +1,2470 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame? + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/kernel.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * You can set external NTFS_MIN_LOG2_OF_CLUMP/NTFS_MAX_LOG2_OF_CLUMP to manage + * preallocate algorithm. + */ +#ifndef NTFS_MIN_LOG2_OF_CLUMP +#define NTFS_MIN_LOG2_OF_CLUMP 16 +#endif + +#ifndef NTFS_MAX_LOG2_OF_CLUMP +#define NTFS_MAX_LOG2_OF_CLUMP 26 +#endif + +// 16M +#define NTFS_CLUMP_MIN (1 << (NTFS_MIN_LOG2_OF_CLUMP + 8)) +// 16G +#define NTFS_CLUMP_MAX (1ull << (NTFS_MAX_LOG2_OF_CLUMP + 8)) + +static inline u64 get_pre_allocated(u64 size) +{ + u32 clump; + u8 align_shift; + u64 ret; + + if (size <= NTFS_CLUMP_MIN) { + clump = 1 << NTFS_MIN_LOG2_OF_CLUMP; + align_shift = NTFS_MIN_LOG2_OF_CLUMP; + } else if (size >= NTFS_CLUMP_MAX) { + clump = 1 << NTFS_MAX_LOG2_OF_CLUMP; + align_shift = NTFS_MAX_LOG2_OF_CLUMP; + } else { + align_shift = NTFS_MIN_LOG2_OF_CLUMP - 1 + + __ffs(size >> (8 + NTFS_MIN_LOG2_OF_CLUMP)); + clump = 1u << align_shift; + } + + ret = (((size + clump - 1) >> align_shift)) << align_shift; + + return ret; +} + +/* + * attr_must_be_resident + * + * Return: True if attribute must be resident. + */ +static inline bool attr_must_be_resident(struct ntfs_sb_info *sbi, + enum ATTR_TYPE type) +{ + const struct ATTR_DEF_ENTRY *de; + + switch (type) { + case ATTR_STD: + case ATTR_NAME: + case ATTR_ID: + case ATTR_LABEL: + case ATTR_VOL_INFO: + case ATTR_ROOT: + case ATTR_EA_INFO: + return true; + default: + de = ntfs_query_def(sbi, type); + if (de && (de->flags & NTFS_ATTR_MUST_BE_RESIDENT)) + return true; + return false; + } +} + +/* + * attr_load_runs - Load all runs stored in @attr. + */ +static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni, + struct runs_tree *run, const CLST *vcn) +{ + int err; + CLST svcn = le64_to_cpu(attr->nres.svcn); + CLST evcn = le64_to_cpu(attr->nres.evcn); + u32 asize; + u16 run_off; + + if (svcn >= evcn + 1 || run_is_mapped_full(run, svcn, evcn)) + return 0; + + if (vcn && (evcn < *vcn || *vcn < svcn)) + return -EINVAL; + + asize = le32_to_cpu(attr->size); + run_off = le16_to_cpu(attr->nres.run_off); + + if (run_off > asize) + return -EINVAL; + + err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn, + vcn ? *vcn : svcn, Add2Ptr(attr, run_off), + asize - run_off); + if (err < 0) + return err; + + return 0; +} + +/* + * run_deallocate_ex - Deallocate clusters. + */ +static int run_deallocate_ex(struct ntfs_sb_info *sbi, struct runs_tree *run, + CLST vcn, CLST len, CLST *done, bool trim) +{ + int err = 0; + CLST vcn_next, vcn0 = vcn, lcn, clen, dn = 0; + size_t idx; + + if (!len) + goto out; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { +failed: + run_truncate(run, vcn0); + err = -EINVAL; + goto out; + } + + for (;;) { + if (clen > len) + clen = len; + + if (!clen) { + err = -EINVAL; + goto out; + } + + if (lcn != SPARSE_LCN) { + if (sbi) { + /* mark bitmap range [lcn + clen) as free and trim clusters. */ + mark_as_free_ex(sbi, lcn, clen, trim); + } + dn += clen; + } + + len -= clen; + if (!len) + break; + + vcn_next = vcn + clen; + if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || + vcn != vcn_next) { + /* Save memory - don't load entire run. */ + goto failed; + } + } + +out: + if (done) + *done += dn; + + return err; +} + +/* + * attr_allocate_clusters - Find free space, mark it as used and store in @run. + */ +int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, + CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, + enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, + CLST *new_lcn) +{ + int err; + CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0; + size_t cnt = run->count; + + for (;;) { + err = ntfs_look_for_free_space(sbi, lcn, len + pre, &lcn, &flen, + opt); + + if (err == -ENOSPC && pre) { + pre = 0; + if (*pre_alloc) + *pre_alloc = 0; + continue; + } + + if (err) + goto out; + + if (new_lcn && vcn == vcn0) + *new_lcn = lcn; + + /* Add new fragment into run storage. */ + if (!run_add_entry(run, vcn, lcn, flen, opt == ALLOCATE_MFT)) { + /* Undo last 'ntfs_look_for_free_space' */ + mark_as_free_ex(sbi, lcn, len, false); + err = -ENOMEM; + goto out; + } + + vcn += flen; + + if (flen >= len || opt == ALLOCATE_MFT || + (fr && run->count - cnt >= fr)) { + *alen = vcn - vcn0; + return 0; + } + + len -= flen; + } + +out: + /* Undo 'ntfs_look_for_free_space' */ + if (vcn - vcn0) { + run_deallocate_ex(sbi, run, vcn0, vcn - vcn0, NULL, false); + run_truncate(run, vcn0); + } + + return err; +} + +/* + * attr_make_nonresident + * + * If page is not NULL - it is already contains resident data + * and locked (called from ni_write_frame()). + */ +int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, + u64 new_size, struct runs_tree *run, + struct ATTRIB **ins_attr, struct page *page) +{ + struct ntfs_sb_info *sbi; + struct ATTRIB *attr_s; + struct MFT_REC *rec; + u32 used, asize, rsize, aoff, align; + bool is_data; + CLST len, alen; + char *next; + int err; + + if (attr->non_res) { + *ins_attr = attr; + return 0; + } + + sbi = mi->sbi; + rec = mi->mrec; + attr_s = NULL; + used = le32_to_cpu(rec->used); + asize = le32_to_cpu(attr->size); + next = Add2Ptr(attr, asize); + aoff = PtrOffset(rec, attr); + rsize = le32_to_cpu(attr->res.data_size); + is_data = attr->type == ATTR_DATA && !attr->name_len; + + align = sbi->cluster_size; + if (is_attr_compressed(attr)) + align <<= COMPRESSION_UNIT; + len = (rsize + align - 1) >> sbi->cluster_bits; + + run_init(run); + + /* Make a copy of original attribute. */ + attr_s = kmemdup(attr, asize, GFP_NOFS); + if (!attr_s) { + err = -ENOMEM; + goto out; + } + + if (!len) { + /* Empty resident -> Empty nonresident. */ + alen = 0; + } else { + const char *data = resident_data(attr); + + err = attr_allocate_clusters(sbi, run, 0, 0, len, NULL, + ALLOCATE_DEF, &alen, 0, NULL); + if (err) + goto out1; + + if (!rsize) { + /* Empty resident -> Non empty nonresident. */ + } else if (!is_data) { + err = ntfs_sb_write_run(sbi, run, 0, data, rsize, 0); + if (err) + goto out2; + } else if (!page) { + char *kaddr; + + page = grab_cache_page(ni->vfs_inode.i_mapping, 0); + if (!page) { + err = -ENOMEM; + goto out2; + } + kaddr = kmap_atomic(page); + memcpy(kaddr, data, rsize); + memset(kaddr + rsize, 0, PAGE_SIZE - rsize); + kunmap_atomic(kaddr); + flush_dcache_page(page); + SetPageUptodate(page); + set_page_dirty(page); + unlock_page(page); + put_page(page); + } + } + + /* Remove original attribute. */ + used -= asize; + memmove(attr, Add2Ptr(attr, asize), used - aoff); + rec->used = cpu_to_le32(used); + mi->dirty = true; + if (le) + al_remove_le(ni, le); + + err = ni_insert_nonresident(ni, attr_s->type, attr_name(attr_s), + attr_s->name_len, run, 0, alen, + attr_s->flags, &attr, NULL, NULL); + if (err) + goto out3; + + kfree(attr_s); + attr->nres.data_size = cpu_to_le64(rsize); + attr->nres.valid_size = attr->nres.data_size; + + *ins_attr = attr; + + if (is_data) + ni->ni_flags &= ~NI_FLAG_RESIDENT; + + /* Resident attribute becomes non resident. */ + return 0; + +out3: + attr = Add2Ptr(rec, aoff); + memmove(next, attr, used - aoff); + memcpy(attr, attr_s, asize); + rec->used = cpu_to_le32(used + asize); + mi->dirty = true; +out2: + /* Undo: do not trim new allocated clusters. */ + run_deallocate(sbi, run, false); + run_close(run); +out1: + kfree(attr_s); +out: + return err; +} + +/* + * attr_set_size_res - Helper for attr_set_size(). + */ +static int attr_set_size_res(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, + u64 new_size, struct runs_tree *run, + struct ATTRIB **ins_attr) +{ + struct ntfs_sb_info *sbi = mi->sbi; + struct MFT_REC *rec = mi->mrec; + u32 used = le32_to_cpu(rec->used); + u32 asize = le32_to_cpu(attr->size); + u32 aoff = PtrOffset(rec, attr); + u32 rsize = le32_to_cpu(attr->res.data_size); + u32 tail = used - aoff - asize; + char *next = Add2Ptr(attr, asize); + s64 dsize = ALIGN(new_size, 8) - ALIGN(rsize, 8); + + if (dsize < 0) { + memmove(next + dsize, next, tail); + } else if (dsize > 0) { + if (used + dsize > sbi->max_bytes_per_attr) + return attr_make_nonresident(ni, attr, le, mi, new_size, + run, ins_attr, NULL); + + memmove(next + dsize, next, tail); + memset(next, 0, dsize); + } + + if (new_size > rsize) + memset(Add2Ptr(resident_data(attr), rsize), 0, + new_size - rsize); + + rec->used = cpu_to_le32(used + dsize); + attr->size = cpu_to_le32(asize + dsize); + attr->res.data_size = cpu_to_le32(new_size); + mi->dirty = true; + *ins_attr = attr; + + return 0; +} + +/* + * attr_set_size - Change the size of attribute. + * + * Extend: + * - Sparse/compressed: No allocated clusters. + * - Normal: Append allocated and preallocated new clusters. + * Shrink: + * - No deallocate if @keep_prealloc is set. + */ +int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + u64 new_size, const u64 *new_valid, bool keep_prealloc, + struct ATTRIB **ret) +{ + int err = 0; + struct ntfs_sb_info *sbi = ni->mi.sbi; + u8 cluster_bits = sbi->cluster_bits; + bool is_mft = + ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && !name_len; + u64 old_valid, old_size, old_alloc, new_alloc, new_alloc_tmp; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn; + CLST next_svcn, pre_alloc = -1, done = 0; + bool is_ext, is_bad = false; + u32 align; + struct MFT_REC *rec; + +again: + alen = 0; + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL, + &mi_b); + if (!attr_b) { + err = -ENOENT; + goto bad_inode; + } + + if (!attr_b->non_res) { + err = attr_set_size_res(ni, attr_b, le_b, mi_b, new_size, run, + &attr_b); + if (err) + return err; + + /* Return if file is still resident. */ + if (!attr_b->non_res) + goto ok1; + + /* Layout of records may be changed, so do a full search. */ + goto again; + } + + is_ext = is_attr_ext(attr_b); + align = sbi->cluster_size; + if (is_ext) + align <<= attr_b->nres.c_unit; + + old_valid = le64_to_cpu(attr_b->nres.valid_size); + old_size = le64_to_cpu(attr_b->nres.data_size); + old_alloc = le64_to_cpu(attr_b->nres.alloc_size); + +again_1: + old_alen = old_alloc >> cluster_bits; + + new_alloc = (new_size + align - 1) & ~(u64)(align - 1); + new_alen = new_alloc >> cluster_bits; + + if (keep_prealloc && new_size < old_size) { + attr_b->nres.data_size = cpu_to_le64(new_size); + mi_b->dirty = true; + goto ok; + } + + vcn = old_alen - 1; + + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn = le64_to_cpu(attr_b->nres.evcn); + + if (svcn <= vcn && vcn <= evcn) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto bad_inode; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto bad_inode; + } + +next_le_1: + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + } + /* + * Here we have: + * attr,mi,le - last attribute segment (containing 'vcn'). + * attr_b,mi_b,le_b - base (primary) attribute segment. + */ +next_le: + rec = mi->mrec; + err = attr_load_runs(attr, ni, run, NULL); + if (err) + goto out; + + if (new_size > old_size) { + CLST to_allocate; + size_t free; + + if (new_alloc <= old_alloc) { + attr_b->nres.data_size = cpu_to_le64(new_size); + mi_b->dirty = true; + goto ok; + } + + /* + * Add clusters. In simple case we have to: + * - allocate space (vcn, lcn, len) + * - update packed run in 'mi' + * - update attr->nres.evcn + * - update attr_b->nres.data_size/attr_b->nres.alloc_size + */ + to_allocate = new_alen - old_alen; +add_alloc_in_same_attr_seg: + lcn = 0; + if (is_mft) { + /* MFT allocates clusters from MFT zone. */ + pre_alloc = 0; + } else if (is_ext) { + /* No preallocate for sparse/compress. */ + pre_alloc = 0; + } else if (pre_alloc == -1) { + pre_alloc = 0; + if (type == ATTR_DATA && !name_len && + sbi->options->prealloc) { + pre_alloc = + bytes_to_cluster( + sbi, + get_pre_allocated(new_size)) - + new_alen; + } + + /* Get the last LCN to allocate from. */ + if (old_alen && + !run_lookup_entry(run, vcn, &lcn, NULL, NULL)) { + lcn = SPARSE_LCN; + } + + if (lcn == SPARSE_LCN) + lcn = 0; + else if (lcn) + lcn += 1; + + free = wnd_zeroes(&sbi->used.bitmap); + if (to_allocate > free) { + err = -ENOSPC; + goto out; + } + + if (pre_alloc && to_allocate + pre_alloc > free) + pre_alloc = 0; + } + + vcn = old_alen; + + if (is_ext) { + if (!run_add_entry(run, vcn, SPARSE_LCN, to_allocate, + false)) { + err = -ENOMEM; + goto out; + } + alen = to_allocate; + } else { + /* ~3 bytes per fragment. */ + err = attr_allocate_clusters( + sbi, run, vcn, lcn, to_allocate, &pre_alloc, + is_mft ? ALLOCATE_MFT : 0, &alen, + is_mft ? 0 + : (sbi->record_size - + le32_to_cpu(rec->used) + 8) / + 3 + + 1, + NULL); + if (err) + goto out; + } + + done += alen; + vcn += alen; + if (to_allocate > alen) + to_allocate -= alen; + else + to_allocate = 0; + +pack_runs: + err = mi_pack_runs(mi, attr, run, vcn - svcn); + if (err) + goto undo_1; + + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + new_alloc_tmp = (u64)next_svcn << cluster_bits; + attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp); + mi_b->dirty = true; + + if (next_svcn >= vcn && !to_allocate) { + /* Normal way. Update attribute and exit. */ + attr_b->nres.data_size = cpu_to_le64(new_size); + goto ok; + } + + /* At least two MFT to avoid recursive loop. */ + if (is_mft && next_svcn == vcn && + ((u64)done << sbi->cluster_bits) >= 2 * sbi->record_size) { + new_size = new_alloc_tmp; + attr_b->nres.data_size = attr_b->nres.alloc_size; + goto ok; + } + + if (le32_to_cpu(rec->used) < sbi->record_size) { + old_alen = next_svcn; + evcn = old_alen - 1; + goto add_alloc_in_same_attr_seg; + } + + attr_b->nres.data_size = attr_b->nres.alloc_size; + if (new_alloc_tmp < old_valid) + attr_b->nres.valid_size = attr_b->nres.data_size; + + if (type == ATTR_LIST) { + err = ni_expand_list(ni); + if (err) + goto undo_2; + if (next_svcn < vcn) + goto pack_runs; + + /* Layout of records is changed. */ + goto again; + } + + if (!ni->attr_list.size) { + err = ni_create_attr_list(ni); + /* In case of error layout of records is not changed. */ + if (err) + goto undo_2; + /* Layout of records is changed. */ + } + + if (next_svcn >= vcn) { + /* This is MFT data, repeat. */ + goto again; + } + + /* Insert new attribute segment. */ + err = ni_insert_nonresident(ni, type, name, name_len, run, + next_svcn, vcn - next_svcn, + attr_b->flags, &attr, &mi, NULL); + + /* + * Layout of records maybe changed. + * Find base attribute to update. + */ + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, + NULL, &mi_b); + if (!attr_b) { + err = -EINVAL; + goto bad_inode; + } + + if (err) { + /* ni_insert_nonresident failed. */ + attr = NULL; + goto undo_2; + } + + if (!is_mft) + run_truncate_head(run, evcn + 1); + + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + + /* + * Attribute is in consistency state. + * Save this point to restore to if next steps fail. + */ + old_valid = old_size = old_alloc = (u64)vcn << cluster_bits; + attr_b->nres.valid_size = attr_b->nres.data_size = + attr_b->nres.alloc_size = cpu_to_le64(old_size); + mi_b->dirty = true; + goto again_1; + } + + if (new_size != old_size || + (new_alloc != old_alloc && !keep_prealloc)) { + /* + * Truncate clusters. In simple case we have to: + * - update packed run in 'mi' + * - update attr->nres.evcn + * - update attr_b->nres.data_size/attr_b->nres.alloc_size + * - mark and trim clusters as free (vcn, lcn, len) + */ + CLST dlen = 0; + + vcn = max(svcn, new_alen); + new_alloc_tmp = (u64)vcn << cluster_bits; + + if (vcn > svcn) { + err = mi_pack_runs(mi, attr, run, vcn - svcn); + if (err) + goto out; + } else if (le && le->vcn) { + u16 le_sz = le16_to_cpu(le->size); + + /* + * NOTE: List entries for one attribute are always + * the same size. We deal with last entry (vcn==0) + * and it is not first in entries array + * (list entry for std attribute always first). + * So it is safe to step back. + */ + mi_remove_attr(NULL, mi, attr); + + if (!al_remove_le(ni, le)) { + err = -EINVAL; + goto bad_inode; + } + + le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz); + } else { + attr->nres.evcn = cpu_to_le64((u64)vcn - 1); + mi->dirty = true; + } + + attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp); + + if (vcn == new_alen) { + attr_b->nres.data_size = cpu_to_le64(new_size); + if (new_size < old_valid) + attr_b->nres.valid_size = + attr_b->nres.data_size; + } else { + if (new_alloc_tmp <= + le64_to_cpu(attr_b->nres.data_size)) + attr_b->nres.data_size = + attr_b->nres.alloc_size; + if (new_alloc_tmp < + le64_to_cpu(attr_b->nres.valid_size)) + attr_b->nres.valid_size = + attr_b->nres.alloc_size; + } + mi_b->dirty = true; + + err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &dlen, + true); + if (err) + goto out; + + if (is_ext) { + /* dlen - really deallocated clusters. */ + le64_sub_cpu(&attr_b->nres.total_size, + ((u64)dlen << cluster_bits)); + } + + run_truncate(run, vcn); + + if (new_alloc_tmp <= new_alloc) + goto ok; + + old_size = new_alloc_tmp; + vcn = svcn - 1; + + if (le == le_b) { + attr = attr_b; + mi = mi_b; + evcn = svcn - 1; + svcn = 0; + goto next_le; + } + + if (le->type != type || le->name_len != name_len || + memcmp(le_name(le), name, name_len * sizeof(short))) { + err = -EINVAL; + goto bad_inode; + } + + err = ni_load_mi(ni, le, &mi); + if (err) + goto out; + + attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); + if (!attr) { + err = -EINVAL; + goto bad_inode; + } + goto next_le_1; + } + +ok: + if (new_valid) { + __le64 valid = cpu_to_le64(min(*new_valid, new_size)); + + if (attr_b->nres.valid_size != valid) { + attr_b->nres.valid_size = valid; + mi_b->dirty = true; + } + } + +ok1: + if (ret) + *ret = attr_b; + + /* Update inode_set_bytes. */ + if (((type == ATTR_DATA && !name_len) || + (type == ATTR_ALLOC && name == I30_NAME))) { + bool dirty = false; + + if (ni->vfs_inode.i_size != new_size) { + ni->vfs_inode.i_size = new_size; + dirty = true; + } + + if (attr_b->non_res) { + new_alloc = le64_to_cpu(attr_b->nres.alloc_size); + if (inode_get_bytes(&ni->vfs_inode) != new_alloc) { + inode_set_bytes(&ni->vfs_inode, new_alloc); + dirty = true; + } + } + + if (dirty) { + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + mark_inode_dirty(&ni->vfs_inode); + } + } + + return 0; + +undo_2: + vcn -= alen; + attr_b->nres.data_size = cpu_to_le64(old_size); + attr_b->nres.valid_size = cpu_to_le64(old_valid); + attr_b->nres.alloc_size = cpu_to_le64(old_alloc); + + /* Restore 'attr' and 'mi'. */ + if (attr) + goto restore_run; + + if (le64_to_cpu(attr_b->nres.svcn) <= svcn && + svcn <= le64_to_cpu(attr_b->nres.evcn)) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto bad_inode; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, + &svcn, &mi); + if (!attr) + goto bad_inode; + } + +restore_run: + if (mi_pack_runs(mi, attr, run, evcn - svcn + 1)) + is_bad = true; + +undo_1: + run_deallocate_ex(sbi, run, vcn, alen, NULL, false); + + run_truncate(run, vcn); +out: + if (is_bad) { +bad_inode: + _ntfs_bad_inode(&ni->vfs_inode); + } + return err; +} + +int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, + CLST *len, bool *new) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + struct ntfs_sb_info *sbi; + u8 cluster_bits; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end; + u64 total_size; + u32 clst_per_frame; + bool ok; + + if (new) + *new = false; + + down_read(&ni->file.run_lock); + ok = run_lookup_entry(run, vcn, lcn, len, NULL); + up_read(&ni->file.run_lock); + + if (ok && (*lcn != SPARSE_LCN || !new)) { + /* Normal way. */ + return 0; + } + + if (!clen) + clen = 1; + + if (ok && clen > *len) + clen = *len; + + sbi = ni->mi.sbi; + cluster_bits = sbi->cluster_bits; + + ni_lock(ni); + down_write(&ni->file.run_lock); + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); + if (!attr_b) { + err = -ENOENT; + goto out; + } + + if (!attr_b->non_res) { + *lcn = RESIDENT_LCN; + *len = 1; + goto out; + } + + asize = le64_to_cpu(attr_b->nres.alloc_size) >> cluster_bits; + if (vcn >= asize) { + err = -EINVAL; + goto out; + } + + clst_per_frame = 1u << attr_b->nres.c_unit; + to_alloc = (clen + clst_per_frame - 1) & ~(clst_per_frame - 1); + + if (vcn + to_alloc > asize) + to_alloc = asize - vcn; + + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + + attr = attr_b; + le = le_b; + mi = mi_b; + + if (le_b && (vcn < svcn || evcn1 <= vcn)) { + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + err = attr_load_runs(attr, ni, run, NULL); + if (err) + goto out; + + if (!ok) { + ok = run_lookup_entry(run, vcn, lcn, len, NULL); + if (ok && (*lcn != SPARSE_LCN || !new)) { + /* Normal way. */ + err = 0; + goto ok; + } + + if (!ok && !new) { + *len = 0; + err = 0; + goto ok; + } + + if (ok && clen > *len) { + clen = *len; + to_alloc = (clen + clst_per_frame - 1) & + ~(clst_per_frame - 1); + } + } + + if (!is_attr_ext(attr_b)) { + err = -EINVAL; + goto out; + } + + /* Get the last LCN to allocate from. */ + hint = 0; + + if (vcn > evcn1) { + if (!run_add_entry(run, evcn1, SPARSE_LCN, vcn - evcn1, + false)) { + err = -ENOMEM; + goto out; + } + } else if (vcn && !run_lookup_entry(run, vcn - 1, &hint, NULL, NULL)) { + hint = -1; + } + + err = attr_allocate_clusters( + sbi, run, vcn, hint + 1, to_alloc, NULL, 0, len, + (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1, + lcn); + if (err) + goto out; + *new = true; + + end = vcn + *len; + + total_size = le64_to_cpu(attr_b->nres.total_size) + + ((u64)*len << cluster_bits); + +repack: + err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn); + if (err) + goto out; + + attr_b->nres.total_size = cpu_to_le64(total_size); + inode_set_bytes(&ni->vfs_inode, total_size); + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + + mi_b->dirty = true; + mark_inode_dirty(&ni->vfs_inode); + + /* Stored [vcn : next_svcn) from [vcn : end). */ + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + + if (end <= evcn1) { + if (next_svcn == evcn1) { + /* Normal way. Update attribute and exit. */ + goto ok; + } + /* Add new segment [next_svcn : evcn1 - next_svcn). */ + if (!ni->attr_list.size) { + err = ni_create_attr_list(ni); + if (err) + goto out; + /* Layout of records is changed. */ + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, + 0, NULL, &mi_b); + if (!attr_b) { + err = -ENOENT; + goto out; + } + + attr = attr_b; + le = le_b; + mi = mi_b; + goto repack; + } + } + + svcn = evcn1; + + /* Estimate next attribute. */ + attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi); + + if (attr) { + CLST alloc = bytes_to_cluster( + sbi, le64_to_cpu(attr_b->nres.alloc_size)); + CLST evcn = le64_to_cpu(attr->nres.evcn); + + if (end < next_svcn) + end = next_svcn; + while (end > evcn) { + /* Remove segment [svcn : evcn). */ + mi_remove_attr(NULL, mi, attr); + + if (!al_remove_le(ni, le)) { + err = -EINVAL; + goto out; + } + + if (evcn + 1 >= alloc) { + /* Last attribute segment. */ + evcn1 = evcn + 1; + goto ins_ext; + } + + if (ni_load_mi(ni, le, &mi)) { + attr = NULL; + goto out; + } + + attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, + &le->id); + if (!attr) { + err = -EINVAL; + goto out; + } + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + } + + if (end < svcn) + end = svcn; + + err = attr_load_runs(attr, ni, run, &end); + if (err) + goto out; + + evcn1 = evcn + 1; + attr->nres.svcn = cpu_to_le64(next_svcn); + err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn); + if (err) + goto out; + + le->vcn = cpu_to_le64(next_svcn); + ni->attr_list.dirty = true; + mi->dirty = true; + + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + } +ins_ext: + if (evcn1 > next_svcn) { + err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, + next_svcn, evcn1 - next_svcn, + attr_b->flags, &attr, &mi, NULL); + if (err) + goto out; + } +ok: + run_truncate_around(run, vcn); +out: + up_write(&ni->file.run_lock); + ni_unlock(ni); + + return err; +} + +int attr_data_read_resident(struct ntfs_inode *ni, struct page *page) +{ + u64 vbo; + struct ATTRIB *attr; + u32 data_size; + + attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, NULL); + if (!attr) + return -EINVAL; + + if (attr->non_res) + return E_NTFS_NONRESIDENT; + + vbo = page->index << PAGE_SHIFT; + data_size = le32_to_cpu(attr->res.data_size); + if (vbo < data_size) { + const char *data = resident_data(attr); + char *kaddr = kmap_atomic(page); + u32 use = data_size - vbo; + + if (use > PAGE_SIZE) + use = PAGE_SIZE; + + memcpy(kaddr, data + vbo, use); + memset(kaddr + use, 0, PAGE_SIZE - use); + kunmap_atomic(kaddr); + flush_dcache_page(page); + SetPageUptodate(page); + } else if (!PageUptodate(page)) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + } + + return 0; +} + +int attr_data_write_resident(struct ntfs_inode *ni, struct page *page) +{ + u64 vbo; + struct mft_inode *mi; + struct ATTRIB *attr; + u32 data_size; + + attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); + if (!attr) + return -EINVAL; + + if (attr->non_res) { + /* Return special error code to check this case. */ + return E_NTFS_NONRESIDENT; + } + + vbo = page->index << PAGE_SHIFT; + data_size = le32_to_cpu(attr->res.data_size); + if (vbo < data_size) { + char *data = resident_data(attr); + char *kaddr = kmap_atomic(page); + u32 use = data_size - vbo; + + if (use > PAGE_SIZE) + use = PAGE_SIZE; + memcpy(data + vbo, kaddr, use); + kunmap_atomic(kaddr); + mi->dirty = true; + } + ni->i_valid = data_size; + + return 0; +} + +/* + * attr_load_runs_vcn - Load runs with VCN. + */ +int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + CLST vcn) +{ + struct ATTRIB *attr; + int err; + CLST svcn, evcn; + u16 ro; + + if (!ni) { + /* Is record corrupted? */ + return -ENOENT; + } + + attr = ni_find_attr(ni, NULL, NULL, type, name, name_len, &vcn, NULL); + if (!attr) { + /* Is record corrupted? */ + return -ENOENT; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + + if (evcn < vcn || vcn < svcn) { + /* Is record corrupted? */ + return -EINVAL; + } + + ro = le16_to_cpu(attr->nres.run_off); + + if (ro > le32_to_cpu(attr->size)) + return -EINVAL; + + err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn, svcn, + Add2Ptr(attr, ro), le32_to_cpu(attr->size) - ro); + if (err < 0) + return err; + return 0; +} + +/* + * attr_load_runs_range - Load runs for given range [from to). + */ +int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + u64 from, u64 to) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + u8 cluster_bits = sbi->cluster_bits; + CLST vcn; + CLST vcn_last = (to - 1) >> cluster_bits; + CLST lcn, clen; + int err; + + for (vcn = from >> cluster_bits; vcn <= vcn_last; vcn += clen) { + if (!run_lookup_entry(run, vcn, &lcn, &clen, NULL)) { + err = attr_load_runs_vcn(ni, type, name, name_len, run, + vcn); + if (err) + return err; + clen = 0; /* Next run_lookup_entry(vcn) must be success. */ + } + } + + return 0; +} + +#ifdef CONFIG_NTFS3_LZX_XPRESS +/* + * attr_wof_frame_info + * + * Read header of Xpress/LZX file to get info about frame. + */ +int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, + struct runs_tree *run, u64 frame, u64 frames, + u8 frame_bits, u32 *ondisk_size, u64 *vbo_data) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + u64 vbo[2], off[2], wof_size; + u32 voff; + u8 bytes_per_off; + char *addr; + struct page *page; + int i, err; + __le32 *off32; + __le64 *off64; + + if (ni->vfs_inode.i_size < 0x100000000ull) { + /* File starts with array of 32 bit offsets. */ + bytes_per_off = sizeof(__le32); + vbo[1] = frame << 2; + *vbo_data = frames << 2; + } else { + /* File starts with array of 64 bit offsets. */ + bytes_per_off = sizeof(__le64); + vbo[1] = frame << 3; + *vbo_data = frames << 3; + } + + /* + * Read 4/8 bytes at [vbo - 4(8)] == offset where compressed frame starts. + * Read 4/8 bytes at [vbo] == offset where compressed frame ends. + */ + if (!attr->non_res) { + if (vbo[1] + bytes_per_off > le32_to_cpu(attr->res.data_size)) { + ntfs_inode_err(&ni->vfs_inode, "is corrupted"); + return -EINVAL; + } + addr = resident_data(attr); + + if (bytes_per_off == sizeof(__le32)) { + off32 = Add2Ptr(addr, vbo[1]); + off[0] = vbo[1] ? le32_to_cpu(off32[-1]) : 0; + off[1] = le32_to_cpu(off32[0]); + } else { + off64 = Add2Ptr(addr, vbo[1]); + off[0] = vbo[1] ? le64_to_cpu(off64[-1]) : 0; + off[1] = le64_to_cpu(off64[0]); + } + + *vbo_data += off[0]; + *ondisk_size = off[1] - off[0]; + return 0; + } + + wof_size = le64_to_cpu(attr->nres.data_size); + down_write(&ni->file.run_lock); + page = ni->file.offs_page; + if (!page) { + page = alloc_page(GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + page->index = -1; + ni->file.offs_page = page; + } + lock_page(page); + addr = page_address(page); + + if (vbo[1]) { + voff = vbo[1] & (PAGE_SIZE - 1); + vbo[0] = vbo[1] - bytes_per_off; + i = 0; + } else { + voff = 0; + vbo[0] = 0; + off[0] = 0; + i = 1; + } + + do { + pgoff_t index = vbo[i] >> PAGE_SHIFT; + + if (index != page->index) { + u64 from = vbo[i] & ~(u64)(PAGE_SIZE - 1); + u64 to = min(from + PAGE_SIZE, wof_size); + + err = attr_load_runs_range(ni, ATTR_DATA, WOF_NAME, + ARRAY_SIZE(WOF_NAME), run, + from, to); + if (err) + goto out1; + + err = ntfs_bio_pages(sbi, run, &page, 1, from, + to - from, REQ_OP_READ); + if (err) { + page->index = -1; + goto out1; + } + page->index = index; + } + + if (i) { + if (bytes_per_off == sizeof(__le32)) { + off32 = Add2Ptr(addr, voff); + off[1] = le32_to_cpu(*off32); + } else { + off64 = Add2Ptr(addr, voff); + off[1] = le64_to_cpu(*off64); + } + } else if (!voff) { + if (bytes_per_off == sizeof(__le32)) { + off32 = Add2Ptr(addr, PAGE_SIZE - sizeof(u32)); + off[0] = le32_to_cpu(*off32); + } else { + off64 = Add2Ptr(addr, PAGE_SIZE - sizeof(u64)); + off[0] = le64_to_cpu(*off64); + } + } else { + /* Two values in one page. */ + if (bytes_per_off == sizeof(__le32)) { + off32 = Add2Ptr(addr, voff); + off[0] = le32_to_cpu(off32[-1]); + off[1] = le32_to_cpu(off32[0]); + } else { + off64 = Add2Ptr(addr, voff); + off[0] = le64_to_cpu(off64[-1]); + off[1] = le64_to_cpu(off64[0]); + } + break; + } + } while (++i < 2); + + *vbo_data += off[0]; + *ondisk_size = off[1] - off[0]; + +out1: + unlock_page(page); +out: + up_write(&ni->file.run_lock); + return err; +} +#endif + +/* + * attr_is_frame_compressed - Used to detect compressed frame. + */ +int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, + CLST frame, CLST *clst_data) +{ + int err; + u32 clst_frame; + CLST clen, lcn, vcn, alen, slen, vcn_next; + size_t idx; + struct runs_tree *run; + + *clst_data = 0; + + if (!is_attr_compressed(attr)) + return 0; + + if (!attr->non_res) + return 0; + + clst_frame = 1u << attr->nres.c_unit; + vcn = frame * clst_frame; + run = &ni->file.run; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { + err = attr_load_runs_vcn(ni, attr->type, attr_name(attr), + attr->name_len, run, vcn); + if (err) + return err; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) + return -EINVAL; + } + + if (lcn == SPARSE_LCN) { + /* Sparsed frame. */ + return 0; + } + + if (clen >= clst_frame) { + /* + * The frame is not compressed 'cause + * it does not contain any sparse clusters. + */ + *clst_data = clst_frame; + return 0; + } + + alen = bytes_to_cluster(ni->mi.sbi, le64_to_cpu(attr->nres.alloc_size)); + slen = 0; + *clst_data = clen; + + /* + * The frame is compressed if *clst_data + slen >= clst_frame. + * Check next fragments. + */ + while ((vcn += clen) < alen) { + vcn_next = vcn; + + if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || + vcn_next != vcn) { + err = attr_load_runs_vcn(ni, attr->type, + attr_name(attr), + attr->name_len, run, vcn_next); + if (err) + return err; + vcn = vcn_next; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) + return -EINVAL; + } + + if (lcn == SPARSE_LCN) { + slen += clen; + } else { + if (slen) { + /* + * Data_clusters + sparse_clusters = + * not enough for frame. + */ + return -EINVAL; + } + *clst_data += clen; + } + + if (*clst_data + slen >= clst_frame) { + if (!slen) { + /* + * There is no sparsed clusters in this frame + * so it is not compressed. + */ + *clst_data = clst_frame; + } else { + /* Frame is compressed. */ + } + break; + } + } + + return 0; +} + +/* + * attr_allocate_frame - Allocate/free clusters for @frame. + * + * Assumed: down_write(&ni->file.run_lock); + */ +int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, + u64 new_valid) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST svcn, evcn1, next_svcn, lcn, len; + CLST vcn, end, clst_data; + u64 total_size, valid_size, data_size; + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); + if (!attr_b) + return -ENOENT; + + if (!is_attr_ext(attr_b)) + return -EINVAL; + + vcn = frame << NTFS_LZNT_CUNIT; + total_size = le64_to_cpu(attr_b->nres.total_size); + + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + data_size = le64_to_cpu(attr_b->nres.data_size); + + if (svcn <= vcn && vcn < evcn1) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto out; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + err = attr_load_runs(attr, ni, run, NULL); + if (err) + goto out; + + err = attr_is_frame_compressed(ni, attr_b, frame, &clst_data); + if (err) + goto out; + + total_size -= (u64)clst_data << sbi->cluster_bits; + + len = bytes_to_cluster(sbi, compr_size); + + if (len == clst_data) + goto out; + + if (len < clst_data) { + err = run_deallocate_ex(sbi, run, vcn + len, clst_data - len, + NULL, true); + if (err) + goto out; + + if (!run_add_entry(run, vcn + len, SPARSE_LCN, clst_data - len, + false)) { + err = -ENOMEM; + goto out; + } + end = vcn + clst_data; + /* Run contains updated range [vcn + len : end). */ + } else { + CLST alen, hint = 0; + /* Get the last LCN to allocate from. */ + if (vcn + clst_data && + !run_lookup_entry(run, vcn + clst_data - 1, &hint, NULL, + NULL)) { + hint = -1; + } + + err = attr_allocate_clusters(sbi, run, vcn + clst_data, + hint + 1, len - clst_data, NULL, 0, + &alen, 0, &lcn); + if (err) + goto out; + + end = vcn + len; + /* Run contains updated range [vcn + clst_data : end). */ + } + + total_size += (u64)len << sbi->cluster_bits; + +repack: + err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn); + if (err) + goto out; + + attr_b->nres.total_size = cpu_to_le64(total_size); + inode_set_bytes(&ni->vfs_inode, total_size); + + mi_b->dirty = true; + mark_inode_dirty(&ni->vfs_inode); + + /* Stored [vcn : next_svcn) from [vcn : end). */ + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + + if (end <= evcn1) { + if (next_svcn == evcn1) { + /* Normal way. Update attribute and exit. */ + goto ok; + } + /* Add new segment [next_svcn : evcn1 - next_svcn). */ + if (!ni->attr_list.size) { + err = ni_create_attr_list(ni); + if (err) + goto out; + /* Layout of records is changed. */ + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, + 0, NULL, &mi_b); + if (!attr_b) + return -ENOENT; + + attr = attr_b; + le = le_b; + mi = mi_b; + goto repack; + } + } + + svcn = evcn1; + + /* Estimate next attribute. */ + attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi); + + if (attr) { + CLST alloc = bytes_to_cluster( + sbi, le64_to_cpu(attr_b->nres.alloc_size)); + CLST evcn = le64_to_cpu(attr->nres.evcn); + + if (end < next_svcn) + end = next_svcn; + while (end > evcn) { + /* Remove segment [svcn : evcn). */ + mi_remove_attr(NULL, mi, attr); + + if (!al_remove_le(ni, le)) { + err = -EINVAL; + goto out; + } + + if (evcn + 1 >= alloc) { + /* Last attribute segment. */ + evcn1 = evcn + 1; + goto ins_ext; + } + + if (ni_load_mi(ni, le, &mi)) { + attr = NULL; + goto out; + } + + attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, + &le->id); + if (!attr) { + err = -EINVAL; + goto out; + } + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + } + + if (end < svcn) + end = svcn; + + err = attr_load_runs(attr, ni, run, &end); + if (err) + goto out; + + evcn1 = evcn + 1; + attr->nres.svcn = cpu_to_le64(next_svcn); + err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn); + if (err) + goto out; + + le->vcn = cpu_to_le64(next_svcn); + ni->attr_list.dirty = true; + mi->dirty = true; + + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + } +ins_ext: + if (evcn1 > next_svcn) { + err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, + next_svcn, evcn1 - next_svcn, + attr_b->flags, &attr, &mi, NULL); + if (err) + goto out; + } +ok: + run_truncate_around(run, vcn); +out: + if (new_valid > data_size) + new_valid = data_size; + + valid_size = le64_to_cpu(attr_b->nres.valid_size); + if (new_valid != valid_size) { + attr_b->nres.valid_size = cpu_to_le64(valid_size); + mi_b->dirty = true; + } + + return err; +} + +/* + * attr_collapse_range - Collapse range in file. + */ +int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST svcn, evcn1, len, dealloc, alen; + CLST vcn, end; + u64 valid_size, data_size, alloc_size, total_size; + u32 mask; + __le16 a_flags; + + if (!bytes) + return 0; + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); + if (!attr_b) + return -ENOENT; + + if (!attr_b->non_res) { + /* Attribute is resident. Nothing to do? */ + return 0; + } + + data_size = le64_to_cpu(attr_b->nres.data_size); + alloc_size = le64_to_cpu(attr_b->nres.alloc_size); + a_flags = attr_b->flags; + + if (is_attr_ext(attr_b)) { + total_size = le64_to_cpu(attr_b->nres.total_size); + mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; + } else { + total_size = alloc_size; + mask = sbi->cluster_mask; + } + + if ((vbo & mask) || (bytes & mask)) { + /* Allow to collapse only cluster aligned ranges. */ + return -EINVAL; + } + + if (vbo > data_size) + return -EINVAL; + + down_write(&ni->file.run_lock); + + if (vbo + bytes >= data_size) { + u64 new_valid = min(ni->i_valid, vbo); + + /* Simple truncate file at 'vbo'. */ + truncate_setsize(&ni->vfs_inode, vbo); + err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, vbo, + &new_valid, true, NULL); + + if (!err && new_valid < ni->i_valid) + ni->i_valid = new_valid; + + goto out; + } + + /* + * Enumerate all attribute segments and collapse. + */ + alen = alloc_size >> sbi->cluster_bits; + vcn = vbo >> sbi->cluster_bits; + len = bytes >> sbi->cluster_bits; + end = vcn + len; + dealloc = 0; + + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + + if (svcn <= vcn && vcn < evcn1) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto out; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + for (;;) { + if (svcn >= end) { + /* Shift VCN- */ + attr->nres.svcn = cpu_to_le64(svcn - len); + attr->nres.evcn = cpu_to_le64(evcn1 - 1 - len); + if (le) { + le->vcn = attr->nres.svcn; + ni->attr_list.dirty = true; + } + mi->dirty = true; + } else if (svcn < vcn || end < evcn1) { + CLST vcn1, eat, next_svcn; + + /* Collapse a part of this attribute segment. */ + err = attr_load_runs(attr, ni, run, &svcn); + if (err) + goto out; + vcn1 = max(vcn, svcn); + eat = min(end, evcn1) - vcn1; + + err = run_deallocate_ex(sbi, run, vcn1, eat, &dealloc, + true); + if (err) + goto out; + + if (!run_collapse_range(run, vcn1, eat)) { + err = -ENOMEM; + goto out; + } + + if (svcn >= vcn) { + /* Shift VCN */ + attr->nres.svcn = cpu_to_le64(vcn); + if (le) { + le->vcn = attr->nres.svcn; + ni->attr_list.dirty = true; + } + } + + err = mi_pack_runs(mi, attr, run, evcn1 - svcn - eat); + if (err) + goto out; + + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + if (next_svcn + eat < evcn1) { + err = ni_insert_nonresident( + ni, ATTR_DATA, NULL, 0, run, next_svcn, + evcn1 - eat - next_svcn, a_flags, &attr, + &mi, &le); + if (err) + goto out; + + /* Layout of records maybe changed. */ + attr_b = NULL; + } + + /* Free all allocated memory. */ + run_truncate(run, 0); + } else { + u16 le_sz; + u16 roff = le16_to_cpu(attr->nres.run_off); + + if (roff > le32_to_cpu(attr->size)) { + err = -EINVAL; + goto out; + } + + run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, + evcn1 - 1, svcn, Add2Ptr(attr, roff), + le32_to_cpu(attr->size) - roff); + + /* Delete this attribute segment. */ + mi_remove_attr(NULL, mi, attr); + if (!le) + break; + + le_sz = le16_to_cpu(le->size); + if (!al_remove_le(ni, le)) { + err = -EINVAL; + goto out; + } + + if (evcn1 >= alen) + break; + + if (!svcn) { + /* Load next record that contains this attribute. */ + if (ni_load_mi(ni, le, &mi)) { + err = -EINVAL; + goto out; + } + + /* Look for required attribute. */ + attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, + 0, &le->id); + if (!attr) { + err = -EINVAL; + goto out; + } + goto next_attr; + } + le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz); + } + + if (evcn1 >= alen) + break; + + attr = ni_enum_attr_ex(ni, attr, &le, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + +next_attr: + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + if (!attr_b) { + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, + &mi_b); + if (!attr_b) { + err = -ENOENT; + goto out; + } + } + + data_size -= bytes; + valid_size = ni->i_valid; + if (vbo + bytes <= valid_size) + valid_size -= bytes; + else if (vbo < valid_size) + valid_size = vbo; + + attr_b->nres.alloc_size = cpu_to_le64(alloc_size - bytes); + attr_b->nres.data_size = cpu_to_le64(data_size); + attr_b->nres.valid_size = cpu_to_le64(min(valid_size, data_size)); + total_size -= (u64)dealloc << sbi->cluster_bits; + if (is_attr_ext(attr_b)) + attr_b->nres.total_size = cpu_to_le64(total_size); + mi_b->dirty = true; + + /* Update inode size. */ + ni->i_valid = valid_size; + ni->vfs_inode.i_size = data_size; + inode_set_bytes(&ni->vfs_inode, total_size); + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + mark_inode_dirty(&ni->vfs_inode); + +out: + up_write(&ni->file.run_lock); + if (err) + _ntfs_bad_inode(&ni->vfs_inode); + + return err; +} + +/* + * attr_punch_hole + * + * Not for normal files. + */ +int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST svcn, evcn1, vcn, len, end, alen, hole, next_svcn; + u64 total_size, alloc_size; + u32 mask; + __le16 a_flags; + struct runs_tree run2; + + if (!bytes) + return 0; + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); + if (!attr_b) + return -ENOENT; + + if (!attr_b->non_res) { + u32 data_size = le32_to_cpu(attr_b->res.data_size); + u32 from, to; + + if (vbo > data_size) + return 0; + + from = vbo; + to = min_t(u64, vbo + bytes, data_size); + memset(Add2Ptr(resident_data(attr_b), from), 0, to - from); + return 0; + } + + if (!is_attr_ext(attr_b)) + return -EOPNOTSUPP; + + alloc_size = le64_to_cpu(attr_b->nres.alloc_size); + total_size = le64_to_cpu(attr_b->nres.total_size); + + if (vbo >= alloc_size) { + /* NOTE: It is allowed. */ + return 0; + } + + mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; + + bytes += vbo; + if (bytes > alloc_size) + bytes = alloc_size; + bytes -= vbo; + + if ((vbo & mask) || (bytes & mask)) { + /* We have to zero a range(s). */ + if (frame_size == NULL) { + /* Caller insists range is aligned. */ + return -EINVAL; + } + *frame_size = mask + 1; + return E_NTFS_NOTALIGNED; + } + + down_write(&ni->file.run_lock); + run_init(&run2); + run_truncate(run, 0); + + /* + * Enumerate all attribute segments and punch hole where necessary. + */ + alen = alloc_size >> sbi->cluster_bits; + vcn = vbo >> sbi->cluster_bits; + len = bytes >> sbi->cluster_bits; + end = vcn + len; + hole = 0; + + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + a_flags = attr_b->flags; + + if (svcn <= vcn && vcn < evcn1) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto bad_inode; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto bad_inode; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + while (svcn < end) { + CLST vcn1, zero, hole2 = hole; + + err = attr_load_runs(attr, ni, run, &svcn); + if (err) + goto done; + vcn1 = max(vcn, svcn); + zero = min(end, evcn1) - vcn1; + + /* + * Check range [vcn1 + zero). + * Calculate how many clusters there are. + * Don't do any destructive actions. + */ + err = run_deallocate_ex(NULL, run, vcn1, zero, &hole2, false); + if (err) + goto done; + + /* Check if required range is already hole. */ + if (hole2 == hole) + goto next_attr; + + /* Make a clone of run to undo. */ + err = run_clone(run, &run2); + if (err) + goto done; + + /* Make a hole range (sparse) [vcn1 + zero). */ + if (!run_add_entry(run, vcn1, SPARSE_LCN, zero, false)) { + err = -ENOMEM; + goto done; + } + + /* Update run in attribute segment. */ + err = mi_pack_runs(mi, attr, run, evcn1 - svcn); + if (err) + goto done; + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + if (next_svcn < evcn1) { + /* Insert new attribute segment. */ + err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, + next_svcn, + evcn1 - next_svcn, a_flags, + &attr, &mi, &le); + if (err) + goto undo_punch; + + /* Layout of records maybe changed. */ + attr_b = NULL; + } + + /* Real deallocate. Should not fail. */ + run_deallocate_ex(sbi, &run2, vcn1, zero, &hole, true); + +next_attr: + /* Free all allocated memory. */ + run_truncate(run, 0); + + if (evcn1 >= alen) + break; + + /* Get next attribute segment. */ + attr = ni_enum_attr_ex(ni, attr, &le, &mi); + if (!attr) { + err = -EINVAL; + goto bad_inode; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + +done: + if (!hole) + goto out; + + if (!attr_b) { + attr_b = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, + &mi_b); + if (!attr_b) { + err = -EINVAL; + goto bad_inode; + } + } + + total_size -= (u64)hole << sbi->cluster_bits; + attr_b->nres.total_size = cpu_to_le64(total_size); + mi_b->dirty = true; + + /* Update inode size. */ + inode_set_bytes(&ni->vfs_inode, total_size); + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + mark_inode_dirty(&ni->vfs_inode); + +out: + run_close(&run2); + up_write(&ni->file.run_lock); + return err; + +bad_inode: + _ntfs_bad_inode(&ni->vfs_inode); + goto out; + +undo_punch: + /* + * Restore packed runs. + * 'mi_pack_runs' should not fail, cause we restore original. + */ + if (mi_pack_runs(mi, attr, &run2, evcn1 - svcn)) + goto bad_inode; + + goto done; +} + +/* + * attr_insert_range - Insert range (hole) in file. + * Not for normal files. + */ +int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST vcn, svcn, evcn1, len, next_svcn; + u64 data_size, alloc_size; + u32 mask; + __le16 a_flags; + + if (!bytes) + return 0; + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); + if (!attr_b) + return -ENOENT; + + if (!is_attr_ext(attr_b)) { + /* It was checked above. See fallocate. */ + return -EOPNOTSUPP; + } + + if (!attr_b->non_res) { + data_size = le32_to_cpu(attr_b->res.data_size); + alloc_size = data_size; + mask = sbi->cluster_mask; /* cluster_size - 1 */ + } else { + data_size = le64_to_cpu(attr_b->nres.data_size); + alloc_size = le64_to_cpu(attr_b->nres.alloc_size); + mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; + } + + if (vbo > data_size) { + /* Insert range after the file size is not allowed. */ + return -EINVAL; + } + + if ((vbo & mask) || (bytes & mask)) { + /* Allow to insert only frame aligned ranges. */ + return -EINVAL; + } + + /* + * valid_size <= data_size <= alloc_size + * Check alloc_size for maximum possible. + */ + if (bytes > sbi->maxbytes_sparse - alloc_size) + return -EFBIG; + + vcn = vbo >> sbi->cluster_bits; + len = bytes >> sbi->cluster_bits; + + down_write(&ni->file.run_lock); + + if (!attr_b->non_res) { + err = attr_set_size(ni, ATTR_DATA, NULL, 0, run, + data_size + bytes, NULL, false, NULL); + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, + &mi_b); + if (!attr_b) { + err = -EINVAL; + goto bad_inode; + } + + if (err) + goto out; + + if (!attr_b->non_res) { + /* Still resident. */ + char *data = Add2Ptr(attr_b, attr_b->res.data_off); + + memmove(data + bytes, data, bytes); + memset(data, 0, bytes); + goto done; + } + + /* Resident files becomes nonresident. */ + data_size = le64_to_cpu(attr_b->nres.data_size); + alloc_size = le64_to_cpu(attr_b->nres.alloc_size); + } + + /* + * Enumerate all attribute segments and shift start vcn. + */ + a_flags = attr_b->flags; + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + + if (svcn <= vcn && vcn < evcn1) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto bad_inode; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto bad_inode; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + run_truncate(run, 0); /* clear cached values. */ + err = attr_load_runs(attr, ni, run, NULL); + if (err) + goto out; + + if (!run_insert_range(run, vcn, len)) { + err = -ENOMEM; + goto out; + } + + /* Try to pack in current record as much as possible. */ + err = mi_pack_runs(mi, attr, run, evcn1 + len - svcn); + if (err) + goto out; + + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + + while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) && + attr->type == ATTR_DATA && !attr->name_len) { + le64_add_cpu(&attr->nres.svcn, len); + le64_add_cpu(&attr->nres.evcn, len); + if (le) { + le->vcn = attr->nres.svcn; + ni->attr_list.dirty = true; + } + mi->dirty = true; + } + + if (next_svcn < evcn1 + len) { + err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, + next_svcn, evcn1 + len - next_svcn, + a_flags, NULL, NULL, NULL); + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, + &mi_b); + if (!attr_b) { + err = -EINVAL; + goto bad_inode; + } + + if (err) { + /* ni_insert_nonresident failed. Try to undo. */ + goto undo_insert_range; + } + } + + /* + * Update primary attribute segment. + */ + if (vbo <= ni->i_valid) + ni->i_valid += bytes; + + attr_b->nres.data_size = le64_to_cpu(data_size + bytes); + attr_b->nres.alloc_size = le64_to_cpu(alloc_size + bytes); + + /* ni->valid may be not equal valid_size (temporary). */ + if (ni->i_valid > data_size + bytes) + attr_b->nres.valid_size = attr_b->nres.data_size; + else + attr_b->nres.valid_size = cpu_to_le64(ni->i_valid); + mi_b->dirty = true; + +done: + ni->vfs_inode.i_size += bytes; + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + mark_inode_dirty(&ni->vfs_inode); + +out: + run_truncate(run, 0); /* clear cached values. */ + + up_write(&ni->file.run_lock); + + return err; + +bad_inode: + _ntfs_bad_inode(&ni->vfs_inode); + goto out; + +undo_insert_range: + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + + if (svcn <= vcn && vcn < evcn1) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + goto bad_inode; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + goto bad_inode; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + if (attr_load_runs(attr, ni, run, NULL)) + goto bad_inode; + + if (!run_collapse_range(run, vcn, len)) + goto bad_inode; + + if (mi_pack_runs(mi, attr, run, evcn1 + len - svcn)) + goto bad_inode; + + while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) && + attr->type == ATTR_DATA && !attr->name_len) { + le64_sub_cpu(&attr->nres.svcn, len); + le64_sub_cpu(&attr->nres.evcn, len); + if (le) { + le->vcn = attr->nres.svcn; + ni->attr_list.dirty = true; + } + mi->dirty = true; + } + + goto out; +} diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c new file mode 100644 index 000000000..0c6a68e71 --- /dev/null +++ b/fs/ntfs3/attrlist.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/fs.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * al_is_valid_le + * + * Return: True if @le is valid. + */ +static inline bool al_is_valid_le(const struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le) +{ + if (!le || !ni->attr_list.le || !ni->attr_list.size) + return false; + + return PtrOffset(ni->attr_list.le, le) + le16_to_cpu(le->size) <= + ni->attr_list.size; +} + +void al_destroy(struct ntfs_inode *ni) +{ + run_close(&ni->attr_list.run); + kfree(ni->attr_list.le); + ni->attr_list.le = NULL; + ni->attr_list.size = 0; + ni->attr_list.dirty = false; +} + +/* + * ntfs_load_attr_list + * + * This method makes sure that the ATTRIB list, if present, + * has been properly set up. + */ +int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) +{ + int err; + size_t lsize; + void *le = NULL; + + if (ni->attr_list.size) + return 0; + + if (!attr->non_res) { + lsize = le32_to_cpu(attr->res.data_size); + /* attr is resident: lsize < record_size (1K or 4K) */ + le = kvmalloc(al_aligned(lsize), GFP_KERNEL); + if (!le) { + err = -ENOMEM; + goto out; + } + memcpy(le, resident_data(attr), lsize); + } else if (attr->nres.svcn) { + err = -EINVAL; + goto out; + } else { + u16 run_off = le16_to_cpu(attr->nres.run_off); + + lsize = le64_to_cpu(attr->nres.data_size); + + run_init(&ni->attr_list.run); + + if (run_off > le32_to_cpu(attr->size)) { + err = -EINVAL; + goto out; + } + + err = run_unpack_ex(&ni->attr_list.run, ni->mi.sbi, ni->mi.rno, + 0, le64_to_cpu(attr->nres.evcn), 0, + Add2Ptr(attr, run_off), + le32_to_cpu(attr->size) - run_off); + if (err < 0) + goto out; + + /* attr is nonresident. + * The worst case: + * 1T (2^40) extremely fragmented file. + * cluster = 4K (2^12) => 2^28 fragments + * 2^9 fragments per one record => 2^19 records + * 2^5 bytes of ATTR_LIST_ENTRY per one record => 2^24 bytes. + * + * the result is 16M bytes per attribute list. + * Use kvmalloc to allocate in range [several Kbytes - dozen Mbytes] + */ + le = kvmalloc(al_aligned(lsize), GFP_KERNEL); + if (!le) { + err = -ENOMEM; + goto out; + } + + err = ntfs_read_run_nb(ni->mi.sbi, &ni->attr_list.run, 0, le, + lsize, NULL); + if (err) + goto out; + } + + ni->attr_list.size = lsize; + ni->attr_list.le = le; + + return 0; + +out: + ni->attr_list.le = le; + al_destroy(ni); + + return err; +} + +/* + * al_enumerate + * + * Return: + * * The next list le. + * * If @le is NULL then return the first le. + */ +struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le) +{ + size_t off; + u16 sz; + + if (!le) { + le = ni->attr_list.le; + } else { + sz = le16_to_cpu(le->size); + if (sz < sizeof(struct ATTR_LIST_ENTRY)) { + /* Impossible 'cause we should not return such le. */ + return NULL; + } + le = Add2Ptr(le, sz); + } + + /* Check boundary. */ + off = PtrOffset(ni->attr_list.le, le); + if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) { + /* The regular end of list. */ + return NULL; + } + + sz = le16_to_cpu(le->size); + + /* Check le for errors. */ + if (sz < sizeof(struct ATTR_LIST_ENTRY) || + off + sz > ni->attr_list.size || + sz < le->name_off + le->name_len * sizeof(short)) { + return NULL; + } + + return le; +} + +/* + * al_find_le + * + * Find the first le in the list which matches type, name and VCN. + * + * Return: NULL if not found. + */ +struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le, + const struct ATTRIB *attr) +{ + CLST svcn = attr_svcn(attr); + + return al_find_ex(ni, le, attr->type, attr_name(attr), attr->name_len, + &svcn); +} + +/* + * al_find_ex + * + * Find the first le in the list which matches type, name and VCN. + * + * Return: NULL if not found. + */ +struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, const CLST *vcn) +{ + struct ATTR_LIST_ENTRY *ret = NULL; + u32 type_in = le32_to_cpu(type); + + while ((le = al_enumerate(ni, le))) { + u64 le_vcn; + int diff = le32_to_cpu(le->type) - type_in; + + /* List entries are sorted by type, name and VCN. */ + if (diff < 0) + continue; + + if (diff > 0) + return ret; + + if (le->name_len != name_len) + continue; + + le_vcn = le64_to_cpu(le->vcn); + if (!le_vcn) { + /* + * Compare entry names only for entry with vcn == 0. + */ + diff = ntfs_cmp_names(le_name(le), name_len, name, + name_len, ni->mi.sbi->upcase, + true); + if (diff < 0) + continue; + + if (diff > 0) + return ret; + } + + if (!vcn) + return le; + + if (*vcn == le_vcn) + return le; + + if (*vcn < le_vcn) + return ret; + + ret = le; + } + + return ret; +} + +/* + * al_find_le_to_insert + * + * Find the first list entry which matches type, name and VCN. + */ +static struct ATTR_LIST_ENTRY *al_find_le_to_insert(struct ntfs_inode *ni, + enum ATTR_TYPE type, + const __le16 *name, + u8 name_len, CLST vcn) +{ + struct ATTR_LIST_ENTRY *le = NULL, *prev; + u32 type_in = le32_to_cpu(type); + + /* List entries are sorted by type, name and VCN. */ + while ((le = al_enumerate(ni, prev = le))) { + int diff = le32_to_cpu(le->type) - type_in; + + if (diff < 0) + continue; + + if (diff > 0) + return le; + + if (!le->vcn) { + /* + * Compare entry names only for entry with vcn == 0. + */ + diff = ntfs_cmp_names(le_name(le), le->name_len, name, + name_len, ni->mi.sbi->upcase, + true); + if (diff < 0) + continue; + + if (diff > 0) + return le; + } + + if (le64_to_cpu(le->vcn) >= vcn) + return le; + } + + return prev ? Add2Ptr(prev, le16_to_cpu(prev->size)) : ni->attr_list.le; +} + +/* + * al_add_le + * + * Add an "attribute list entry" to the list. + */ +int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, + u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref, + struct ATTR_LIST_ENTRY **new_le) +{ + int err; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + size_t off; + u16 sz; + size_t asize, new_asize, old_size; + u64 new_size; + typeof(ni->attr_list) *al = &ni->attr_list; + + /* + * Compute the size of the new 'le' + */ + sz = le_size(name_len); + old_size = al->size; + new_size = old_size + sz; + asize = al_aligned(old_size); + new_asize = al_aligned(new_size); + + /* Scan forward to the point at which the new 'le' should be inserted. */ + le = al_find_le_to_insert(ni, type, name, name_len, svcn); + off = PtrOffset(al->le, le); + + if (new_size > asize) { + void *ptr = kmalloc(new_asize, GFP_NOFS); + + if (!ptr) + return -ENOMEM; + + memcpy(ptr, al->le, off); + memcpy(Add2Ptr(ptr, off + sz), le, old_size - off); + le = Add2Ptr(ptr, off); + kfree(al->le); + al->le = ptr; + } else { + memmove(Add2Ptr(le, sz), le, old_size - off); + } + *new_le = le; + + al->size = new_size; + + le->type = type; + le->size = cpu_to_le16(sz); + le->name_len = name_len; + le->name_off = offsetof(struct ATTR_LIST_ENTRY, name); + le->vcn = cpu_to_le64(svcn); + le->ref = *ref; + le->id = id; + memcpy(le->name, name, sizeof(short) * name_len); + + err = attr_set_size(ni, ATTR_LIST, NULL, 0, &al->run, new_size, + &new_size, true, &attr); + if (err) { + /* Undo memmove above. */ + memmove(le, Add2Ptr(le, sz), old_size - off); + al->size = old_size; + return err; + } + + al->dirty = true; + + if (attr && attr->non_res) { + err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, + al->size, 0); + if (err) + return err; + al->dirty = false; + } + + return 0; +} + +/* + * al_remove_le - Remove @le from attribute list. + */ +bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le) +{ + u16 size; + size_t off; + typeof(ni->attr_list) *al = &ni->attr_list; + + if (!al_is_valid_le(ni, le)) + return false; + + /* Save on stack the size of 'le' */ + size = le16_to_cpu(le->size); + off = PtrOffset(al->le, le); + + memmove(le, Add2Ptr(le, size), al->size - (off + size)); + + al->size -= size; + al->dirty = true; + + return true; +} + +/* + * al_delete_le - Delete first le from the list which matches its parameters. + */ +bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, + const __le16 *name, size_t name_len, + const struct MFT_REF *ref) +{ + u16 size; + struct ATTR_LIST_ENTRY *le; + size_t off; + typeof(ni->attr_list) *al = &ni->attr_list; + + /* Scan forward to the first le that matches the input. */ + le = al_find_ex(ni, NULL, type, name, name_len, &vcn); + if (!le) + return false; + + off = PtrOffset(al->le, le); + +next: + if (off >= al->size) + return false; + if (le->type != type) + return false; + if (le->name_len != name_len) + return false; + if (name_len && ntfs_cmp_names(le_name(le), name_len, name, name_len, + ni->mi.sbi->upcase, true)) + return false; + if (le64_to_cpu(le->vcn) != vcn) + return false; + + /* + * The caller specified a segment reference, so we have to + * scan through the matching entries until we find that segment + * reference or we run of matching entries. + */ + if (ref && memcmp(ref, &le->ref, sizeof(*ref))) { + off += le16_to_cpu(le->size); + le = Add2Ptr(al->le, off); + goto next; + } + + /* Save on stack the size of 'le'. */ + size = le16_to_cpu(le->size); + /* Delete the le. */ + memmove(le, Add2Ptr(le, size), al->size - (off + size)); + + al->size -= size; + al->dirty = true; + + return true; +} + +int al_update(struct ntfs_inode *ni, int sync) +{ + int err; + struct ATTRIB *attr; + typeof(ni->attr_list) *al = &ni->attr_list; + + if (!al->dirty || !al->size) + return 0; + + /* + * Attribute list increased on demand in al_add_le. + * Attribute list decreased here. + */ + err = attr_set_size(ni, ATTR_LIST, NULL, 0, &al->run, al->size, NULL, + false, &attr); + if (err) + goto out; + + if (!attr->non_res) { + memcpy(resident_data(attr), al->le, al->size); + } else { + err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, + al->size, sync); + if (err) + goto out; + + attr->nres.valid_size = attr->nres.data_size; + } + + ni->mi.dirty = true; + al->dirty = false; + +out: + return err; +} diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c new file mode 100644 index 000000000..50d838093 --- /dev/null +++ b/fs/ntfs3/bitfunc.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/types.h> + +#include "ntfs_fs.h" + +#define BITS_IN_SIZE_T (sizeof(size_t) * 8) + +/* + * fill_mask[i] - first i bits are '1' , i = 0,1,2,3,4,5,6,7,8 + * fill_mask[i] = 0xFF >> (8-i) + */ +static const u8 fill_mask[] = { 0x00, 0x01, 0x03, 0x07, 0x0F, + 0x1F, 0x3F, 0x7F, 0xFF }; + +/* + * zero_mask[i] - first i bits are '0' , i = 0,1,2,3,4,5,6,7,8 + * zero_mask[i] = 0xFF << i + */ +static const u8 zero_mask[] = { 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, + 0xE0, 0xC0, 0x80, 0x00 }; + +/* + * are_bits_clear + * + * Return: True if all bits [bit, bit+nbits) are zeros "0". + */ +bool are_bits_clear(const ulong *lmap, size_t bit, size_t nbits) +{ + size_t pos = bit & 7; + const u8 *map = (u8 *)lmap + (bit >> 3); + + if (pos) { + if (8 - pos >= nbits) + return !nbits || !(*map & fill_mask[pos + nbits] & + zero_mask[pos]); + + if (*map++ & zero_mask[pos]) + return false; + nbits -= 8 - pos; + } + + pos = ((size_t)map) & (sizeof(size_t) - 1); + if (pos) { + pos = sizeof(size_t) - pos; + if (nbits >= pos * 8) { + for (nbits -= pos * 8; pos; pos--, map++) { + if (*map) + return false; + } + } + } + + for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) { + if (*((size_t *)map)) + return false; + } + + for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) { + if (*map) + return false; + } + + pos = nbits & 7; + if (pos && (*map & fill_mask[pos])) + return false; + + return true; +} + +/* + * are_bits_set + * + * Return: True if all bits [bit, bit+nbits) are ones "1". + */ +bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits) +{ + u8 mask; + size_t pos = bit & 7; + const u8 *map = (u8 *)lmap + (bit >> 3); + + if (pos) { + if (8 - pos >= nbits) { + mask = fill_mask[pos + nbits] & zero_mask[pos]; + return !nbits || (*map & mask) == mask; + } + + mask = zero_mask[pos]; + if ((*map++ & mask) != mask) + return false; + nbits -= 8 - pos; + } + + pos = ((size_t)map) & (sizeof(size_t) - 1); + if (pos) { + pos = sizeof(size_t) - pos; + if (nbits >= pos * 8) { + for (nbits -= pos * 8; pos; pos--, map++) { + if (*map != 0xFF) + return false; + } + } + } + + for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) { + if (*((size_t *)map) != MINUS_ONE_T) + return false; + } + + for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) { + if (*map != 0xFF) + return false; + } + + pos = nbits & 7; + if (pos) { + mask = fill_mask[pos]; + if ((*map & mask) != mask) + return false; + } + + return true; +} diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c new file mode 100644 index 000000000..c055bbdfe --- /dev/null +++ b/fs/ntfs3/bitmap.c @@ -0,0 +1,1485 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * This code builds two trees of free clusters extents. + * Trees are sorted by start of extent and by length of extent. + * NTFS_MAX_WND_EXTENTS defines the maximum number of elements in trees. + * In extreme case code reads on-disk bitmap to find free clusters. + * + */ + +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/kernel.h> + +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * Maximum number of extents in tree. + */ +#define NTFS_MAX_WND_EXTENTS (32u * 1024u) + +struct rb_node_key { + struct rb_node node; + size_t key; +}; + +struct e_node { + struct rb_node_key start; /* Tree sorted by start. */ + struct rb_node_key count; /* Tree sorted by len. */ +}; + +static int wnd_rescan(struct wnd_bitmap *wnd); +static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw); +static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits); + +static struct kmem_cache *ntfs_enode_cachep; + +int __init ntfs3_init_bitmap(void) +{ + ntfs_enode_cachep = + kmem_cache_create("ntfs3_enode_cache", sizeof(struct e_node), 0, + SLAB_RECLAIM_ACCOUNT, NULL); + return ntfs_enode_cachep ? 0 : -ENOMEM; +} + +void ntfs3_exit_bitmap(void) +{ + kmem_cache_destroy(ntfs_enode_cachep); +} + +/* + * wnd_scan + * + * b_pos + b_len - biggest fragment. + * Scan range [wpos wbits) window @buf. + * + * Return: -1 if not found. + */ +static size_t wnd_scan(const ulong *buf, size_t wbit, u32 wpos, u32 wend, + size_t to_alloc, size_t *prev_tail, size_t *b_pos, + size_t *b_len) +{ + while (wpos < wend) { + size_t free_len; + u32 free_bits, end; + u32 used = find_next_zero_bit(buf, wend, wpos); + + if (used >= wend) { + if (*b_len < *prev_tail) { + *b_pos = wbit - *prev_tail; + *b_len = *prev_tail; + } + + *prev_tail = 0; + return -1; + } + + if (used > wpos) { + wpos = used; + if (*b_len < *prev_tail) { + *b_pos = wbit - *prev_tail; + *b_len = *prev_tail; + } + + *prev_tail = 0; + } + + /* + * Now we have a fragment [wpos, wend) staring with 0. + */ + end = wpos + to_alloc - *prev_tail; + free_bits = find_next_bit(buf, min(end, wend), wpos); + + free_len = *prev_tail + free_bits - wpos; + + if (*b_len < free_len) { + *b_pos = wbit + wpos - *prev_tail; + *b_len = free_len; + } + + if (free_len >= to_alloc) + return wbit + wpos - *prev_tail; + + if (free_bits >= wend) { + *prev_tail += free_bits - wpos; + return -1; + } + + wpos = free_bits + 1; + + *prev_tail = 0; + } + + return -1; +} + +/* + * wnd_close - Frees all resources. + */ +void wnd_close(struct wnd_bitmap *wnd) +{ + struct rb_node *node, *next; + + kfree(wnd->free_bits); + run_close(&wnd->run); + + node = rb_first(&wnd->start_tree); + + while (node) { + next = rb_next(node); + rb_erase(node, &wnd->start_tree); + kmem_cache_free(ntfs_enode_cachep, + rb_entry(node, struct e_node, start.node)); + node = next; + } +} + +static struct rb_node *rb_lookup(struct rb_root *root, size_t v) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *r = NULL; + + while (*p) { + struct rb_node_key *k; + + k = rb_entry(*p, struct rb_node_key, node); + if (v < k->key) { + p = &(*p)->rb_left; + } else if (v > k->key) { + r = &k->node; + p = &(*p)->rb_right; + } else { + return &k->node; + } + } + + return r; +} + +/* + * rb_insert_count - Helper function to insert special kind of 'count' tree. + */ +static inline bool rb_insert_count(struct rb_root *root, struct e_node *e) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + size_t e_ckey = e->count.key; + size_t e_skey = e->start.key; + + while (*p) { + struct e_node *k = + rb_entry(parent = *p, struct e_node, count.node); + + if (e_ckey > k->count.key) { + p = &(*p)->rb_left; + } else if (e_ckey < k->count.key) { + p = &(*p)->rb_right; + } else if (e_skey < k->start.key) { + p = &(*p)->rb_left; + } else if (e_skey > k->start.key) { + p = &(*p)->rb_right; + } else { + WARN_ON(1); + return false; + } + } + + rb_link_node(&e->count.node, parent, p); + rb_insert_color(&e->count.node, root); + return true; +} + +/* + * rb_insert_start - Helper function to insert special kind of 'count' tree. + */ +static inline bool rb_insert_start(struct rb_root *root, struct e_node *e) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + size_t e_skey = e->start.key; + + while (*p) { + struct e_node *k; + + parent = *p; + + k = rb_entry(parent, struct e_node, start.node); + if (e_skey < k->start.key) { + p = &(*p)->rb_left; + } else if (e_skey > k->start.key) { + p = &(*p)->rb_right; + } else { + WARN_ON(1); + return false; + } + } + + rb_link_node(&e->start.node, parent, p); + rb_insert_color(&e->start.node, root); + return true; +} + +/* + * wnd_add_free_ext - Adds a new extent of free space. + * @build: 1 when building tree. + */ +static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len, + bool build) +{ + struct e_node *e, *e0 = NULL; + size_t ib, end_in = bit + len; + struct rb_node *n; + + if (build) { + /* Use extent_min to filter too short extents. */ + if (wnd->count >= NTFS_MAX_WND_EXTENTS && + len <= wnd->extent_min) { + wnd->uptodated = -1; + return; + } + } else { + /* Try to find extent before 'bit'. */ + n = rb_lookup(&wnd->start_tree, bit); + + if (!n) { + n = rb_first(&wnd->start_tree); + } else { + e = rb_entry(n, struct e_node, start.node); + n = rb_next(n); + if (e->start.key + e->count.key == bit) { + /* Remove left. */ + bit = e->start.key; + len += e->count.key; + rb_erase(&e->start.node, &wnd->start_tree); + rb_erase(&e->count.node, &wnd->count_tree); + wnd->count -= 1; + e0 = e; + } + } + + while (n) { + size_t next_end; + + e = rb_entry(n, struct e_node, start.node); + next_end = e->start.key + e->count.key; + if (e->start.key > end_in) + break; + + /* Remove right. */ + n = rb_next(n); + len += next_end - end_in; + end_in = next_end; + rb_erase(&e->start.node, &wnd->start_tree); + rb_erase(&e->count.node, &wnd->count_tree); + wnd->count -= 1; + + if (!e0) + e0 = e; + else + kmem_cache_free(ntfs_enode_cachep, e); + } + + if (wnd->uptodated != 1) { + /* Check bits before 'bit'. */ + ib = wnd->zone_bit == wnd->zone_end || + bit < wnd->zone_end + ? 0 + : wnd->zone_end; + + while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) { + bit -= 1; + len += 1; + } + + /* Check bits after 'end_in'. */ + ib = wnd->zone_bit == wnd->zone_end || + end_in > wnd->zone_bit + ? wnd->nbits + : wnd->zone_bit; + + while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) { + end_in += 1; + len += 1; + } + } + } + /* Insert new fragment. */ + if (wnd->count >= NTFS_MAX_WND_EXTENTS) { + if (e0) + kmem_cache_free(ntfs_enode_cachep, e0); + + wnd->uptodated = -1; + + /* Compare with smallest fragment. */ + n = rb_last(&wnd->count_tree); + e = rb_entry(n, struct e_node, count.node); + if (len <= e->count.key) + goto out; /* Do not insert small fragments. */ + + if (build) { + struct e_node *e2; + + n = rb_prev(n); + e2 = rb_entry(n, struct e_node, count.node); + /* Smallest fragment will be 'e2->count.key'. */ + wnd->extent_min = e2->count.key; + } + + /* Replace smallest fragment by new one. */ + rb_erase(&e->start.node, &wnd->start_tree); + rb_erase(&e->count.node, &wnd->count_tree); + wnd->count -= 1; + } else { + e = e0 ? e0 : kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC); + if (!e) { + wnd->uptodated = -1; + goto out; + } + + if (build && len <= wnd->extent_min) + wnd->extent_min = len; + } + e->start.key = bit; + e->count.key = len; + if (len > wnd->extent_max) + wnd->extent_max = len; + + rb_insert_start(&wnd->start_tree, e); + rb_insert_count(&wnd->count_tree, e); + wnd->count += 1; + +out:; +} + +/* + * wnd_remove_free_ext - Remove a run from the cached free space. + */ +static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len) +{ + struct rb_node *n, *n3; + struct e_node *e, *e3; + size_t end_in = bit + len; + size_t end3, end, new_key, new_len, max_new_len; + + /* Try to find extent before 'bit'. */ + n = rb_lookup(&wnd->start_tree, bit); + + if (!n) + return; + + e = rb_entry(n, struct e_node, start.node); + end = e->start.key + e->count.key; + + new_key = new_len = 0; + len = e->count.key; + + /* Range [bit,end_in) must be inside 'e' or outside 'e' and 'n'. */ + if (e->start.key > bit) + ; + else if (end_in <= end) { + /* Range [bit,end_in) inside 'e'. */ + new_key = end_in; + new_len = end - end_in; + len = bit - e->start.key; + } else if (bit > end) { + bool bmax = false; + + n3 = rb_next(n); + + while (n3) { + e3 = rb_entry(n3, struct e_node, start.node); + if (e3->start.key >= end_in) + break; + + if (e3->count.key == wnd->extent_max) + bmax = true; + + end3 = e3->start.key + e3->count.key; + if (end3 > end_in) { + e3->start.key = end_in; + rb_erase(&e3->count.node, &wnd->count_tree); + e3->count.key = end3 - end_in; + rb_insert_count(&wnd->count_tree, e3); + break; + } + + n3 = rb_next(n3); + rb_erase(&e3->start.node, &wnd->start_tree); + rb_erase(&e3->count.node, &wnd->count_tree); + wnd->count -= 1; + kmem_cache_free(ntfs_enode_cachep, e3); + } + if (!bmax) + return; + n3 = rb_first(&wnd->count_tree); + wnd->extent_max = + n3 ? rb_entry(n3, struct e_node, count.node)->count.key + : 0; + return; + } + + if (e->count.key != wnd->extent_max) { + ; + } else if (rb_prev(&e->count.node)) { + ; + } else { + n3 = rb_next(&e->count.node); + max_new_len = max(len, new_len); + if (!n3) { + wnd->extent_max = max_new_len; + } else { + e3 = rb_entry(n3, struct e_node, count.node); + wnd->extent_max = max(e3->count.key, max_new_len); + } + } + + if (!len) { + if (new_len) { + e->start.key = new_key; + rb_erase(&e->count.node, &wnd->count_tree); + e->count.key = new_len; + rb_insert_count(&wnd->count_tree, e); + } else { + rb_erase(&e->start.node, &wnd->start_tree); + rb_erase(&e->count.node, &wnd->count_tree); + wnd->count -= 1; + kmem_cache_free(ntfs_enode_cachep, e); + } + goto out; + } + rb_erase(&e->count.node, &wnd->count_tree); + e->count.key = len; + rb_insert_count(&wnd->count_tree, e); + + if (!new_len) + goto out; + + if (wnd->count >= NTFS_MAX_WND_EXTENTS) { + wnd->uptodated = -1; + + /* Get minimal extent. */ + e = rb_entry(rb_last(&wnd->count_tree), struct e_node, + count.node); + if (e->count.key > new_len) + goto out; + + /* Replace minimum. */ + rb_erase(&e->start.node, &wnd->start_tree); + rb_erase(&e->count.node, &wnd->count_tree); + wnd->count -= 1; + } else { + e = kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC); + if (!e) + wnd->uptodated = -1; + } + + if (e) { + e->start.key = new_key; + e->count.key = new_len; + rb_insert_start(&wnd->start_tree, e); + rb_insert_count(&wnd->count_tree, e); + wnd->count += 1; + } + +out: + if (!wnd->count && 1 != wnd->uptodated) + wnd_rescan(wnd); +} + +/* + * wnd_rescan - Scan all bitmap. Used while initialization. + */ +static int wnd_rescan(struct wnd_bitmap *wnd) +{ + int err = 0; + size_t prev_tail = 0; + struct super_block *sb = wnd->sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + u64 lbo, len = 0; + u32 blocksize = sb->s_blocksize; + u8 cluster_bits = sbi->cluster_bits; + u32 wbits = 8 * sb->s_blocksize; + u32 used, frb; + const ulong *buf; + size_t wpos, wbit, iw, vbo; + struct buffer_head *bh = NULL; + CLST lcn, clen; + + wnd->uptodated = 0; + wnd->extent_max = 0; + wnd->extent_min = MINUS_ONE_T; + wnd->total_zeroes = 0; + + vbo = 0; + + for (iw = 0; iw < wnd->nwnd; iw++) { + if (iw + 1 == wnd->nwnd) + wbits = wnd->bits_last; + + if (wnd->inited) { + if (!wnd->free_bits[iw]) { + /* All ones. */ + if (prev_tail) { + wnd_add_free_ext(wnd, + vbo * 8 - prev_tail, + prev_tail, true); + prev_tail = 0; + } + goto next_wnd; + } + if (wbits == wnd->free_bits[iw]) { + /* All zeroes. */ + prev_tail += wbits; + wnd->total_zeroes += wbits; + goto next_wnd; + } + } + + if (!len) { + u32 off = vbo & sbi->cluster_mask; + + if (!run_lookup_entry(&wnd->run, vbo >> cluster_bits, + &lcn, &clen, NULL)) { + err = -ENOENT; + goto out; + } + + lbo = ((u64)lcn << cluster_bits) + off; + len = ((u64)clen << cluster_bits) - off; + } + + bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits); + if (!bh) { + err = -EIO; + goto out; + } + + buf = (ulong *)bh->b_data; + + used = bitmap_weight(buf, wbits); + if (used < wbits) { + frb = wbits - used; + wnd->free_bits[iw] = frb; + wnd->total_zeroes += frb; + } + + wpos = 0; + wbit = vbo * 8; + + if (wbit + wbits > wnd->nbits) + wbits = wnd->nbits - wbit; + + do { + used = find_next_zero_bit(buf, wbits, wpos); + + if (used > wpos && prev_tail) { + wnd_add_free_ext(wnd, wbit + wpos - prev_tail, + prev_tail, true); + prev_tail = 0; + } + + wpos = used; + + if (wpos >= wbits) { + /* No free blocks. */ + prev_tail = 0; + break; + } + + frb = find_next_bit(buf, wbits, wpos); + if (frb >= wbits) { + /* Keep last free block. */ + prev_tail += frb - wpos; + break; + } + + wnd_add_free_ext(wnd, wbit + wpos - prev_tail, + frb + prev_tail - wpos, true); + + /* Skip free block and first '1'. */ + wpos = frb + 1; + /* Reset previous tail. */ + prev_tail = 0; + } while (wpos < wbits); + +next_wnd: + + if (bh) + put_bh(bh); + bh = NULL; + + vbo += blocksize; + if (len) { + len -= blocksize; + lbo += blocksize; + } + } + + /* Add last block. */ + if (prev_tail) + wnd_add_free_ext(wnd, wnd->nbits - prev_tail, prev_tail, true); + + /* + * Before init cycle wnd->uptodated was 0. + * If any errors or limits occurs while initialization then + * wnd->uptodated will be -1. + * If 'uptodated' is still 0 then Tree is really updated. + */ + if (!wnd->uptodated) + wnd->uptodated = 1; + + if (wnd->zone_bit != wnd->zone_end) { + size_t zlen = wnd->zone_end - wnd->zone_bit; + + wnd->zone_end = wnd->zone_bit; + wnd_zone_set(wnd, wnd->zone_bit, zlen); + } + +out: + return err; +} + +int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits) +{ + int err; + u32 blocksize = sb->s_blocksize; + u32 wbits = blocksize * 8; + + init_rwsem(&wnd->rw_lock); + + wnd->sb = sb; + wnd->nbits = nbits; + wnd->total_zeroes = nbits; + wnd->extent_max = MINUS_ONE_T; + wnd->zone_bit = wnd->zone_end = 0; + wnd->nwnd = bytes_to_block(sb, bitmap_size(nbits)); + wnd->bits_last = nbits & (wbits - 1); + if (!wnd->bits_last) + wnd->bits_last = wbits; + + wnd->free_bits = + kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO); + + if (!wnd->free_bits) + return -ENOMEM; + + err = wnd_rescan(wnd); + if (err) + return err; + + wnd->inited = true; + + return 0; +} + +/* + * wnd_map - Call sb_bread for requested window. + */ +static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw) +{ + size_t vbo; + CLST lcn, clen; + struct super_block *sb = wnd->sb; + struct ntfs_sb_info *sbi; + struct buffer_head *bh; + u64 lbo; + + sbi = sb->s_fs_info; + vbo = (u64)iw << sb->s_blocksize_bits; + + if (!run_lookup_entry(&wnd->run, vbo >> sbi->cluster_bits, &lcn, &clen, + NULL)) { + return ERR_PTR(-ENOENT); + } + + lbo = ((u64)lcn << sbi->cluster_bits) + (vbo & sbi->cluster_mask); + + bh = ntfs_bread(wnd->sb, lbo >> sb->s_blocksize_bits); + if (!bh) + return ERR_PTR(-EIO); + + return bh; +} + +/* + * wnd_set_free - Mark the bits range from bit to bit + bits as free. + */ +int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) +{ + int err = 0; + struct super_block *sb = wnd->sb; + size_t bits0 = bits; + u32 wbits = 8 * sb->s_blocksize; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbit = bit & (wbits - 1); + struct buffer_head *bh; + + while (iw < wnd->nwnd && bits) { + u32 tail, op; + ulong *buf; + + if (iw + 1 == wnd->nwnd) + wbits = wnd->bits_last; + + tail = wbits - wbit; + op = min_t(u32, tail, bits); + + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + break; + } + + buf = (ulong *)bh->b_data; + + lock_buffer(bh); + + __bitmap_clear(buf, wbit, op); + + wnd->free_bits[iw] += op; + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + put_bh(bh); + + wnd->total_zeroes += op; + bits -= op; + wbit = 0; + iw += 1; + } + + wnd_add_free_ext(wnd, bit, bits0, false); + + return err; +} + +/* + * wnd_set_used - Mark the bits range from bit to bit + bits as used. + */ +int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) +{ + int err = 0; + struct super_block *sb = wnd->sb; + size_t bits0 = bits; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); + struct buffer_head *bh; + + while (iw < wnd->nwnd && bits) { + u32 tail, op; + ulong *buf; + + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = wnd->bits_last; + + tail = wbits - wbit; + op = min_t(u32, tail, bits); + + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + break; + } + buf = (ulong *)bh->b_data; + + lock_buffer(bh); + + __bitmap_set(buf, wbit, op); + wnd->free_bits[iw] -= op; + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + put_bh(bh); + + wnd->total_zeroes -= op; + bits -= op; + wbit = 0; + iw += 1; + } + + if (!RB_EMPTY_ROOT(&wnd->start_tree)) + wnd_remove_free_ext(wnd, bit, bits0); + + return err; +} + +/* + * wnd_is_free_hlp + * + * Return: True if all clusters [bit, bit+bits) are free (bitmap only). + */ +static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) +{ + struct super_block *sb = wnd->sb; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); + + while (iw < wnd->nwnd && bits) { + u32 tail, op; + + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = wnd->bits_last; + + tail = wbits - wbit; + op = min_t(u32, tail, bits); + + if (wbits != wnd->free_bits[iw]) { + bool ret; + struct buffer_head *bh = wnd_map(wnd, iw); + + if (IS_ERR(bh)) + return false; + + ret = are_bits_clear((ulong *)bh->b_data, wbit, op); + + put_bh(bh); + if (!ret) + return false; + } + + bits -= op; + wbit = 0; + iw += 1; + } + + return true; +} + +/* + * wnd_is_free + * + * Return: True if all clusters [bit, bit+bits) are free. + */ +bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) +{ + bool ret; + struct rb_node *n; + size_t end; + struct e_node *e; + + if (RB_EMPTY_ROOT(&wnd->start_tree)) + goto use_wnd; + + n = rb_lookup(&wnd->start_tree, bit); + if (!n) + goto use_wnd; + + e = rb_entry(n, struct e_node, start.node); + + end = e->start.key + e->count.key; + + if (bit < end && bit + bits <= end) + return true; + +use_wnd: + ret = wnd_is_free_hlp(wnd, bit, bits); + + return ret; +} + +/* + * wnd_is_used + * + * Return: True if all clusters [bit, bit+bits) are used. + */ +bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) +{ + bool ret = false; + struct super_block *sb = wnd->sb; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); + size_t end; + struct rb_node *n; + struct e_node *e; + + if (RB_EMPTY_ROOT(&wnd->start_tree)) + goto use_wnd; + + end = bit + bits; + n = rb_lookup(&wnd->start_tree, end - 1); + if (!n) + goto use_wnd; + + e = rb_entry(n, struct e_node, start.node); + if (e->start.key + e->count.key > bit) + return false; + +use_wnd: + while (iw < wnd->nwnd && bits) { + u32 tail, op; + + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = wnd->bits_last; + + tail = wbits - wbit; + op = min_t(u32, tail, bits); + + if (wnd->free_bits[iw]) { + bool ret; + struct buffer_head *bh = wnd_map(wnd, iw); + + if (IS_ERR(bh)) + goto out; + + ret = are_bits_set((ulong *)bh->b_data, wbit, op); + put_bh(bh); + if (!ret) + goto out; + } + + bits -= op; + wbit = 0; + iw += 1; + } + ret = true; + +out: + return ret; +} + +/* + * wnd_find - Look for free space. + * + * - flags - BITMAP_FIND_XXX flags + * + * Return: 0 if not found. + */ +size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, + size_t flags, size_t *allocated) +{ + struct super_block *sb; + u32 wbits, wpos, wzbit, wzend; + size_t fnd, max_alloc, b_len, b_pos; + size_t iw, prev_tail, nwnd, wbit, ebit, zbit, zend; + size_t to_alloc0 = to_alloc; + const ulong *buf; + const struct e_node *e; + const struct rb_node *pr, *cr; + u8 log2_bits; + bool fbits_valid; + struct buffer_head *bh; + + /* Fast checking for available free space. */ + if (flags & BITMAP_FIND_FULL) { + size_t zeroes = wnd_zeroes(wnd); + + zeroes -= wnd->zone_end - wnd->zone_bit; + if (zeroes < to_alloc0) + goto no_space; + + if (to_alloc0 > wnd->extent_max) + goto no_space; + } else { + if (to_alloc > wnd->extent_max) + to_alloc = wnd->extent_max; + } + + if (wnd->zone_bit <= hint && hint < wnd->zone_end) + hint = wnd->zone_end; + + max_alloc = wnd->nbits; + b_len = b_pos = 0; + + if (hint >= max_alloc) + hint = 0; + + if (RB_EMPTY_ROOT(&wnd->start_tree)) { + if (wnd->uptodated == 1) { + /* Extents tree is updated -> No free space. */ + goto no_space; + } + goto scan_bitmap; + } + + e = NULL; + if (!hint) + goto allocate_biggest; + + /* Use hint: Enumerate extents by start >= hint. */ + pr = NULL; + cr = wnd->start_tree.rb_node; + + for (;;) { + e = rb_entry(cr, struct e_node, start.node); + + if (e->start.key == hint) + break; + + if (e->start.key < hint) { + pr = cr; + cr = cr->rb_right; + if (!cr) + break; + continue; + } + + cr = cr->rb_left; + if (!cr) { + e = pr ? rb_entry(pr, struct e_node, start.node) : NULL; + break; + } + } + + if (!e) + goto allocate_biggest; + + if (e->start.key + e->count.key > hint) { + /* We have found extension with 'hint' inside. */ + size_t len = e->start.key + e->count.key - hint; + + if (len >= to_alloc && hint + to_alloc <= max_alloc) { + fnd = hint; + goto found; + } + + if (!(flags & BITMAP_FIND_FULL)) { + if (len > to_alloc) + len = to_alloc; + + if (hint + len <= max_alloc) { + fnd = hint; + to_alloc = len; + goto found; + } + } + } + +allocate_biggest: + /* Allocate from biggest free extent. */ + e = rb_entry(rb_first(&wnd->count_tree), struct e_node, count.node); + if (e->count.key != wnd->extent_max) + wnd->extent_max = e->count.key; + + if (e->count.key < max_alloc) { + if (e->count.key >= to_alloc) { + ; + } else if (flags & BITMAP_FIND_FULL) { + if (e->count.key < to_alloc0) { + /* Biggest free block is less then requested. */ + goto no_space; + } + to_alloc = e->count.key; + } else if (-1 != wnd->uptodated) { + to_alloc = e->count.key; + } else { + /* Check if we can use more bits. */ + size_t op, max_check; + struct rb_root start_tree; + + memcpy(&start_tree, &wnd->start_tree, + sizeof(struct rb_root)); + memset(&wnd->start_tree, 0, sizeof(struct rb_root)); + + max_check = e->start.key + to_alloc; + if (max_check > max_alloc) + max_check = max_alloc; + for (op = e->start.key + e->count.key; op < max_check; + op++) { + if (!wnd_is_free(wnd, op, 1)) + break; + } + memcpy(&wnd->start_tree, &start_tree, + sizeof(struct rb_root)); + to_alloc = op - e->start.key; + } + + /* Prepare to return. */ + fnd = e->start.key; + if (e->start.key + to_alloc > max_alloc) + to_alloc = max_alloc - e->start.key; + goto found; + } + + if (wnd->uptodated == 1) { + /* Extents tree is updated -> no free space. */ + goto no_space; + } + + b_len = e->count.key; + b_pos = e->start.key; + +scan_bitmap: + sb = wnd->sb; + log2_bits = sb->s_blocksize_bits + 3; + + /* At most two ranges [hint, max_alloc) + [0, hint). */ +Again: + + /* TODO: Optimize request for case nbits > wbits. */ + iw = hint >> log2_bits; + wbits = sb->s_blocksize * 8; + wpos = hint & (wbits - 1); + prev_tail = 0; + fbits_valid = true; + + if (max_alloc == wnd->nbits) { + nwnd = wnd->nwnd; + } else { + size_t t = max_alloc + wbits - 1; + + nwnd = likely(t > max_alloc) ? (t >> log2_bits) : wnd->nwnd; + } + + /* Enumerate all windows. */ + for (; iw < nwnd; iw++) { + wbit = iw << log2_bits; + + if (!wnd->free_bits[iw]) { + if (prev_tail > b_len) { + b_pos = wbit - prev_tail; + b_len = prev_tail; + } + + /* Skip full used window. */ + prev_tail = 0; + wpos = 0; + continue; + } + + if (unlikely(iw + 1 == nwnd)) { + if (max_alloc == wnd->nbits) { + wbits = wnd->bits_last; + } else { + size_t t = max_alloc & (wbits - 1); + + if (t) { + wbits = t; + fbits_valid = false; + } + } + } + + if (wnd->zone_end > wnd->zone_bit) { + ebit = wbit + wbits; + zbit = max(wnd->zone_bit, wbit); + zend = min(wnd->zone_end, ebit); + + /* Here we have a window [wbit, ebit) and zone [zbit, zend). */ + if (zend <= zbit) { + /* Zone does not overlap window. */ + } else { + wzbit = zbit - wbit; + wzend = zend - wbit; + + /* Zone overlaps window. */ + if (wnd->free_bits[iw] == wzend - wzbit) { + prev_tail = 0; + wpos = 0; + continue; + } + + /* Scan two ranges window: [wbit, zbit) and [zend, ebit). */ + bh = wnd_map(wnd, iw); + + if (IS_ERR(bh)) { + /* TODO: Error */ + prev_tail = 0; + wpos = 0; + continue; + } + + buf = (ulong *)bh->b_data; + + /* Scan range [wbit, zbit). */ + if (wpos < wzbit) { + /* Scan range [wpos, zbit). */ + fnd = wnd_scan(buf, wbit, wpos, wzbit, + to_alloc, &prev_tail, + &b_pos, &b_len); + if (fnd != MINUS_ONE_T) { + put_bh(bh); + goto found; + } + } + + prev_tail = 0; + + /* Scan range [zend, ebit). */ + if (wzend < wbits) { + fnd = wnd_scan(buf, wbit, + max(wzend, wpos), wbits, + to_alloc, &prev_tail, + &b_pos, &b_len); + if (fnd != MINUS_ONE_T) { + put_bh(bh); + goto found; + } + } + + wpos = 0; + put_bh(bh); + continue; + } + } + + /* Current window does not overlap zone. */ + if (!wpos && fbits_valid && wnd->free_bits[iw] == wbits) { + /* Window is empty. */ + if (prev_tail + wbits >= to_alloc) { + fnd = wbit + wpos - prev_tail; + goto found; + } + + /* Increase 'prev_tail' and process next window. */ + prev_tail += wbits; + wpos = 0; + continue; + } + + /* Read window. */ + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { + // TODO: Error. + prev_tail = 0; + wpos = 0; + continue; + } + + buf = (ulong *)bh->b_data; + + /* Scan range [wpos, eBits). */ + fnd = wnd_scan(buf, wbit, wpos, wbits, to_alloc, &prev_tail, + &b_pos, &b_len); + put_bh(bh); + if (fnd != MINUS_ONE_T) + goto found; + } + + if (b_len < prev_tail) { + /* The last fragment. */ + b_len = prev_tail; + b_pos = max_alloc - prev_tail; + } + + if (hint) { + /* + * We have scanned range [hint max_alloc). + * Prepare to scan range [0 hint + to_alloc). + */ + size_t nextmax = hint + to_alloc; + + if (likely(nextmax >= hint) && nextmax < max_alloc) + max_alloc = nextmax; + hint = 0; + goto Again; + } + + if (!b_len) + goto no_space; + + wnd->extent_max = b_len; + + if (flags & BITMAP_FIND_FULL) + goto no_space; + + fnd = b_pos; + to_alloc = b_len; + +found: + if (flags & BITMAP_FIND_MARK_AS_USED) { + /* TODO: Optimize remove extent (pass 'e'?). */ + if (wnd_set_used(wnd, fnd, to_alloc)) + goto no_space; + } else if (wnd->extent_max != MINUS_ONE_T && + to_alloc > wnd->extent_max) { + wnd->extent_max = to_alloc; + } + + *allocated = fnd; + return to_alloc; + +no_space: + return 0; +} + +/* + * wnd_extend - Extend bitmap ($MFT bitmap). + */ +int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) +{ + int err; + struct super_block *sb = wnd->sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + u32 blocksize = sb->s_blocksize; + u32 wbits = blocksize * 8; + u32 b0, new_last; + size_t bits, iw, new_wnd; + size_t old_bits = wnd->nbits; + u16 *new_free; + + if (new_bits <= old_bits) + return -EINVAL; + + /* Align to 8 byte boundary. */ + new_wnd = bytes_to_block(sb, bitmap_size(new_bits)); + new_last = new_bits & (wbits - 1); + if (!new_last) + new_last = wbits; + + if (new_wnd != wnd->nwnd) { + new_free = kmalloc(new_wnd * sizeof(u16), GFP_NOFS); + if (!new_free) + return -ENOMEM; + + memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short)); + memset(new_free + wnd->nwnd, 0, + (new_wnd - wnd->nwnd) * sizeof(short)); + kfree(wnd->free_bits); + wnd->free_bits = new_free; + } + + /* Zero bits [old_bits,new_bits). */ + bits = new_bits - old_bits; + b0 = old_bits & (wbits - 1); + + for (iw = old_bits >> (sb->s_blocksize_bits + 3); bits; iw += 1) { + u32 op; + size_t frb; + u64 vbo, lbo, bytes; + struct buffer_head *bh; + ulong *buf; + + if (iw + 1 == new_wnd) + wbits = new_last; + + op = b0 + bits > wbits ? wbits - b0 : bits; + vbo = (u64)iw * blocksize; + + err = ntfs_vbo_to_lbo(sbi, &wnd->run, vbo, &lbo, &bytes); + if (err) + break; + + bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits); + if (!bh) + return -EIO; + + lock_buffer(bh); + buf = (ulong *)bh->b_data; + + __bitmap_clear(buf, b0, blocksize * 8 - b0); + frb = wbits - bitmap_weight(buf, wbits); + wnd->total_zeroes += frb - wnd->free_bits[iw]; + wnd->free_bits[iw] = frb; + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + /* err = sync_dirty_buffer(bh); */ + + b0 = 0; + bits -= op; + } + + wnd->nbits = new_bits; + wnd->nwnd = new_wnd; + wnd->bits_last = new_last; + + wnd_add_free_ext(wnd, old_bits, new_bits - old_bits, false); + + return 0; +} + +void wnd_zone_set(struct wnd_bitmap *wnd, size_t lcn, size_t len) +{ + size_t zlen = wnd->zone_end - wnd->zone_bit; + + if (zlen) + wnd_add_free_ext(wnd, wnd->zone_bit, zlen, false); + + if (!RB_EMPTY_ROOT(&wnd->start_tree) && len) + wnd_remove_free_ext(wnd, lcn, len); + + wnd->zone_bit = lcn; + wnd->zone_end = lcn + len; +} + +int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range) +{ + int err = 0; + struct super_block *sb = sbi->sb; + struct wnd_bitmap *wnd = &sbi->used.bitmap; + u32 wbits = 8 * sb->s_blocksize; + CLST len = 0, lcn = 0, done = 0; + CLST minlen = bytes_to_cluster(sbi, range->minlen); + CLST lcn_from = bytes_to_cluster(sbi, range->start); + size_t iw = lcn_from >> (sb->s_blocksize_bits + 3); + u32 wbit = lcn_from & (wbits - 1); + const ulong *buf; + CLST lcn_to; + + if (!minlen) + minlen = 1; + + if (range->len == (u64)-1) + lcn_to = wnd->nbits; + else + lcn_to = bytes_to_cluster(sbi, range->start + range->len); + + down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); + + for (; iw < wnd->nwnd; iw++, wbit = 0) { + CLST lcn_wnd = iw * wbits; + struct buffer_head *bh; + + if (lcn_wnd > lcn_to) + break; + + if (!wnd->free_bits[iw]) + continue; + + if (iw + 1 == wnd->nwnd) + wbits = wnd->bits_last; + + if (lcn_wnd + wbits > lcn_to) + wbits = lcn_to - lcn_wnd; + + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + break; + } + + buf = (ulong *)bh->b_data; + + for (; wbit < wbits; wbit++) { + if (!test_bit(wbit, buf)) { + if (!len) + lcn = lcn_wnd + wbit; + len += 1; + continue; + } + if (len >= minlen) { + err = ntfs_discard(sbi, lcn, len); + if (err) + goto out; + done += len; + } + len = 0; + } + put_bh(bh); + } + + /* Process the last fragment. */ + if (len >= minlen) { + err = ntfs_discard(sbi, lcn, len); + if (err) + goto out; + done += len; + } + +out: + range->len = (u64)done << sbi->cluster_bits; + + up_read(&wnd->rw_lock); + + return err; +} diff --git a/fs/ntfs3/debug.h b/fs/ntfs3/debug.h new file mode 100644 index 000000000..53ef7489c --- /dev/null +++ b/fs/ntfs3/debug.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * Useful functions for debugging. + * + */ + +// clang-format off +#ifndef _LINUX_NTFS3_DEBUG_H +#define _LINUX_NTFS3_DEBUG_H + +struct super_block; +struct inode; + +#ifndef Add2Ptr +#define Add2Ptr(P, I) ((void *)((u8 *)(P) + (I))) +#define PtrOffset(B, O) ((size_t)((size_t)(O) - (size_t)(B))) +#endif + +#ifdef CONFIG_PRINTK +__printf(2, 3) +void ntfs_printk(const struct super_block *sb, const char *fmt, ...); +__printf(2, 3) +void ntfs_inode_printk(struct inode *inode, const char *fmt, ...); +#else +static inline __printf(2, 3) +void ntfs_printk(const struct super_block *sb, const char *fmt, ...) +{ +} + +static inline __printf(2, 3) +void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) +{ +} +#endif + +/* + * Logging macros. Thanks Joe Perches <joe@perches.com> for implementation. + */ + +#define ntfs_err(sb, fmt, ...) ntfs_printk(sb, KERN_ERR fmt, ##__VA_ARGS__) +#define ntfs_warn(sb, fmt, ...) ntfs_printk(sb, KERN_WARNING fmt, ##__VA_ARGS__) +#define ntfs_info(sb, fmt, ...) ntfs_printk(sb, KERN_INFO fmt, ##__VA_ARGS__) +#define ntfs_notice(sb, fmt, ...) \ + ntfs_printk(sb, KERN_NOTICE fmt, ##__VA_ARGS__) + +#define ntfs_inode_err(inode, fmt, ...) \ + ntfs_inode_printk(inode, KERN_ERR fmt, ##__VA_ARGS__) +#define ntfs_inode_warn(inode, fmt, ...) \ + ntfs_inode_printk(inode, KERN_WARNING fmt, ##__VA_ARGS__) + +#endif /* _LINUX_NTFS3_DEBUG_H */ +// clang-format on diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c new file mode 100644 index 000000000..d4d9f4ffb --- /dev/null +++ b/fs/ntfs3/dir.c @@ -0,0 +1,597 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * Directory handling functions for NTFS-based filesystems. + * + */ + +#include <linux/fs.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* Convert little endian UTF-16 to NLS string. */ +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, + u8 *buf, int buf_len) +{ + int ret, warn; + u8 *op; + struct nls_table *nls = sbi->options->nls; + + static_assert(sizeof(wchar_t) == sizeof(__le16)); + + if (!nls) { + /* UTF-16 -> UTF-8 */ + ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf, + buf_len); + buf[ret] = '\0'; + return ret; + } + + op = buf; + warn = 0; + + while (len--) { + u16 ec; + int charlen; + char dump[5]; + + if (buf_len < NLS_MAX_CHARSET_SIZE) { + ntfs_warn(sbi->sb, + "filename was truncated while converting."); + break; + } + + ec = le16_to_cpu(*name++); + charlen = nls->uni2char(ec, op, buf_len); + + if (charlen > 0) { + op += charlen; + buf_len -= charlen; + continue; + } + + *op++ = '_'; + buf_len -= 1; + if (warn) + continue; + + warn = 1; + hex_byte_pack(&dump[0], ec >> 8); + hex_byte_pack(&dump[2], ec); + dump[4] = 0; + + ntfs_err(sbi->sb, "failed to convert \"%s\" to %s", dump, + nls->charset); + } + + *op = '\0'; + return op - buf; +} + +// clang-format off +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff +// clang-format on + +/* + * put_utf16 - Modified version of put_utf16 from fs/nls/nls_base.c + * + * Function is sparse warnings free. + */ +static inline void put_utf16(wchar_t *s, unsigned int c, + enum utf16_endian endian) +{ + static_assert(sizeof(wchar_t) == sizeof(__le16)); + static_assert(sizeof(wchar_t) == sizeof(__be16)); + + switch (endian) { + default: + *s = (wchar_t)c; + break; + case UTF16_LITTLE_ENDIAN: + *(__le16 *)s = __cpu_to_le16(c); + break; + case UTF16_BIG_ENDIAN: + *(__be16 *)s = __cpu_to_be16(c); + break; + } +} + +/* + * _utf8s_to_utf16s + * + * Modified version of 'utf8s_to_utf16s' allows to + * detect -ENAMETOOLONG without writing out of expected maximum. + */ +static int _utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, + wchar_t *pwcs, int maxout) +{ + u16 *op; + int size; + unicode_t u; + + op = pwcs; + while (inlen > 0 && *s) { + if (*s & 0x80) { + size = utf8_to_utf32(s, inlen, &u); + if (size < 0) + return -EINVAL; + s += size; + inlen -= size; + + if (u >= PLANE_SIZE) { + if (maxout < 2) + return -ENAMETOOLONG; + + u -= PLANE_SIZE; + put_utf16(op++, + SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS), + endian); + put_utf16(op++, + SURROGATE_PAIR | SURROGATE_LOW | + (u & SURROGATE_BITS), + endian); + maxout -= 2; + } else { + if (maxout < 1) + return -ENAMETOOLONG; + + put_utf16(op++, u, endian); + maxout--; + } + } else { + if (maxout < 1) + return -ENAMETOOLONG; + + put_utf16(op++, *s++, endian); + inlen--; + maxout--; + } + } + return op - pwcs; +} + +/* + * ntfs_nls_to_utf16 - Convert input string to UTF-16. + * @name: Input name. + * @name_len: Input name length. + * @uni: Destination memory. + * @max_ulen: Destination memory. + * @endian: Endian of target UTF-16 string. + * + * This function is called: + * - to create NTFS name + * - to create symlink + * + * Return: UTF-16 string length or error (if negative). + */ +int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, + struct cpu_str *uni, u32 max_ulen, + enum utf16_endian endian) +{ + int ret, slen; + const u8 *end; + struct nls_table *nls = sbi->options->nls; + u16 *uname = uni->name; + + static_assert(sizeof(wchar_t) == sizeof(u16)); + + if (!nls) { + /* utf8 -> utf16 */ + ret = _utf8s_to_utf16s(name, name_len, endian, uname, max_ulen); + uni->len = ret; + return ret; + } + + for (ret = 0, end = name + name_len; name < end; ret++, name += slen) { + if (ret >= max_ulen) + return -ENAMETOOLONG; + + slen = nls->char2uni(name, end - name, uname + ret); + if (!slen) + return -EINVAL; + if (slen < 0) + return slen; + } + +#ifdef __BIG_ENDIAN + if (endian == UTF16_LITTLE_ENDIAN) { + int i = ret; + + while (i--) { + __cpu_to_le16s(uname); + uname++; + } + } +#else + if (endian == UTF16_BIG_ENDIAN) { + int i = ret; + + while (i--) { + __cpu_to_be16s(uname); + uname++; + } + } +#endif + + uni->len = ret; + return ret; +} + +/* + * dir_search_u - Helper function. + */ +struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni, + struct ntfs_fnd *fnd) +{ + int err = 0; + struct super_block *sb = dir->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_inode *ni = ntfs_i(dir); + struct NTFS_DE *e; + int diff; + struct inode *inode = NULL; + struct ntfs_fnd *fnd_a = NULL; + + if (!fnd) { + fnd_a = fnd_get(); + if (!fnd_a) { + err = -ENOMEM; + goto out; + } + fnd = fnd_a; + } + + err = indx_find(&ni->dir, ni, NULL, uni, 0, sbi, &diff, &e, fnd); + + if (err) + goto out; + + if (diff) { + err = -ENOENT; + goto out; + } + + inode = ntfs_iget5(sb, &e->ref, uni); + if (!IS_ERR(inode) && is_bad_inode(inode)) { + iput(inode); + err = -EINVAL; + } +out: + fnd_put(fnd_a); + + return err == -ENOENT ? NULL : err ? ERR_PTR(err) : inode; +} + +static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, + const struct NTFS_DE *e, u8 *name, + struct dir_context *ctx) +{ + const struct ATTR_FILE_NAME *fname; + unsigned long ino; + int name_len; + u32 dt_type; + + fname = Add2Ptr(e, sizeof(struct NTFS_DE)); + + if (fname->type == FILE_NAME_DOS) + return 0; + + if (!mi_is_ref(&ni->mi, &fname->home)) + return 0; + + ino = ino_get(&e->ref); + + if (ino == MFT_REC_ROOT) + return 0; + + /* Skip meta files. Unless option to show metafiles is set. */ + if (!sbi->options->showmeta && ntfs_is_meta_file(sbi, ino)) + return 0; + + if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) + return 0; + + name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name, + PATH_MAX); + if (name_len <= 0) { + ntfs_warn(sbi->sb, "failed to convert name for inode %lx.", + ino); + return 0; + } + + /* NTFS: symlinks are "dir + reparse" or "file + reparse" */ + if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) + dt_type = DT_LNK; + else + dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + + return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); +} + +/* + * ntfs_read_hdr - Helper function for ntfs_readdir(). + */ +static int ntfs_read_hdr(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, + const struct INDEX_HDR *hdr, u64 vbo, u64 pos, + u8 *name, struct dir_context *ctx) +{ + int err; + const struct NTFS_DE *e; + u32 e_size; + u32 end = le32_to_cpu(hdr->used); + u32 off = le32_to_cpu(hdr->de_off); + + for (;; off += e_size) { + if (off + sizeof(struct NTFS_DE) > end) + return -1; + + e = Add2Ptr(hdr, off); + e_size = le16_to_cpu(e->size); + if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) + return -1; + + if (de_is_last(e)) + return 0; + + /* Skip already enumerated. */ + if (vbo + off < pos) + continue; + + if (le16_to_cpu(e->key_size) < SIZEOF_ATTRIBUTE_FILENAME) + return -1; + + ctx->pos = vbo + off; + + /* Submit the name to the filldir callback. */ + err = ntfs_filldir(sbi, ni, e, name, ctx); + if (err) + return err; + } +} + +/* + * ntfs_readdir - file_operations::iterate_shared + * + * Use non sorted enumeration. + * We have an example of broken volume where sorted enumeration + * counts each name twice. + */ +static int ntfs_readdir(struct file *file, struct dir_context *ctx) +{ + const struct INDEX_ROOT *root; + u64 vbo; + size_t bit; + loff_t eod; + int err = 0; + struct inode *dir = file_inode(file); + struct ntfs_inode *ni = ntfs_i(dir); + struct super_block *sb = dir->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + loff_t i_size = i_size_read(dir); + u32 pos = ctx->pos; + u8 *name = NULL; + struct indx_node *node = NULL; + u8 index_bits = ni->dir.index_bits; + + /* Name is a buffer of PATH_MAX length. */ + static_assert(NTFS_NAME_LEN * 4 < PATH_MAX); + + eod = i_size + sbi->record_size; + + if (pos >= eod) + return 0; + + if (!dir_emit_dots(file, ctx)) + return 0; + + /* Allocate PATH_MAX bytes. */ + name = __getname(); + if (!name) + return -ENOMEM; + + if (!ni->mi_loaded && ni->attr_list.size) { + /* + * Directory inode is locked for read. + * Load all subrecords to avoid 'write' access to 'ni' during + * directory reading. + */ + ni_lock(ni); + if (!ni->mi_loaded && ni->attr_list.size) { + err = ni_load_all_mi(ni); + if (!err) + ni->mi_loaded = true; + } + ni_unlock(ni); + if (err) + goto out; + } + + root = indx_get_root(&ni->dir, ni, NULL, NULL); + if (!root) { + err = -EINVAL; + goto out; + } + + if (pos >= sbi->record_size) { + bit = (pos - sbi->record_size) >> index_bits; + } else { + err = ntfs_read_hdr(sbi, ni, &root->ihdr, 0, pos, name, ctx); + if (err) + goto out; + bit = 0; + } + + if (!i_size) { + ctx->pos = eod; + goto out; + } + + for (;;) { + vbo = (u64)bit << index_bits; + if (vbo >= i_size) { + ctx->pos = eod; + goto out; + } + + err = indx_used_bit(&ni->dir, ni, &bit); + if (err) + goto out; + + if (bit == MINUS_ONE_T) { + ctx->pos = eod; + goto out; + } + + vbo = (u64)bit << index_bits; + if (vbo >= i_size) { + ntfs_inode_err(dir, "Looks like your dir is corrupt"); + err = -EINVAL; + goto out; + } + + err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits, + &node); + if (err) + goto out; + + err = ntfs_read_hdr(sbi, ni, &node->index->ihdr, + vbo + sbi->record_size, pos, name, ctx); + if (err) + goto out; + + bit += 1; + } + +out: + + __putname(name); + put_indx_node(node); + + if (err == -ENOENT) { + err = 0; + ctx->pos = pos; + } + + return err; +} + +static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, + size_t *files) +{ + int err = 0; + struct ntfs_inode *ni = ntfs_i(dir); + struct NTFS_DE *e = NULL; + struct INDEX_ROOT *root; + struct INDEX_HDR *hdr; + const struct ATTR_FILE_NAME *fname; + u32 e_size, off, end; + u64 vbo = 0; + size_t drs = 0, fles = 0, bit = 0; + loff_t i_size = ni->vfs_inode.i_size; + struct indx_node *node = NULL; + u8 index_bits = ni->dir.index_bits; + + if (is_empty) + *is_empty = true; + + root = indx_get_root(&ni->dir, ni, NULL, NULL); + if (!root) + return -EINVAL; + + hdr = &root->ihdr; + + for (;;) { + end = le32_to_cpu(hdr->used); + off = le32_to_cpu(hdr->de_off); + + for (; off + sizeof(struct NTFS_DE) <= end; off += e_size) { + e = Add2Ptr(hdr, off); + e_size = le16_to_cpu(e->size); + if (e_size < sizeof(struct NTFS_DE) || + off + e_size > end) + break; + + if (de_is_last(e)) + break; + + fname = de_get_fname(e); + if (!fname) + continue; + + if (fname->type == FILE_NAME_DOS) + continue; + + if (is_empty) { + *is_empty = false; + if (!dirs && !files) + goto out; + } + + if (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) + drs += 1; + else + fles += 1; + } + + if (vbo >= i_size) + goto out; + + err = indx_used_bit(&ni->dir, ni, &bit); + if (err) + goto out; + + if (bit == MINUS_ONE_T) + goto out; + + vbo = (u64)bit << index_bits; + if (vbo >= i_size) + goto out; + + err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits, + &node); + if (err) + goto out; + + hdr = &node->index->ihdr; + bit += 1; + vbo = (u64)bit << ni->dir.idx2vbn_bits; + } + +out: + put_indx_node(node); + if (dirs) + *dirs = drs; + if (files) + *files = fles; + + return err; +} + +bool dir_is_empty(struct inode *dir) +{ + bool is_empty = false; + + ntfs_dir_count(dir, &is_empty, NULL, NULL); + + return is_empty; +} + +// clang-format off +const struct file_operations ntfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = ntfs_readdir, + .fsync = generic_file_fsync, + .open = ntfs_file_open, +}; +// clang-format on diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c new file mode 100644 index 000000000..f31c0389a --- /dev/null +++ b/fs/ntfs3/file.c @@ -0,0 +1,1279 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * Regular file handling primitives for NTFS-based filesystems. + * + */ + +#include <linux/backing-dev.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/compat.h> +#include <linux/falloc.h> +#include <linux/fiemap.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) +{ + struct fstrim_range __user *user_range; + struct fstrim_range range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bdev_max_discard_sectors(sbi->sb->s_bdev)) + return -EOPNOTSUPP; + + user_range = (struct fstrim_range __user *)arg; + if (copy_from_user(&range, user_range, sizeof(range))) + return -EFAULT; + + range.minlen = max_t(u32, range.minlen, + bdev_discard_granularity(sbi->sb->s_bdev)); + + err = ntfs_trim_fs(sbi, &range); + if (err < 0) + return err; + + if (copy_to_user(user_range, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + + switch (cmd) { + case FITRIM: + return ntfs_ioctl_fitrim(sbi, arg); + } + return -ENOTTY; /* Inappropriate ioctl for device. */ +} + +#ifdef CONFIG_COMPAT +static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) + +{ + return ntfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +/* + * ntfs_getattr - inode_operations::getattr + */ +int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, u32 flags) +{ + struct inode *inode = d_inode(path->dentry); + struct ntfs_inode *ni = ntfs_i(inode); + + if (is_compressed(ni)) + stat->attributes |= STATX_ATTR_COMPRESSED; + + if (is_encrypted(ni)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + + stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED; + + generic_fillattr(mnt_userns, inode, stat); + + stat->result_mask |= STATX_BTIME; + stat->btime = ni->i_crtime; + stat->blksize = ni->mi.sbi->cluster_size; /* 512, 1K, ..., 2M */ + + return 0; +} + +static int ntfs_extend_initialized_size(struct file *file, + struct ntfs_inode *ni, + const loff_t valid, + const loff_t new_valid) +{ + struct inode *inode = &ni->vfs_inode; + struct address_space *mapping = inode->i_mapping; + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + loff_t pos = valid; + int err; + + if (is_resident(ni)) { + ni->i_valid = new_valid; + return 0; + } + + WARN_ON(is_compressed(ni)); + WARN_ON(valid >= new_valid); + + for (;;) { + u32 zerofrom, len; + struct page *page; + u8 bits; + CLST vcn, lcn, clen; + + if (is_sparsed(ni)) { + bits = sbi->cluster_bits; + vcn = pos >> bits; + + err = attr_data_get_block(ni, vcn, 0, &lcn, &clen, + NULL); + if (err) + goto out; + + if (lcn == SPARSE_LCN) { + loff_t vbo = (loff_t)vcn << bits; + loff_t to = vbo + ((loff_t)clen << bits); + + if (to <= new_valid) { + ni->i_valid = to; + pos = to; + goto next; + } + + if (vbo < pos) { + pos = vbo; + } else { + to = (new_valid >> bits) << bits; + if (pos < to) { + ni->i_valid = to; + pos = to; + goto next; + } + } + } + } + + zerofrom = pos & (PAGE_SIZE - 1); + len = PAGE_SIZE - zerofrom; + + if (pos + len > new_valid) + len = new_valid - pos; + + err = ntfs_write_begin(file, mapping, pos, len, &page, NULL); + if (err) + goto out; + + zero_user_segment(page, zerofrom, PAGE_SIZE); + + /* This function in any case puts page. */ + err = ntfs_write_end(file, mapping, pos, len, len, page, NULL); + if (err < 0) + goto out; + pos += len; + +next: + if (pos >= new_valid) + break; + + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } + + return 0; + +out: + ni->i_valid = valid; + ntfs_inode_warn(inode, "failed to extend initialized size to %llx.", + new_valid); + return err; +} + +/* + * ntfs_zero_range - Helper function for punch_hole. + * + * It zeroes a range [vbo, vbo_to). + */ +static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) +{ + int err = 0; + struct address_space *mapping = inode->i_mapping; + u32 blocksize = 1 << inode->i_blkbits; + pgoff_t idx = vbo >> PAGE_SHIFT; + u32 z_start = vbo & (PAGE_SIZE - 1); + pgoff_t idx_end = (vbo_to + PAGE_SIZE - 1) >> PAGE_SHIFT; + loff_t page_off; + struct buffer_head *head, *bh; + u32 bh_next, bh_off, z_end; + sector_t iblock; + struct page *page; + + for (; idx < idx_end; idx += 1, z_start = 0) { + page_off = (loff_t)idx << PAGE_SHIFT; + z_end = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) + : PAGE_SIZE; + iblock = page_off >> inode->i_blkbits; + + page = find_or_create_page(mapping, idx, + mapping_gfp_constraint(mapping, + ~__GFP_FS)); + if (!page) + return -ENOMEM; + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + bh = head = page_buffers(page); + bh_off = 0; + do { + bh_next = bh_off + blocksize; + + if (bh_next <= z_start || bh_off >= z_end) + continue; + + if (!buffer_mapped(bh)) { + ntfs_get_block(inode, iblock, bh, 0); + /* Unmapped? It's a hole - nothing to do. */ + if (!buffer_mapped(bh)) + continue; + } + + /* Ok, it's mapped. Make sure it's up-to-date. */ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(REQ_OP_READ, bh); + + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + unlock_page(page); + put_page(page); + err = -EIO; + goto out; + } + } + + mark_buffer_dirty(bh); + + } while (bh_off = bh_next, iblock += 1, + head != (bh = bh->b_this_page)); + + zero_user_segment(page, z_start, z_end); + + unlock_page(page); + put_page(page); + cond_resched(); + } +out: + mark_inode_dirty(inode); + return err; +} + +/* + * ntfs_sparse_cluster - Helper function to zero a new allocated clusters. + * + * NOTE: 512 <= cluster size <= 2M + */ +void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, + CLST len) +{ + struct address_space *mapping = inode->i_mapping; + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + u64 vbo = (u64)vcn << sbi->cluster_bits; + u64 bytes = (u64)len << sbi->cluster_bits; + u32 blocksize = 1 << inode->i_blkbits; + pgoff_t idx0 = page0 ? page0->index : -1; + loff_t vbo_clst = vbo & sbi->cluster_mask_inv; + loff_t end = ntfs_up_cluster(sbi, vbo + bytes); + pgoff_t idx = vbo_clst >> PAGE_SHIFT; + u32 from = vbo_clst & (PAGE_SIZE - 1); + pgoff_t idx_end = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + loff_t page_off; + u32 to; + bool partial; + struct page *page; + + for (; idx < idx_end; idx += 1, from = 0) { + page = idx == idx0 ? page0 : grab_cache_page(mapping, idx); + + if (!page) + continue; + + page_off = (loff_t)idx << PAGE_SHIFT; + to = (page_off + PAGE_SIZE) > end ? (end - page_off) + : PAGE_SIZE; + partial = false; + + if ((from || PAGE_SIZE != to) && + likely(!page_has_buffers(page))) { + create_empty_buffers(page, blocksize, 0); + } + + if (page_has_buffers(page)) { + struct buffer_head *head, *bh; + u32 bh_off = 0; + + bh = head = page_buffers(page); + do { + u32 bh_next = bh_off + blocksize; + + if (from <= bh_off && bh_next <= to) { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } else if (!buffer_uptodate(bh)) { + partial = true; + } + bh_off = bh_next; + } while (head != (bh = bh->b_this_page)); + } + + zero_user_segment(page, from, to); + + if (!partial) { + if (!PageUptodate(page)) + SetPageUptodate(page); + set_page_dirty(page); + } + + if (idx != idx0) { + unlock_page(page); + put_page(page); + } + cond_resched(); + } + mark_inode_dirty(inode); +} + +/* + * ntfs_file_mmap - file_operations::mmap + */ +static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + u64 from = ((u64)vma->vm_pgoff << PAGE_SHIFT); + bool rw = vma->vm_flags & VM_WRITE; + int err; + + if (is_encrypted(ni)) { + ntfs_inode_warn(inode, "mmap encrypted not supported"); + return -EOPNOTSUPP; + } + + if (is_dedup(ni)) { + ntfs_inode_warn(inode, "mmap deduplicated not supported"); + return -EOPNOTSUPP; + } + + if (is_compressed(ni) && rw) { + ntfs_inode_warn(inode, "mmap(write) compressed not supported"); + return -EOPNOTSUPP; + } + + if (rw) { + u64 to = min_t(loff_t, i_size_read(inode), + from + vma->vm_end - vma->vm_start); + + if (is_sparsed(ni)) { + /* Allocate clusters for rw map. */ + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + CLST lcn, len; + CLST vcn = from >> sbi->cluster_bits; + CLST end = bytes_to_cluster(sbi, to); + bool new; + + for (; vcn < end; vcn += len) { + err = attr_data_get_block(ni, vcn, 1, &lcn, + &len, &new); + if (err) + goto out; + + if (!new) + continue; + ntfs_sparse_cluster(inode, NULL, vcn, 1); + } + } + + if (ni->i_valid < to) { + if (!inode_trylock(inode)) { + err = -EAGAIN; + goto out; + } + err = ntfs_extend_initialized_size(file, ni, + ni->i_valid, to); + inode_unlock(inode); + if (err) + goto out; + } + } + + err = generic_file_mmap(file, vma); +out: + return err; +} + +static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, + struct file *file) +{ + struct ntfs_inode *ni = ntfs_i(inode); + struct address_space *mapping = inode->i_mapping; + loff_t end = pos + count; + bool extend_init = file && pos > ni->i_valid; + int err; + + if (end <= inode->i_size && !extend_init) + return 0; + + /* Mark rw ntfs as dirty. It will be cleared at umount. */ + ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_DIRTY); + + if (end > inode->i_size) { + err = ntfs_set_size(inode, end); + if (err) + goto out; + inode->i_size = end; + } + + if (extend_init && !is_compressed(ni)) { + err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos); + if (err) + goto out; + } else { + err = 0; + } + + inode->i_ctime = inode->i_mtime = current_time(inode); + mark_inode_dirty(inode); + + if (IS_SYNC(inode)) { + int err2; + + err = filemap_fdatawrite_range(mapping, pos, end - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (!err) + err = filemap_fdatawait_range(mapping, pos, end - 1); + } + +out: + return err; +} + +static int ntfs_truncate(struct inode *inode, loff_t new_size) +{ + struct super_block *sb = inode->i_sb; + struct ntfs_inode *ni = ntfs_i(inode); + int err, dirty = 0; + u64 new_valid; + + if (!S_ISREG(inode->i_mode)) + return 0; + + if (is_compressed(ni)) { + if (ni->i_valid > new_size) + ni->i_valid = new_size; + } else { + err = block_truncate_page(inode->i_mapping, new_size, + ntfs_get_block); + if (err) + return err; + } + + new_valid = ntfs_up_block(sb, min_t(u64, ni->i_valid, new_size)); + + truncate_setsize(inode, new_size); + + ni_lock(ni); + + down_write(&ni->file.run_lock); + err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size, + &new_valid, ni->mi.sbi->options->prealloc, NULL); + up_write(&ni->file.run_lock); + + if (new_valid < ni->i_valid) + ni->i_valid = new_valid; + + ni_unlock(ni); + + ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; + inode->i_ctime = inode->i_mtime = current_time(inode); + if (!IS_DIRSYNC(inode)) { + dirty = 1; + } else { + err = ntfs_sync_inode(inode); + if (err) + return err; + } + + if (dirty) + mark_inode_dirty(inode); + + /*ntfs_flush_inodes(inode->i_sb, inode, NULL);*/ + + return 0; +} + +/* + * ntfs_fallocate + * + * Preallocate space for a file. This implements ntfs's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests 'len' bytes at 'vbo'. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. + */ +static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) +{ + struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_inode *ni = ntfs_i(inode); + loff_t end = vbo + len; + loff_t vbo_down = round_down(vbo, PAGE_SIZE); + bool is_supported_holes = is_sparsed(ni) || is_compressed(ni); + loff_t i_size, new_size; + bool map_locked; + int err; + + /* No support for dir. */ + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + /* + * vfs_fallocate checks all possible combinations of mode. + * Do additional checks here before ntfs_set_state(dirty). + */ + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (!is_supported_holes) + return -EOPNOTSUPP; + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + } else if (mode & FALLOC_FL_INSERT_RANGE) { + if (!is_supported_holes) + return -EOPNOTSUPP; + } else if (mode & + ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)) { + ntfs_inode_warn(inode, "fallocate(0x%x) is not supported", + mode); + return -EOPNOTSUPP; + } + + ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); + + inode_lock(inode); + i_size = inode->i_size; + new_size = max(end, i_size); + map_locked = false; + + if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) { + /* Should never be here, see ntfs_file_open. */ + err = -EOPNOTSUPP; + goto out; + } + + if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | + FALLOC_FL_INSERT_RANGE)) { + inode_dio_wait(inode); + filemap_invalidate_lock(mapping); + map_locked = true; + } + + if (mode & FALLOC_FL_PUNCH_HOLE) { + u32 frame_size; + loff_t mask, vbo_a, end_a, tmp; + + err = filemap_write_and_wait_range(mapping, vbo, end - 1); + if (err) + goto out; + + err = filemap_write_and_wait_range(mapping, end, LLONG_MAX); + if (err) + goto out; + + truncate_pagecache(inode, vbo_down); + + ni_lock(ni); + err = attr_punch_hole(ni, vbo, len, &frame_size); + ni_unlock(ni); + if (err != E_NTFS_NOTALIGNED) + goto out; + + /* Process not aligned punch. */ + mask = frame_size - 1; + vbo_a = (vbo + mask) & ~mask; + end_a = end & ~mask; + + tmp = min(vbo_a, end); + if (tmp > vbo) { + err = ntfs_zero_range(inode, vbo, tmp); + if (err) + goto out; + } + + if (vbo < end_a && end_a < end) { + err = ntfs_zero_range(inode, end_a, end); + if (err) + goto out; + } + + /* Aligned punch_hole */ + if (end_a > vbo_a) { + ni_lock(ni); + err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL); + ni_unlock(ni); + } + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + /* + * Write tail of the last page before removed range since + * it will get removed from the page cache below. + */ + err = filemap_write_and_wait_range(mapping, vbo_down, vbo); + if (err) + goto out; + + /* + * Write data that will be shifted to preserve them + * when discarding page cache below. + */ + err = filemap_write_and_wait_range(mapping, end, LLONG_MAX); + if (err) + goto out; + + truncate_pagecache(inode, vbo_down); + + ni_lock(ni); + err = attr_collapse_range(ni, vbo, len); + ni_unlock(ni); + } else if (mode & FALLOC_FL_INSERT_RANGE) { + /* Check new size. */ + err = inode_newsize_ok(inode, new_size); + if (err) + goto out; + + /* Write out all dirty pages. */ + err = filemap_write_and_wait_range(mapping, vbo_down, + LLONG_MAX); + if (err) + goto out; + truncate_pagecache(inode, vbo_down); + + ni_lock(ni); + err = attr_insert_range(ni, vbo, len); + ni_unlock(ni); + } else { + /* Check new size. */ + + /* generic/213: expected -ENOSPC instead of -EFBIG. */ + if (!is_supported_holes) { + loff_t to_alloc = new_size - inode_get_bytes(inode); + + if (to_alloc > 0 && + (to_alloc >> sbi->cluster_bits) > + wnd_zeroes(&sbi->used.bitmap)) { + err = -ENOSPC; + goto out; + } + } + + err = inode_newsize_ok(inode, new_size); + if (err) + goto out; + + /* + * Allocate clusters, do not change 'valid' size. + */ + err = ntfs_set_size(inode, new_size); + if (err) + goto out; + + if (is_supported_holes) { + CLST vcn_v = ni->i_valid >> sbi->cluster_bits; + CLST vcn = vbo >> sbi->cluster_bits; + CLST cend = bytes_to_cluster(sbi, end); + CLST lcn, clen; + bool new; + + /* + * Allocate but do not zero new clusters. (see below comments) + * This breaks security: One can read unused on-disk areas. + * Zeroing these clusters may be too long. + * Maybe we should check here for root rights? + */ + for (; vcn < cend; vcn += clen) { + err = attr_data_get_block(ni, vcn, cend - vcn, + &lcn, &clen, &new); + if (err) + goto out; + if (!new || vcn >= vcn_v) + continue; + + /* + * Unwritten area. + * NTFS is not able to store several unwritten areas. + * Activate 'ntfs_sparse_cluster' to zero new allocated clusters. + * + * Dangerous in case: + * 1G of sparsed clusters + 1 cluster of data => + * valid_size == 1G + 1 cluster + * fallocate(1G) will zero 1G and this can be very long + * xfstest 016/086 will fail without 'ntfs_sparse_cluster'. + */ + ntfs_sparse_cluster(inode, NULL, vcn, + min(vcn_v - vcn, clen)); + } + } + + if (mode & FALLOC_FL_KEEP_SIZE) { + ni_lock(ni); + /* True - Keep preallocated. */ + err = attr_set_size(ni, ATTR_DATA, NULL, 0, + &ni->file.run, i_size, &ni->i_valid, + true, NULL); + ni_unlock(ni); + } + } + +out: + if (map_locked) + filemap_invalidate_unlock(mapping); + + if (!err) { + inode->i_ctime = inode->i_mtime = current_time(inode); + mark_inode_dirty(inode); + } + + inode_unlock(inode); + return err; +} + +/* + * ntfs3_setattr - inode_operations::setattr + */ +int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) +{ + struct super_block *sb = dentry->d_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct inode *inode = d_inode(dentry); + struct ntfs_inode *ni = ntfs_i(inode); + u32 ia_valid = attr->ia_valid; + umode_t mode = inode->i_mode; + int err; + + if (sbi->options->noacsrules) { + /* "No access rules" - Force any changes of time etc. */ + attr->ia_valid |= ATTR_FORCE; + /* and disable for editing some attributes. */ + attr->ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE); + ia_valid = attr->ia_valid; + } + + err = setattr_prepare(mnt_userns, dentry, attr); + if (err) + goto out; + + if (ia_valid & ATTR_SIZE) { + loff_t oldsize = inode->i_size; + + if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) { + /* Should never be here, see ntfs_file_open(). */ + err = -EOPNOTSUPP; + goto out; + } + inode_dio_wait(inode); + + if (attr->ia_size <= oldsize) + err = ntfs_truncate(inode, attr->ia_size); + else if (attr->ia_size > oldsize) + err = ntfs_extend(inode, attr->ia_size, 0, NULL); + + if (err) + goto out; + + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + } + + setattr_copy(mnt_userns, inode, attr); + + if (mode != inode->i_mode) { + err = ntfs_acl_chmod(mnt_userns, inode); + if (err) + goto out; + + /* Linux 'w' -> Windows 'ro'. */ + if (0222 & inode->i_mode) + ni->std_fa &= ~FILE_ATTRIBUTE_READONLY; + else + ni->std_fa |= FILE_ATTRIBUTE_READONLY; + } + + if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) + ntfs_save_wsl_perm(inode); + mark_inode_dirty(inode); +out: + return err; +} + +static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + + if (is_encrypted(ni)) { + ntfs_inode_warn(inode, "encrypted i/o not supported"); + return -EOPNOTSUPP; + } + + if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { + ntfs_inode_warn(inode, "direct i/o + compressed not supported"); + return -EOPNOTSUPP; + } + +#ifndef CONFIG_NTFS3_LZX_XPRESS + if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) { + ntfs_inode_warn( + inode, + "activate CONFIG_NTFS3_LZX_XPRESS to read external compressed files"); + return -EOPNOTSUPP; + } +#endif + + if (is_dedup(ni)) { + ntfs_inode_warn(inode, "read deduplicated not supported"); + return -EOPNOTSUPP; + } + + return generic_file_read_iter(iocb, iter); +} + +/* + * ntfs_get_frame_pages + * + * Return: Array of locked pages. + */ +static int ntfs_get_frame_pages(struct address_space *mapping, pgoff_t index, + struct page **pages, u32 pages_per_frame, + bool *frame_uptodate) +{ + gfp_t gfp_mask = mapping_gfp_mask(mapping); + u32 npages; + + *frame_uptodate = true; + + for (npages = 0; npages < pages_per_frame; npages++, index++) { + struct page *page; + + page = find_or_create_page(mapping, index, gfp_mask); + if (!page) { + while (npages--) { + page = pages[npages]; + unlock_page(page); + put_page(page); + } + + return -ENOMEM; + } + + if (!PageUptodate(page)) + *frame_uptodate = false; + + pages[npages] = page; + } + + return 0; +} + +/* + * ntfs_compress_write - Helper for ntfs_file_write_iter() (compressed files). + */ +static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) +{ + int err; + struct file *file = iocb->ki_filp; + size_t count = iov_iter_count(from); + loff_t pos = iocb->ki_pos; + struct inode *inode = file_inode(file); + loff_t i_size = inode->i_size; + struct address_space *mapping = inode->i_mapping; + struct ntfs_inode *ni = ntfs_i(inode); + u64 valid = ni->i_valid; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct page *page, **pages = NULL; + size_t written = 0; + u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; + u32 frame_size = 1u << frame_bits; + u32 pages_per_frame = frame_size >> PAGE_SHIFT; + u32 ip, off; + CLST frame; + u64 frame_vbo; + pgoff_t index; + bool frame_uptodate; + + if (frame_size < PAGE_SIZE) { + /* + * frame_size == 8K if cluster 512 + * frame_size == 64K if cluster 4096 + */ + ntfs_inode_warn(inode, "page size is bigger than frame size"); + return -EOPNOTSUPP; + } + + pages = kmalloc_array(pages_per_frame, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + + current->backing_dev_info = inode_to_bdi(inode); + err = file_remove_privs(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + + /* Zero range [valid : pos). */ + while (valid < pos) { + CLST lcn, clen; + + frame = valid >> frame_bits; + frame_vbo = valid & ~(frame_size - 1); + off = valid & (frame_size - 1); + + err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 0, &lcn, + &clen, NULL); + if (err) + goto out; + + if (lcn == SPARSE_LCN) { + ni->i_valid = valid = + frame_vbo + ((u64)clen << sbi->cluster_bits); + continue; + } + + /* Load full frame. */ + err = ntfs_get_frame_pages(mapping, frame_vbo >> PAGE_SHIFT, + pages, pages_per_frame, + &frame_uptodate); + if (err) + goto out; + + if (!frame_uptodate && off) { + err = ni_read_frame(ni, frame_vbo, pages, + pages_per_frame); + if (err) { + for (ip = 0; ip < pages_per_frame; ip++) { + page = pages[ip]; + unlock_page(page); + put_page(page); + } + goto out; + } + } + + ip = off >> PAGE_SHIFT; + off = offset_in_page(valid); + for (; ip < pages_per_frame; ip++, off = 0) { + page = pages[ip]; + zero_user_segment(page, off, PAGE_SIZE); + flush_dcache_page(page); + SetPageUptodate(page); + } + + ni_lock(ni); + err = ni_write_frame(ni, pages, pages_per_frame); + ni_unlock(ni); + + for (ip = 0; ip < pages_per_frame; ip++) { + page = pages[ip]; + SetPageUptodate(page); + unlock_page(page); + put_page(page); + } + + if (err) + goto out; + + ni->i_valid = valid = frame_vbo + frame_size; + } + + /* Copy user data [pos : pos + count). */ + while (count) { + size_t copied, bytes; + + off = pos & (frame_size - 1); + bytes = frame_size - off; + if (bytes > count) + bytes = count; + + frame_vbo = pos & ~(frame_size - 1); + index = frame_vbo >> PAGE_SHIFT; + + if (unlikely(fault_in_iov_iter_readable(from, bytes))) { + err = -EFAULT; + goto out; + } + + /* Load full frame. */ + err = ntfs_get_frame_pages(mapping, index, pages, + pages_per_frame, &frame_uptodate); + if (err) + goto out; + + if (!frame_uptodate) { + loff_t to = pos + bytes; + + if (off || (to < i_size && (to & (frame_size - 1)))) { + err = ni_read_frame(ni, frame_vbo, pages, + pages_per_frame); + if (err) { + for (ip = 0; ip < pages_per_frame; + ip++) { + page = pages[ip]; + unlock_page(page); + put_page(page); + } + goto out; + } + } + } + + WARN_ON(!bytes); + copied = 0; + ip = off >> PAGE_SHIFT; + off = offset_in_page(pos); + + /* Copy user data to pages. */ + for (;;) { + size_t cp, tail = PAGE_SIZE - off; + + page = pages[ip]; + cp = copy_page_from_iter_atomic(page, off, + min(tail, bytes), from); + flush_dcache_page(page); + + copied += cp; + bytes -= cp; + if (!bytes || !cp) + break; + + if (cp < tail) { + off += cp; + } else { + ip++; + off = 0; + } + } + + ni_lock(ni); + err = ni_write_frame(ni, pages, pages_per_frame); + ni_unlock(ni); + + for (ip = 0; ip < pages_per_frame; ip++) { + page = pages[ip]; + ClearPageDirty(page); + SetPageUptodate(page); + unlock_page(page); + put_page(page); + } + + if (err) + goto out; + + /* + * We can loop for a long time in here. Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. + */ + cond_resched(); + + pos += copied; + written += copied; + + count = iov_iter_count(from); + } + +out: + kfree(pages); + + current->backing_dev_info = NULL; + + if (err < 0) + return err; + + iocb->ki_pos += written; + if (iocb->ki_pos > ni->i_valid) + ni->i_valid = iocb->ki_pos; + + return written; +} + +/* + * ntfs_file_write_iter - file_operations::write_iter + */ +static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + ssize_t ret; + struct ntfs_inode *ni = ntfs_i(inode); + + if (is_encrypted(ni)) { + ntfs_inode_warn(inode, "encrypted i/o not supported"); + return -EOPNOTSUPP; + } + + if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { + ntfs_inode_warn(inode, "direct i/o + compressed not supported"); + return -EOPNOTSUPP; + } + + if (is_dedup(ni)) { + ntfs_inode_warn(inode, "write into deduplicated not supported"); + return -EOPNOTSUPP; + } + + if (!inode_trylock(inode)) { + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) { + /* Should never be here, see ntfs_file_open(). */ + ret = -EOPNOTSUPP; + goto out; + } + + ret = ntfs_extend(inode, iocb->ki_pos, ret, file); + if (ret) + goto out; + + ret = is_compressed(ni) ? ntfs_compress_write(iocb, from) + : __generic_file_write_iter(iocb, from); + +out: + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + + return ret; +} + +/* + * ntfs_file_open - file_operations::open + */ +int ntfs_file_open(struct inode *inode, struct file *file) +{ + struct ntfs_inode *ni = ntfs_i(inode); + + if (unlikely((is_compressed(ni) || is_encrypted(ni)) && + (file->f_flags & O_DIRECT))) { + return -EOPNOTSUPP; + } + + /* Decompress "external compressed" file if opened for rw. */ + if ((ni->ni_flags & NI_FLAG_COMPRESSED_MASK) && + (file->f_flags & (O_WRONLY | O_RDWR | O_TRUNC))) { +#ifdef CONFIG_NTFS3_LZX_XPRESS + int err = ni_decompress_file(ni); + + if (err) + return err; +#else + ntfs_inode_warn( + inode, + "activate CONFIG_NTFS3_LZX_XPRESS to write external compressed files"); + return -EOPNOTSUPP; +#endif + } + + return generic_file_open(inode, file); +} + +/* + * ntfs_file_release - file_operations::release + */ +static int ntfs_file_release(struct inode *inode, struct file *file) +{ + struct ntfs_inode *ni = ntfs_i(inode); + struct ntfs_sb_info *sbi = ni->mi.sbi; + int err = 0; + + /* If we are last writer on the inode, drop the block reservation. */ + if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) && + atomic_read(&inode->i_writecount) == 1)) { + ni_lock(ni); + down_write(&ni->file.run_lock); + + err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, + inode->i_size, &ni->i_valid, false, NULL); + + up_write(&ni->file.run_lock); + ni_unlock(ni); + } + return err; +} + +/* + * ntfs_fiemap - file_operations::fiemap + */ +int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + int err; + struct ntfs_inode *ni = ntfs_i(inode); + + err = fiemap_prep(inode, fieinfo, start, &len, ~FIEMAP_FLAG_XATTR); + if (err) + return err; + + ni_lock(ni); + + err = ni_fiemap(ni, fieinfo, start, len); + + ni_unlock(ni); + + return err; +} + +// clang-format off +const struct inode_operations ntfs_file_inode_operations = { + .getattr = ntfs_getattr, + .setattr = ntfs3_setattr, + .listxattr = ntfs_listxattr, + .permission = ntfs_permission, + .get_acl = ntfs_get_acl, + .set_acl = ntfs_set_acl, + .fiemap = ntfs_fiemap, +}; + +const struct file_operations ntfs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = ntfs_file_read_iter, + .write_iter = ntfs_file_write_iter, + .unlocked_ioctl = ntfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ntfs_compat_ioctl, +#endif + .splice_read = generic_file_splice_read, + .mmap = ntfs_file_mmap, + .open = ntfs_file_open, + .fsync = generic_file_fsync, + .splice_write = iter_file_splice_write, + .fallocate = ntfs_fallocate, + .release = ntfs_file_release, +}; +// clang-format on diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c new file mode 100644 index 000000000..bb7e33c24 --- /dev/null +++ b/fs/ntfs3/frecord.c @@ -0,0 +1,3368 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/fiemap.h> +#include <linux/fs.h> +#include <linux/minmax.h> +#include <linux/vmalloc.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" +#ifdef CONFIG_NTFS3_LZX_XPRESS +#include "lib/lib.h" +#endif + +static struct mft_inode *ni_ins_mi(struct ntfs_inode *ni, struct rb_root *tree, + CLST ino, struct rb_node *ins) +{ + struct rb_node **p = &tree->rb_node; + struct rb_node *pr = NULL; + + while (*p) { + struct mft_inode *mi; + + pr = *p; + mi = rb_entry(pr, struct mft_inode, node); + if (mi->rno > ino) + p = &pr->rb_left; + else if (mi->rno < ino) + p = &pr->rb_right; + else + return mi; + } + + if (!ins) + return NULL; + + rb_link_node(ins, pr, p); + rb_insert_color(ins, tree); + return rb_entry(ins, struct mft_inode, node); +} + +/* + * ni_find_mi - Find mft_inode by record number. + */ +static struct mft_inode *ni_find_mi(struct ntfs_inode *ni, CLST rno) +{ + return ni_ins_mi(ni, &ni->mi_tree, rno, NULL); +} + +/* + * ni_add_mi - Add new mft_inode into ntfs_inode. + */ +static void ni_add_mi(struct ntfs_inode *ni, struct mft_inode *mi) +{ + ni_ins_mi(ni, &ni->mi_tree, mi->rno, &mi->node); +} + +/* + * ni_remove_mi - Remove mft_inode from ntfs_inode. + */ +void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi) +{ + rb_erase(&mi->node, &ni->mi_tree); +} + +/* + * ni_std - Return: Pointer into std_info from primary record. + */ +struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni) +{ + const struct ATTRIB *attr; + + attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); + return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) + : NULL; +} + +/* + * ni_std5 + * + * Return: Pointer into std_info from primary record. + */ +struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni) +{ + const struct ATTRIB *attr; + + attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); + + return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) + : NULL; +} + +/* + * ni_clear - Clear resources allocated by ntfs_inode. + */ +void ni_clear(struct ntfs_inode *ni) +{ + struct rb_node *node; + + if (!ni->vfs_inode.i_nlink && is_rec_inuse(ni->mi.mrec)) + ni_delete_all(ni); + + al_destroy(ni); + + for (node = rb_first(&ni->mi_tree); node;) { + struct rb_node *next = rb_next(node); + struct mft_inode *mi = rb_entry(node, struct mft_inode, node); + + rb_erase(node, &ni->mi_tree); + mi_put(mi); + node = next; + } + + /* Bad inode always has mode == S_IFREG. */ + if (ni->ni_flags & NI_FLAG_DIR) + indx_clear(&ni->dir); + else { + run_close(&ni->file.run); +#ifdef CONFIG_NTFS3_LZX_XPRESS + if (ni->file.offs_page) { + /* On-demand allocated page for offsets. */ + put_page(ni->file.offs_page); + ni->file.offs_page = NULL; + } +#endif + } + + mi_clear(&ni->mi); +} + +/* + * ni_load_mi_ex - Find mft_inode by record number. + */ +int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi) +{ + int err; + struct mft_inode *r; + + r = ni_find_mi(ni, rno); + if (r) + goto out; + + err = mi_get(ni->mi.sbi, rno, &r); + if (err) + return err; + + ni_add_mi(ni, r); + +out: + if (mi) + *mi = r; + return 0; +} + +/* + * ni_load_mi - Load mft_inode corresponded list_entry. + */ +int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le, + struct mft_inode **mi) +{ + CLST rno; + + if (!le) { + *mi = &ni->mi; + return 0; + } + + rno = ino_get(&le->ref); + if (rno == ni->mi.rno) { + *mi = &ni->mi; + return 0; + } + return ni_load_mi_ex(ni, rno, mi); +} + +/* + * ni_find_attr + * + * Return: Attribute and record this attribute belongs to. + */ +struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY **le_o, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, const CLST *vcn, + struct mft_inode **mi) +{ + struct ATTR_LIST_ENTRY *le; + struct mft_inode *m; + + if (!ni->attr_list.size || + (!name_len && (type == ATTR_LIST || type == ATTR_STD))) { + if (le_o) + *le_o = NULL; + if (mi) + *mi = &ni->mi; + + /* Look for required attribute in primary record. */ + return mi_find_attr(&ni->mi, attr, type, name, name_len, NULL); + } + + /* First look for list entry of required type. */ + le = al_find_ex(ni, le_o ? *le_o : NULL, type, name, name_len, vcn); + if (!le) + return NULL; + + if (le_o) + *le_o = le; + + /* Load record that contains this attribute. */ + if (ni_load_mi(ni, le, &m)) + return NULL; + + /* Look for required attribute. */ + attr = mi_find_attr(m, NULL, type, name, name_len, &le->id); + + if (!attr) + goto out; + + if (!attr->non_res) { + if (vcn && *vcn) + goto out; + } else if (!vcn) { + if (attr->nres.svcn) + goto out; + } else if (le64_to_cpu(attr->nres.svcn) > *vcn || + *vcn > le64_to_cpu(attr->nres.evcn)) { + goto out; + } + + if (mi) + *mi = m; + return attr; + +out: + ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + return NULL; +} + +/* + * ni_enum_attr_ex - Enumerates attributes in ntfs_inode. + */ +struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY **le, + struct mft_inode **mi) +{ + struct mft_inode *mi2; + struct ATTR_LIST_ENTRY *le2; + + /* Do we have an attribute list? */ + if (!ni->attr_list.size) { + *le = NULL; + if (mi) + *mi = &ni->mi; + /* Enum attributes in primary record. */ + return mi_enum_attr(&ni->mi, attr); + } + + /* Get next list entry. */ + le2 = *le = al_enumerate(ni, attr ? *le : NULL); + if (!le2) + return NULL; + + /* Load record that contains the required attribute. */ + if (ni_load_mi(ni, le2, &mi2)) + return NULL; + + if (mi) + *mi = mi2; + + /* Find attribute in loaded record. */ + return rec_find_attr_le(mi2, le2); +} + +/* + * ni_load_attr - Load attribute that contains given VCN. + */ +struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, CLST vcn, + struct mft_inode **pmi) +{ + struct ATTR_LIST_ENTRY *le; + struct ATTRIB *attr; + struct mft_inode *mi; + struct ATTR_LIST_ENTRY *next; + + if (!ni->attr_list.size) { + if (pmi) + *pmi = &ni->mi; + return mi_find_attr(&ni->mi, NULL, type, name, name_len, NULL); + } + + le = al_find_ex(ni, NULL, type, name, name_len, NULL); + if (!le) + return NULL; + + /* + * Unfortunately ATTR_LIST_ENTRY contains only start VCN. + * So to find the ATTRIB segment that contains 'vcn' we should + * enumerate some entries. + */ + if (vcn) { + for (;; le = next) { + next = al_find_ex(ni, le, type, name, name_len, NULL); + if (!next || le64_to_cpu(next->vcn) > vcn) + break; + } + } + + if (ni_load_mi(ni, le, &mi)) + return NULL; + + if (pmi) + *pmi = mi; + + attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); + if (!attr) + return NULL; + + if (!attr->non_res) + return attr; + + if (le64_to_cpu(attr->nres.svcn) <= vcn && + vcn <= le64_to_cpu(attr->nres.evcn)) + return attr; + + return NULL; +} + +/* + * ni_load_all_mi - Load all subrecords. + */ +int ni_load_all_mi(struct ntfs_inode *ni) +{ + int err; + struct ATTR_LIST_ENTRY *le; + + if (!ni->attr_list.size) + return 0; + + le = NULL; + + while ((le = al_enumerate(ni, le))) { + CLST rno = ino_get(&le->ref); + + if (rno == ni->mi.rno) + continue; + + err = ni_load_mi_ex(ni, rno, NULL); + if (err) + return err; + } + + return 0; +} + +/* + * ni_add_subrecord - Allocate + format + attach a new subrecord. + */ +bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi) +{ + struct mft_inode *m; + + m = kzalloc(sizeof(struct mft_inode), GFP_NOFS); + if (!m) + return false; + + if (mi_format_new(m, ni->mi.sbi, rno, 0, ni->mi.rno == MFT_REC_MFT)) { + mi_put(m); + return false; + } + + mi_get_ref(&ni->mi, &m->mrec->parent_ref); + + ni_add_mi(ni, m); + *mi = m; + return true; +} + +/* + * ni_remove_attr - Remove all attributes for the given type/name/id. + */ +int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, size_t name_len, bool base_only, + const __le16 *id) +{ + int err; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + struct mft_inode *mi; + u32 type_in; + int diff; + + if (base_only || type == ATTR_LIST || !ni->attr_list.size) { + attr = mi_find_attr(&ni->mi, NULL, type, name, name_len, id); + if (!attr) + return -ENOENT; + + mi_remove_attr(ni, &ni->mi, attr); + return 0; + } + + type_in = le32_to_cpu(type); + le = NULL; + + for (;;) { + le = al_enumerate(ni, le); + if (!le) + return 0; + +next_le2: + diff = le32_to_cpu(le->type) - type_in; + if (diff < 0) + continue; + + if (diff > 0) + return 0; + + if (le->name_len != name_len) + continue; + + if (name_len && + memcmp(le_name(le), name, name_len * sizeof(short))) + continue; + + if (id && le->id != *id) + continue; + err = ni_load_mi(ni, le, &mi); + if (err) + return err; + + al_remove_le(ni, le); + + attr = mi_find_attr(mi, NULL, type, name, name_len, id); + if (!attr) + return -ENOENT; + + mi_remove_attr(ni, mi, attr); + + if (PtrOffset(ni->attr_list.le, le) >= ni->attr_list.size) + return 0; + goto next_le2; + } +} + +/* + * ni_ins_new_attr - Insert the attribute into record. + * + * Return: Not full constructed attribute or NULL if not possible to create. + */ +static struct ATTRIB * +ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTR_LIST_ENTRY *le, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, u32 asize, u16 name_off, + CLST svcn, struct ATTR_LIST_ENTRY **ins_le) +{ + int err; + struct ATTRIB *attr; + bool le_added = false; + struct MFT_REF ref; + + mi_get_ref(mi, &ref); + + if (type != ATTR_LIST && !le && ni->attr_list.size) { + err = al_add_le(ni, type, name, name_len, svcn, cpu_to_le16(-1), + &ref, &le); + if (err) { + /* No memory or no space. */ + return ERR_PTR(err); + } + le_added = true; + + /* + * al_add_le -> attr_set_size (list) -> ni_expand_list + * which moves some attributes out of primary record + * this means that name may point into moved memory + * reinit 'name' from le. + */ + name = le->name; + } + + attr = mi_insert_attr(mi, type, name, name_len, asize, name_off); + if (!attr) { + if (le_added) + al_remove_le(ni, le); + return NULL; + } + + if (type == ATTR_LIST) { + /* Attr list is not in list entry array. */ + goto out; + } + + if (!le) + goto out; + + /* Update ATTRIB Id and record reference. */ + le->id = attr->id; + ni->attr_list.dirty = true; + le->ref = ref; + +out: + if (ins_le) + *ins_le = le; + return attr; +} + +/* + * ni_repack + * + * Random write access to sparsed or compressed file may result to + * not optimized packed runs. + * Here is the place to optimize it. + */ +static int ni_repack(struct ntfs_inode *ni) +{ + int err = 0; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct mft_inode *mi, *mi_p = NULL; + struct ATTRIB *attr = NULL, *attr_p; + struct ATTR_LIST_ENTRY *le = NULL, *le_p; + CLST alloc = 0; + u8 cluster_bits = sbi->cluster_bits; + CLST svcn, evcn = 0, svcn_p, evcn_p, next_svcn; + u32 roff, rs = sbi->record_size; + struct runs_tree run; + + run_init(&run); + + while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi))) { + if (!attr->non_res) + continue; + + svcn = le64_to_cpu(attr->nres.svcn); + if (svcn != le64_to_cpu(le->vcn)) { + err = -EINVAL; + break; + } + + if (!svcn) { + alloc = le64_to_cpu(attr->nres.alloc_size) >> + cluster_bits; + mi_p = NULL; + } else if (svcn != evcn + 1) { + err = -EINVAL; + break; + } + + evcn = le64_to_cpu(attr->nres.evcn); + + if (svcn > evcn + 1) { + err = -EINVAL; + break; + } + + if (!mi_p) { + /* Do not try if not enogh free space. */ + if (le32_to_cpu(mi->mrec->used) + 8 >= rs) + continue; + + /* Do not try if last attribute segment. */ + if (evcn + 1 == alloc) + continue; + run_close(&run); + } + + roff = le16_to_cpu(attr->nres.run_off); + + if (roff > le32_to_cpu(attr->size)) { + err = -EINVAL; + break; + } + + err = run_unpack(&run, sbi, ni->mi.rno, svcn, evcn, svcn, + Add2Ptr(attr, roff), + le32_to_cpu(attr->size) - roff); + if (err < 0) + break; + + if (!mi_p) { + mi_p = mi; + attr_p = attr; + svcn_p = svcn; + evcn_p = evcn; + le_p = le; + err = 0; + continue; + } + + /* + * Run contains data from two records: mi_p and mi + * Try to pack in one. + */ + err = mi_pack_runs(mi_p, attr_p, &run, evcn + 1 - svcn_p); + if (err) + break; + + next_svcn = le64_to_cpu(attr_p->nres.evcn) + 1; + + if (next_svcn >= evcn + 1) { + /* We can remove this attribute segment. */ + al_remove_le(ni, le); + mi_remove_attr(NULL, mi, attr); + le = le_p; + continue; + } + + attr->nres.svcn = le->vcn = cpu_to_le64(next_svcn); + mi->dirty = true; + ni->attr_list.dirty = true; + + if (evcn + 1 == alloc) { + err = mi_pack_runs(mi, attr, &run, + evcn + 1 - next_svcn); + if (err) + break; + mi_p = NULL; + } else { + mi_p = mi; + attr_p = attr; + svcn_p = next_svcn; + evcn_p = evcn; + le_p = le; + run_truncate_head(&run, next_svcn); + } + } + + if (err) { + ntfs_inode_warn(&ni->vfs_inode, "repack problem"); + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + + /* Pack loaded but not packed runs. */ + if (mi_p) + mi_pack_runs(mi_p, attr_p, &run, evcn_p + 1 - svcn_p); + } + + run_close(&run); + return err; +} + +/* + * ni_try_remove_attr_list + * + * Can we remove attribute list? + * Check the case when primary record contains enough space for all attributes. + */ +static int ni_try_remove_attr_list(struct ntfs_inode *ni) +{ + int err = 0; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr, *attr_list, *attr_ins; + struct ATTR_LIST_ENTRY *le; + struct mft_inode *mi; + u32 asize, free; + struct MFT_REF ref; + struct MFT_REC *mrec; + __le16 id; + + if (!ni->attr_list.dirty) + return 0; + + err = ni_repack(ni); + if (err) + return err; + + attr_list = mi_find_attr(&ni->mi, NULL, ATTR_LIST, NULL, 0, NULL); + if (!attr_list) + return 0; + + asize = le32_to_cpu(attr_list->size); + + /* Free space in primary record without attribute list. */ + free = sbi->record_size - le32_to_cpu(ni->mi.mrec->used) + asize; + mi_get_ref(&ni->mi, &ref); + + le = NULL; + while ((le = al_enumerate(ni, le))) { + if (!memcmp(&le->ref, &ref, sizeof(ref))) + continue; + + if (le->vcn) + return 0; + + mi = ni_find_mi(ni, ino_get(&le->ref)); + if (!mi) + return 0; + + attr = mi_find_attr(mi, NULL, le->type, le_name(le), + le->name_len, &le->id); + if (!attr) + return 0; + + asize = le32_to_cpu(attr->size); + if (asize > free) + return 0; + + free -= asize; + } + + /* Make a copy of primary record to restore if error. */ + mrec = kmemdup(ni->mi.mrec, sbi->record_size, GFP_NOFS); + if (!mrec) + return 0; /* Not critical. */ + + /* It seems that attribute list can be removed from primary record. */ + mi_remove_attr(NULL, &ni->mi, attr_list); + + /* + * Repeat the cycle above and copy all attributes to primary record. + * Do not remove original attributes from subrecords! + * It should be success! + */ + le = NULL; + while ((le = al_enumerate(ni, le))) { + if (!memcmp(&le->ref, &ref, sizeof(ref))) + continue; + + mi = ni_find_mi(ni, ino_get(&le->ref)); + if (!mi) { + /* Should never happened, 'cause already checked. */ + goto out; + } + + attr = mi_find_attr(mi, NULL, le->type, le_name(le), + le->name_len, &le->id); + if (!attr) { + /* Should never happened, 'cause already checked. */ + goto out; + } + asize = le32_to_cpu(attr->size); + + /* Insert into primary record. */ + attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le), + le->name_len, asize, + le16_to_cpu(attr->name_off)); + if (!attr_ins) { + /* + * No space in primary record (already checked). + */ + goto out; + } + + /* Copy all except id. */ + id = attr_ins->id; + memcpy(attr_ins, attr, asize); + attr_ins->id = id; + } + + /* + * Repeat the cycle above and remove all attributes from subrecords. + */ + le = NULL; + while ((le = al_enumerate(ni, le))) { + if (!memcmp(&le->ref, &ref, sizeof(ref))) + continue; + + mi = ni_find_mi(ni, ino_get(&le->ref)); + if (!mi) + continue; + + attr = mi_find_attr(mi, NULL, le->type, le_name(le), + le->name_len, &le->id); + if (!attr) + continue; + + /* Remove from original record. */ + mi_remove_attr(NULL, mi, attr); + } + + run_deallocate(sbi, &ni->attr_list.run, true); + run_close(&ni->attr_list.run); + ni->attr_list.size = 0; + kfree(ni->attr_list.le); + ni->attr_list.le = NULL; + ni->attr_list.dirty = false; + + kfree(mrec); + return 0; +out: + /* Restore primary record. */ + swap(mrec, ni->mi.mrec); + kfree(mrec); + return 0; +} + +/* + * ni_create_attr_list - Generates an attribute list for this primary record. + */ +int ni_create_attr_list(struct ntfs_inode *ni) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + int err; + u32 lsize; + struct ATTRIB *attr; + struct ATTRIB *arr_move[7]; + struct ATTR_LIST_ENTRY *le, *le_b[7]; + struct MFT_REC *rec; + bool is_mft; + CLST rno = 0; + struct mft_inode *mi; + u32 free_b, nb, to_free, rs; + u16 sz; + + is_mft = ni->mi.rno == MFT_REC_MFT; + rec = ni->mi.mrec; + rs = sbi->record_size; + + /* + * Skip estimating exact memory requirement. + * Looks like one record_size is always enough. + */ + le = kmalloc(al_aligned(rs), GFP_NOFS); + if (!le) { + err = -ENOMEM; + goto out; + } + + mi_get_ref(&ni->mi, &le->ref); + ni->attr_list.le = le; + + attr = NULL; + nb = 0; + free_b = 0; + attr = NULL; + + for (; (attr = mi_enum_attr(&ni->mi, attr)); le = Add2Ptr(le, sz)) { + sz = le_size(attr->name_len); + le->type = attr->type; + le->size = cpu_to_le16(sz); + le->name_len = attr->name_len; + le->name_off = offsetof(struct ATTR_LIST_ENTRY, name); + le->vcn = 0; + if (le != ni->attr_list.le) + le->ref = ni->attr_list.le->ref; + le->id = attr->id; + + if (attr->name_len) + memcpy(le->name, attr_name(attr), + sizeof(short) * attr->name_len); + else if (attr->type == ATTR_STD) + continue; + else if (attr->type == ATTR_LIST) + continue; + else if (is_mft && attr->type == ATTR_DATA) + continue; + + if (!nb || nb < ARRAY_SIZE(arr_move)) { + le_b[nb] = le; + arr_move[nb++] = attr; + free_b += le32_to_cpu(attr->size); + } + } + + lsize = PtrOffset(ni->attr_list.le, le); + ni->attr_list.size = lsize; + + to_free = le32_to_cpu(rec->used) + lsize + SIZEOF_RESIDENT; + if (to_free <= rs) { + to_free = 0; + } else { + to_free -= rs; + + if (to_free > free_b) { + err = -EINVAL; + goto out1; + } + } + + /* Allocate child MFT. */ + err = ntfs_look_free_mft(sbi, &rno, is_mft, ni, &mi); + if (err) + goto out1; + + err = -EINVAL; + /* Call mi_remove_attr() in reverse order to keep pointers 'arr_move' valid. */ + while (to_free > 0) { + struct ATTRIB *b = arr_move[--nb]; + u32 asize = le32_to_cpu(b->size); + u16 name_off = le16_to_cpu(b->name_off); + + attr = mi_insert_attr(mi, b->type, Add2Ptr(b, name_off), + b->name_len, asize, name_off); + if (!attr) + goto out1; + + mi_get_ref(mi, &le_b[nb]->ref); + le_b[nb]->id = attr->id; + + /* Copy all except id. */ + memcpy(attr, b, asize); + attr->id = le_b[nb]->id; + + /* Remove from primary record. */ + if (!mi_remove_attr(NULL, &ni->mi, b)) + goto out1; + + if (to_free <= asize) + break; + to_free -= asize; + if (!nb) + goto out1; + } + + attr = mi_insert_attr(&ni->mi, ATTR_LIST, NULL, 0, + lsize + SIZEOF_RESIDENT, SIZEOF_RESIDENT); + if (!attr) + goto out1; + + attr->non_res = 0; + attr->flags = 0; + attr->res.data_size = cpu_to_le32(lsize); + attr->res.data_off = SIZEOF_RESIDENT_LE; + attr->res.flags = 0; + attr->res.res = 0; + + memcpy(resident_data_ex(attr, lsize), ni->attr_list.le, lsize); + + ni->attr_list.dirty = false; + + mark_inode_dirty(&ni->vfs_inode); + goto out; + +out1: + kfree(ni->attr_list.le); + ni->attr_list.le = NULL; + ni->attr_list.size = 0; + return err; + +out: + return 0; +} + +/* + * ni_ins_attr_ext - Add an external attribute to the ntfs_inode. + */ +static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, + enum ATTR_TYPE type, const __le16 *name, u8 name_len, + u32 asize, CLST svcn, u16 name_off, bool force_ext, + struct ATTRIB **ins_attr, struct mft_inode **ins_mi, + struct ATTR_LIST_ENTRY **ins_le) +{ + struct ATTRIB *attr; + struct mft_inode *mi; + CLST rno; + u64 vbo; + struct rb_node *node; + int err; + bool is_mft, is_mft_data; + struct ntfs_sb_info *sbi = ni->mi.sbi; + + is_mft = ni->mi.rno == MFT_REC_MFT; + is_mft_data = is_mft && type == ATTR_DATA && !name_len; + + if (asize > sbi->max_bytes_per_attr) { + err = -EINVAL; + goto out; + } + + /* + * Standard information and attr_list cannot be made external. + * The Log File cannot have any external attributes. + */ + if (type == ATTR_STD || type == ATTR_LIST || + ni->mi.rno == MFT_REC_LOG) { + err = -EINVAL; + goto out; + } + + /* Create attribute list if it is not already existed. */ + if (!ni->attr_list.size) { + err = ni_create_attr_list(ni); + if (err) + goto out; + } + + vbo = is_mft_data ? ((u64)svcn << sbi->cluster_bits) : 0; + + if (force_ext) + goto insert_ext; + + /* Load all subrecords into memory. */ + err = ni_load_all_mi(ni); + if (err) + goto out; + + /* Check each of loaded subrecord. */ + for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { + mi = rb_entry(node, struct mft_inode, node); + + if (is_mft_data && + (mi_enum_attr(mi, NULL) || + vbo <= ((u64)mi->rno << sbi->record_bits))) { + /* We can't accept this record 'cause MFT's bootstrapping. */ + continue; + } + if (is_mft && + mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, NULL)) { + /* + * This child record already has a ATTR_DATA. + * So it can't accept any other records. + */ + continue; + } + + if ((type != ATTR_NAME || name_len) && + mi_find_attr(mi, NULL, type, name, name_len, NULL)) { + /* Only indexed attributes can share same record. */ + continue; + } + + /* + * Do not try to insert this attribute + * if there is no room in record. + */ + if (le32_to_cpu(mi->mrec->used) + asize > sbi->record_size) + continue; + + /* Try to insert attribute into this subrecord. */ + attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, + name_off, svcn, ins_le); + if (!attr) + continue; + if (IS_ERR(attr)) + return PTR_ERR(attr); + + if (ins_attr) + *ins_attr = attr; + if (ins_mi) + *ins_mi = mi; + return 0; + } + +insert_ext: + /* We have to allocate a new child subrecord. */ + err = ntfs_look_free_mft(sbi, &rno, is_mft_data, ni, &mi); + if (err) + goto out; + + if (is_mft_data && vbo <= ((u64)rno << sbi->record_bits)) { + err = -EINVAL; + goto out1; + } + + attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, + name_off, svcn, ins_le); + if (!attr) { + err = -EINVAL; + goto out2; + } + + if (IS_ERR(attr)) { + err = PTR_ERR(attr); + goto out2; + } + + if (ins_attr) + *ins_attr = attr; + if (ins_mi) + *ins_mi = mi; + + return 0; + +out2: + ni_remove_mi(ni, mi); + mi_put(mi); + +out1: + ntfs_mark_rec_free(sbi, rno, is_mft); + +out: + return err; +} + +/* + * ni_insert_attr - Insert an attribute into the file. + * + * If the primary record has room, it will just insert the attribute. + * If not, it may make the attribute external. + * For $MFT::Data it may make room for the attribute by + * making other attributes external. + * + * NOTE: + * The ATTR_LIST and ATTR_STD cannot be made external. + * This function does not fill new attribute full. + * It only fills 'size'/'type'/'id'/'name_len' fields. + */ +static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, u32 asize, + u16 name_off, CLST svcn, struct ATTRIB **ins_attr, + struct mft_inode **ins_mi, + struct ATTR_LIST_ENTRY **ins_le) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + int err; + struct ATTRIB *attr, *eattr; + struct MFT_REC *rec; + bool is_mft; + struct ATTR_LIST_ENTRY *le; + u32 list_reserve, max_free, free, used, t32; + __le16 id; + u16 t16; + + is_mft = ni->mi.rno == MFT_REC_MFT; + rec = ni->mi.mrec; + + list_reserve = SIZEOF_NONRESIDENT + 3 * (1 + 2 * sizeof(u32)); + used = le32_to_cpu(rec->used); + free = sbi->record_size - used; + + if (is_mft && type != ATTR_LIST) { + /* Reserve space for the ATTRIB list. */ + if (free < list_reserve) + free = 0; + else + free -= list_reserve; + } + + if (asize <= free) { + attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len, + asize, name_off, svcn, ins_le); + if (IS_ERR(attr)) { + err = PTR_ERR(attr); + goto out; + } + + if (attr) { + if (ins_attr) + *ins_attr = attr; + if (ins_mi) + *ins_mi = &ni->mi; + err = 0; + goto out; + } + } + + if (!is_mft || type != ATTR_DATA || svcn) { + /* This ATTRIB will be external. */ + err = ni_ins_attr_ext(ni, NULL, type, name, name_len, asize, + svcn, name_off, false, ins_attr, ins_mi, + ins_le); + goto out; + } + + /* + * Here we have: "is_mft && type == ATTR_DATA && !svcn" + * + * The first chunk of the $MFT::Data ATTRIB must be the base record. + * Evict as many other attributes as possible. + */ + max_free = free; + + /* Estimate the result of moving all possible attributes away. */ + attr = NULL; + + while ((attr = mi_enum_attr(&ni->mi, attr))) { + if (attr->type == ATTR_STD) + continue; + if (attr->type == ATTR_LIST) + continue; + max_free += le32_to_cpu(attr->size); + } + + if (max_free < asize + list_reserve) { + /* Impossible to insert this attribute into primary record. */ + err = -EINVAL; + goto out; + } + + /* Start real attribute moving. */ + attr = NULL; + + for (;;) { + attr = mi_enum_attr(&ni->mi, attr); + if (!attr) { + /* We should never be here 'cause we have already check this case. */ + err = -EINVAL; + goto out; + } + + /* Skip attributes that MUST be primary record. */ + if (attr->type == ATTR_STD || attr->type == ATTR_LIST) + continue; + + le = NULL; + if (ni->attr_list.size) { + le = al_find_le(ni, NULL, attr); + if (!le) { + /* Really this is a serious bug. */ + err = -EINVAL; + goto out; + } + } + + t32 = le32_to_cpu(attr->size); + t16 = le16_to_cpu(attr->name_off); + err = ni_ins_attr_ext(ni, le, attr->type, Add2Ptr(attr, t16), + attr->name_len, t32, attr_svcn(attr), t16, + false, &eattr, NULL, NULL); + if (err) + return err; + + id = eattr->id; + memcpy(eattr, attr, t32); + eattr->id = id; + + /* Remove from primary record. */ + mi_remove_attr(NULL, &ni->mi, attr); + + /* attr now points to next attribute. */ + if (attr->type == ATTR_END) + goto out; + } + while (asize + list_reserve > sbi->record_size - le32_to_cpu(rec->used)) + ; + + attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len, asize, + name_off, svcn, ins_le); + if (!attr) { + err = -EINVAL; + goto out; + } + + if (IS_ERR(attr)) { + err = PTR_ERR(attr); + goto out; + } + + if (ins_attr) + *ins_attr = attr; + if (ins_mi) + *ins_mi = &ni->mi; + +out: + return err; +} + +/* ni_expand_mft_list - Split ATTR_DATA of $MFT. */ +static int ni_expand_mft_list(struct ntfs_inode *ni) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + u32 asize, run_size, done = 0; + struct ATTRIB *attr; + struct rb_node *node; + CLST mft_min, mft_new, svcn, evcn, plen; + struct mft_inode *mi, *mi_min, *mi_new; + struct ntfs_sb_info *sbi = ni->mi.sbi; + + /* Find the nearest MFT. */ + mft_min = 0; + mft_new = 0; + mi_min = NULL; + + for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { + mi = rb_entry(node, struct mft_inode, node); + + attr = mi_enum_attr(mi, NULL); + + if (!attr) { + mft_min = mi->rno; + mi_min = mi; + break; + } + } + + if (ntfs_look_free_mft(sbi, &mft_new, true, ni, &mi_new)) { + mft_new = 0; + /* Really this is not critical. */ + } else if (mft_min > mft_new) { + mft_min = mft_new; + mi_min = mi_new; + } else { + ntfs_mark_rec_free(sbi, mft_new, true); + mft_new = 0; + ni_remove_mi(ni, mi_new); + } + + attr = mi_find_attr(&ni->mi, NULL, ATTR_DATA, NULL, 0, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + asize = le32_to_cpu(attr->size); + + evcn = le64_to_cpu(attr->nres.evcn); + svcn = bytes_to_cluster(sbi, (u64)(mft_min + 1) << sbi->record_bits); + if (evcn + 1 >= svcn) { + err = -EINVAL; + goto out; + } + + /* + * Split primary attribute [0 evcn] in two parts [0 svcn) + [svcn evcn]. + * + * Update first part of ATTR_DATA in 'primary MFT. + */ + err = run_pack(run, 0, svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT), + asize - SIZEOF_NONRESIDENT, &plen); + if (err < 0) + goto out; + + run_size = ALIGN(err, 8); + err = 0; + + if (plen < svcn) { + err = -EINVAL; + goto out; + } + + attr->nres.evcn = cpu_to_le64(svcn - 1); + attr->size = cpu_to_le32(run_size + SIZEOF_NONRESIDENT); + /* 'done' - How many bytes of primary MFT becomes free. */ + done = asize - run_size - SIZEOF_NONRESIDENT; + le32_sub_cpu(&ni->mi.mrec->used, done); + + /* Estimate packed size (run_buf=NULL). */ + err = run_pack(run, svcn, evcn + 1 - svcn, NULL, sbi->record_size, + &plen); + if (err < 0) + goto out; + + run_size = ALIGN(err, 8); + err = 0; + + if (plen < evcn + 1 - svcn) { + err = -EINVAL; + goto out; + } + + /* + * This function may implicitly call expand attr_list. + * Insert second part of ATTR_DATA in 'mi_min'. + */ + attr = ni_ins_new_attr(ni, mi_min, NULL, ATTR_DATA, NULL, 0, + SIZEOF_NONRESIDENT + run_size, + SIZEOF_NONRESIDENT, svcn, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + if (IS_ERR(attr)) { + err = PTR_ERR(attr); + goto out; + } + + attr->non_res = 1; + attr->name_off = SIZEOF_NONRESIDENT_LE; + attr->flags = 0; + + /* This function can't fail - cause already checked above. */ + run_pack(run, svcn, evcn + 1 - svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT), + run_size, &plen); + + attr->nres.svcn = cpu_to_le64(svcn); + attr->nres.evcn = cpu_to_le64(evcn); + attr->nres.run_off = cpu_to_le16(SIZEOF_NONRESIDENT); + +out: + if (mft_new) { + ntfs_mark_rec_free(sbi, mft_new, true); + ni_remove_mi(ni, mi_new); + } + + return !err && !done ? -EOPNOTSUPP : err; +} + +/* + * ni_expand_list - Move all possible attributes out of primary record. + */ +int ni_expand_list(struct ntfs_inode *ni) +{ + int err = 0; + u32 asize, done = 0; + struct ATTRIB *attr, *ins_attr; + struct ATTR_LIST_ENTRY *le; + bool is_mft = ni->mi.rno == MFT_REC_MFT; + struct MFT_REF ref; + + mi_get_ref(&ni->mi, &ref); + le = NULL; + + while ((le = al_enumerate(ni, le))) { + if (le->type == ATTR_STD) + continue; + + if (memcmp(&ref, &le->ref, sizeof(struct MFT_REF))) + continue; + + if (is_mft && le->type == ATTR_DATA) + continue; + + /* Find attribute in primary record. */ + attr = rec_find_attr_le(&ni->mi, le); + if (!attr) { + err = -EINVAL; + goto out; + } + + asize = le32_to_cpu(attr->size); + + /* Always insert into new record to avoid collisions (deep recursive). */ + err = ni_ins_attr_ext(ni, le, attr->type, attr_name(attr), + attr->name_len, asize, attr_svcn(attr), + le16_to_cpu(attr->name_off), true, + &ins_attr, NULL, NULL); + + if (err) + goto out; + + memcpy(ins_attr, attr, asize); + ins_attr->id = le->id; + /* Remove from primary record. */ + mi_remove_attr(NULL, &ni->mi, attr); + + done += asize; + goto out; + } + + if (!is_mft) { + err = -EFBIG; /* Attr list is too big(?) */ + goto out; + } + + /* Split MFT data as much as possible. */ + err = ni_expand_mft_list(ni); + +out: + return !err && !done ? -EOPNOTSUPP : err; +} + +/* + * ni_insert_nonresident - Insert new nonresident attribute. + */ +int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, + const struct runs_tree *run, CLST svcn, CLST len, + __le16 flags, struct ATTRIB **new_attr, + struct mft_inode **mi, struct ATTR_LIST_ENTRY **le) +{ + int err; + CLST plen; + struct ATTRIB *attr; + bool is_ext = + (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) && !svcn; + u32 name_size = ALIGN(name_len * sizeof(short), 8); + u32 name_off = is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT; + u32 run_off = name_off + name_size; + u32 run_size, asize; + struct ntfs_sb_info *sbi = ni->mi.sbi; + + /* Estimate packed size (run_buf=NULL). */ + err = run_pack(run, svcn, len, NULL, sbi->max_bytes_per_attr - run_off, + &plen); + if (err < 0) + goto out; + + run_size = ALIGN(err, 8); + + if (plen < len) { + err = -EINVAL; + goto out; + } + + asize = run_off + run_size; + + if (asize > sbi->max_bytes_per_attr) { + err = -EINVAL; + goto out; + } + + err = ni_insert_attr(ni, type, name, name_len, asize, name_off, svcn, + &attr, mi, le); + + if (err) + goto out; + + attr->non_res = 1; + attr->name_off = cpu_to_le16(name_off); + attr->flags = flags; + + /* This function can't fail - cause already checked above. */ + run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size, &plen); + + attr->nres.svcn = cpu_to_le64(svcn); + attr->nres.evcn = cpu_to_le64((u64)svcn + len - 1); + + if (new_attr) + *new_attr = attr; + + *(__le64 *)&attr->nres.run_off = cpu_to_le64(run_off); + + attr->nres.alloc_size = + svcn ? 0 : cpu_to_le64((u64)len << ni->mi.sbi->cluster_bits); + attr->nres.data_size = attr->nres.alloc_size; + attr->nres.valid_size = attr->nres.alloc_size; + + if (is_ext) { + if (flags & ATTR_FLAG_COMPRESSED) + attr->nres.c_unit = COMPRESSION_UNIT; + attr->nres.total_size = attr->nres.alloc_size; + } + +out: + return err; +} + +/* + * ni_insert_resident - Inserts new resident attribute. + */ +int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, + enum ATTR_TYPE type, const __le16 *name, u8 name_len, + struct ATTRIB **new_attr, struct mft_inode **mi, + struct ATTR_LIST_ENTRY **le) +{ + int err; + u32 name_size = ALIGN(name_len * sizeof(short), 8); + u32 asize = SIZEOF_RESIDENT + name_size + ALIGN(data_size, 8); + struct ATTRIB *attr; + + err = ni_insert_attr(ni, type, name, name_len, asize, SIZEOF_RESIDENT, + 0, &attr, mi, le); + if (err) + return err; + + attr->non_res = 0; + attr->flags = 0; + + attr->res.data_size = cpu_to_le32(data_size); + attr->res.data_off = cpu_to_le16(SIZEOF_RESIDENT + name_size); + if (type == ATTR_NAME) { + attr->res.flags = RESIDENT_FLAG_INDEXED; + + /* is_attr_indexed(attr)) == true */ + le16_add_cpu(&ni->mi.mrec->hard_links, 1); + ni->mi.dirty = true; + } + attr->res.res = 0; + + if (new_attr) + *new_attr = attr; + + return 0; +} + +/* + * ni_remove_attr_le - Remove attribute from record. + */ +void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr, + struct mft_inode *mi, struct ATTR_LIST_ENTRY *le) +{ + mi_remove_attr(ni, mi, attr); + + if (le) + al_remove_le(ni, le); +} + +/* + * ni_delete_all - Remove all attributes and frees allocates space. + * + * ntfs_evict_inode->ntfs_clear_inode->ni_delete_all (if no links). + */ +int ni_delete_all(struct ntfs_inode *ni) +{ + int err; + struct ATTR_LIST_ENTRY *le = NULL; + struct ATTRIB *attr = NULL; + struct rb_node *node; + u16 roff; + u32 asize; + CLST svcn, evcn; + struct ntfs_sb_info *sbi = ni->mi.sbi; + bool nt3 = is_ntfs3(sbi); + struct MFT_REF ref; + + while ((attr = ni_enum_attr_ex(ni, attr, &le, NULL))) { + if (!nt3 || attr->name_len) { + ; + } else if (attr->type == ATTR_REPARSE) { + mi_get_ref(&ni->mi, &ref); + ntfs_remove_reparse(sbi, 0, &ref); + } else if (attr->type == ATTR_ID && !attr->non_res && + le32_to_cpu(attr->res.data_size) >= + sizeof(struct GUID)) { + ntfs_objid_remove(sbi, resident_data(attr)); + } + + if (!attr->non_res) + continue; + + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + + if (evcn + 1 <= svcn) + continue; + + asize = le32_to_cpu(attr->size); + roff = le16_to_cpu(attr->nres.run_off); + + if (roff > asize) + return -EINVAL; + + /* run==1 means unpack and deallocate. */ + run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn, + Add2Ptr(attr, roff), asize - roff); + } + + if (ni->attr_list.size) { + run_deallocate(ni->mi.sbi, &ni->attr_list.run, true); + al_destroy(ni); + } + + /* Free all subrecords. */ + for (node = rb_first(&ni->mi_tree); node;) { + struct rb_node *next = rb_next(node); + struct mft_inode *mi = rb_entry(node, struct mft_inode, node); + + clear_rec_inuse(mi->mrec); + mi->dirty = true; + mi_write(mi, 0); + + ntfs_mark_rec_free(sbi, mi->rno, false); + ni_remove_mi(ni, mi); + mi_put(mi); + node = next; + } + + /* Free base record. */ + clear_rec_inuse(ni->mi.mrec); + ni->mi.dirty = true; + err = mi_write(&ni->mi, 0); + + ntfs_mark_rec_free(sbi, ni->mi.rno, false); + + return err; +} + +/* ni_fname_name + * + * Return: File name attribute by its value. + */ +struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, + const struct cpu_str *uni, + const struct MFT_REF *home_dir, + struct mft_inode **mi, + struct ATTR_LIST_ENTRY **le) +{ + struct ATTRIB *attr = NULL; + struct ATTR_FILE_NAME *fname; + + if (le) + *le = NULL; + + /* Enumerate all names. */ +next: + attr = ni_find_attr(ni, attr, le, ATTR_NAME, NULL, 0, NULL, mi); + if (!attr) + return NULL; + + fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); + if (!fname) + goto next; + + if (home_dir && memcmp(home_dir, &fname->home, sizeof(*home_dir))) + goto next; + + if (!uni) + return fname; + + if (uni->len != fname->name_len) + goto next; + + if (ntfs_cmp_names_cpu(uni, (struct le_str *)&fname->name_len, NULL, + false)) + goto next; + + return fname; +} + +/* + * ni_fname_type + * + * Return: File name attribute with given type. + */ +struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, + struct mft_inode **mi, + struct ATTR_LIST_ENTRY **le) +{ + struct ATTRIB *attr = NULL; + struct ATTR_FILE_NAME *fname; + + *le = NULL; + + if (name_type == FILE_NAME_POSIX) + return NULL; + + /* Enumerate all names. */ + for (;;) { + attr = ni_find_attr(ni, attr, le, ATTR_NAME, NULL, 0, NULL, mi); + if (!attr) + return NULL; + + fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); + if (fname && name_type == fname->type) + return fname; + } +} + +/* + * ni_new_attr_flags + * + * Process compressed/sparsed in special way. + * NOTE: You need to set ni->std_fa = new_fa + * after this function to keep internal structures in consistency. + */ +int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa) +{ + struct ATTRIB *attr; + struct mft_inode *mi; + __le16 new_aflags; + u32 new_asize; + + attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); + if (!attr) + return -EINVAL; + + new_aflags = attr->flags; + + if (new_fa & FILE_ATTRIBUTE_SPARSE_FILE) + new_aflags |= ATTR_FLAG_SPARSED; + else + new_aflags &= ~ATTR_FLAG_SPARSED; + + if (new_fa & FILE_ATTRIBUTE_COMPRESSED) + new_aflags |= ATTR_FLAG_COMPRESSED; + else + new_aflags &= ~ATTR_FLAG_COMPRESSED; + + if (new_aflags == attr->flags) + return 0; + + if ((new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) == + (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) { + ntfs_inode_warn(&ni->vfs_inode, + "file can't be sparsed and compressed"); + return -EOPNOTSUPP; + } + + if (!attr->non_res) + goto out; + + if (attr->nres.data_size) { + ntfs_inode_warn( + &ni->vfs_inode, + "one can change sparsed/compressed only for empty files"); + return -EOPNOTSUPP; + } + + /* Resize nonresident empty attribute in-place only. */ + new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) + ? (SIZEOF_NONRESIDENT_EX + 8) + : (SIZEOF_NONRESIDENT + 8); + + if (!mi_resize_attr(mi, attr, new_asize - le32_to_cpu(attr->size))) + return -EOPNOTSUPP; + + if (new_aflags & ATTR_FLAG_SPARSED) { + attr->name_off = SIZEOF_NONRESIDENT_EX_LE; + /* Windows uses 16 clusters per frame but supports one cluster per frame too. */ + attr->nres.c_unit = 0; + ni->vfs_inode.i_mapping->a_ops = &ntfs_aops; + } else if (new_aflags & ATTR_FLAG_COMPRESSED) { + attr->name_off = SIZEOF_NONRESIDENT_EX_LE; + /* The only allowed: 16 clusters per frame. */ + attr->nres.c_unit = NTFS_LZNT_CUNIT; + ni->vfs_inode.i_mapping->a_ops = &ntfs_aops_cmpr; + } else { + attr->name_off = SIZEOF_NONRESIDENT_LE; + /* Normal files. */ + attr->nres.c_unit = 0; + ni->vfs_inode.i_mapping->a_ops = &ntfs_aops; + } + attr->nres.run_off = attr->name_off; +out: + attr->flags = new_aflags; + mi->dirty = true; + + return 0; +} + +/* + * ni_parse_reparse + * + * buffer - memory for reparse buffer header + */ +enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, + struct REPARSE_DATA_BUFFER *buffer) +{ + const struct REPARSE_DATA_BUFFER *rp = NULL; + u8 bits; + u16 len; + typeof(rp->CompressReparseBuffer) *cmpr; + + /* Try to estimate reparse point. */ + if (!attr->non_res) { + rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); + } else if (le64_to_cpu(attr->nres.data_size) >= + sizeof(struct REPARSE_DATA_BUFFER)) { + struct runs_tree run; + + run_init(&run); + + if (!attr_load_runs_vcn(ni, ATTR_REPARSE, NULL, 0, &run, 0) && + !ntfs_read_run_nb(ni->mi.sbi, &run, 0, buffer, + sizeof(struct REPARSE_DATA_BUFFER), + NULL)) { + rp = buffer; + } + + run_close(&run); + } + + if (!rp) + return REPARSE_NONE; + + len = le16_to_cpu(rp->ReparseDataLength); + switch (rp->ReparseTag) { + case (IO_REPARSE_TAG_MICROSOFT | IO_REPARSE_TAG_SYMBOLIC_LINK): + break; /* Symbolic link. */ + case IO_REPARSE_TAG_MOUNT_POINT: + break; /* Mount points and junctions. */ + case IO_REPARSE_TAG_SYMLINK: + break; + case IO_REPARSE_TAG_COMPRESS: + /* + * WOF - Windows Overlay Filter - Used to compress files with + * LZX/Xpress. + * + * Unlike native NTFS file compression, the Windows + * Overlay Filter supports only read operations. This means + * that it doesn't need to sector-align each compressed chunk, + * so the compressed data can be packed more tightly together. + * If you open the file for writing, the WOF just decompresses + * the entire file, turning it back into a plain file. + * + * Ntfs3 driver decompresses the entire file only on write or + * change size requests. + */ + + cmpr = &rp->CompressReparseBuffer; + if (len < sizeof(*cmpr) || + cmpr->WofVersion != WOF_CURRENT_VERSION || + cmpr->WofProvider != WOF_PROVIDER_SYSTEM || + cmpr->ProviderVer != WOF_PROVIDER_CURRENT_VERSION) { + return REPARSE_NONE; + } + + switch (cmpr->CompressionFormat) { + case WOF_COMPRESSION_XPRESS4K: + bits = 0xc; // 4k + break; + case WOF_COMPRESSION_XPRESS8K: + bits = 0xd; // 8k + break; + case WOF_COMPRESSION_XPRESS16K: + bits = 0xe; // 16k + break; + case WOF_COMPRESSION_LZX32K: + bits = 0xf; // 32k + break; + default: + bits = 0x10; // 64k + break; + } + ni_set_ext_compress_bits(ni, bits); + return REPARSE_COMPRESSED; + + case IO_REPARSE_TAG_DEDUP: + ni->ni_flags |= NI_FLAG_DEDUPLICATED; + return REPARSE_DEDUPLICATED; + + default: + if (rp->ReparseTag & IO_REPARSE_TAG_NAME_SURROGATE) + break; + + return REPARSE_NONE; + } + + if (buffer != rp) + memcpy(buffer, rp, sizeof(struct REPARSE_DATA_BUFFER)); + + /* Looks like normal symlink. */ + return REPARSE_LINK; +} + +/* + * ni_fiemap - Helper for file_fiemap(). + * + * Assumed ni_lock. + * TODO: Less aggressive locks. + */ +int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, + __u64 vbo, __u64 len) +{ + int err = 0; + struct ntfs_sb_info *sbi = ni->mi.sbi; + u8 cluster_bits = sbi->cluster_bits; + struct runs_tree *run; + struct rw_semaphore *run_lock; + struct ATTRIB *attr; + CLST vcn = vbo >> cluster_bits; + CLST lcn, clen; + u64 valid = ni->i_valid; + u64 lbo, bytes; + u64 end, alloc_size; + size_t idx = -1; + u32 flags; + bool ok; + + if (S_ISDIR(ni->vfs_inode.i_mode)) { + run = &ni->dir.alloc_run; + attr = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, I30_NAME, + ARRAY_SIZE(I30_NAME), NULL, NULL); + run_lock = &ni->dir.run_lock; + } else { + run = &ni->file.run; + attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, + NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + if (is_attr_compressed(attr)) { + /* Unfortunately cp -r incorrectly treats compressed clusters. */ + err = -EOPNOTSUPP; + ntfs_inode_warn( + &ni->vfs_inode, + "fiemap is not supported for compressed file (cp -r)"); + goto out; + } + run_lock = &ni->file.run_lock; + } + + if (!attr || !attr->non_res) { + err = fiemap_fill_next_extent( + fieinfo, 0, 0, + attr ? le32_to_cpu(attr->res.data_size) : 0, + FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST | + FIEMAP_EXTENT_MERGED); + goto out; + } + + end = vbo + len; + alloc_size = le64_to_cpu(attr->nres.alloc_size); + if (end > alloc_size) + end = alloc_size; + + down_read(run_lock); + + while (vbo < end) { + if (idx == -1) { + ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); + } else { + CLST vcn_next = vcn; + + ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) && + vcn == vcn_next; + if (!ok) + vcn = vcn_next; + } + + if (!ok) { + up_read(run_lock); + down_write(run_lock); + + err = attr_load_runs_vcn(ni, attr->type, + attr_name(attr), + attr->name_len, run, vcn); + + up_write(run_lock); + down_read(run_lock); + + if (err) + break; + + ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); + + if (!ok) { + err = -EINVAL; + break; + } + } + + if (!clen) { + err = -EINVAL; // ? + break; + } + + if (lcn == SPARSE_LCN) { + vcn += clen; + vbo = (u64)vcn << cluster_bits; + continue; + } + + flags = FIEMAP_EXTENT_MERGED; + if (S_ISDIR(ni->vfs_inode.i_mode)) { + ; + } else if (is_attr_compressed(attr)) { + CLST clst_data; + + err = attr_is_frame_compressed( + ni, attr, vcn >> attr->nres.c_unit, &clst_data); + if (err) + break; + if (clst_data < NTFS_LZNT_CLUSTERS) + flags |= FIEMAP_EXTENT_ENCODED; + } else if (is_attr_encrypted(attr)) { + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; + } + + vbo = (u64)vcn << cluster_bits; + bytes = (u64)clen << cluster_bits; + lbo = (u64)lcn << cluster_bits; + + vcn += clen; + + if (vbo + bytes >= end) + bytes = end - vbo; + + if (vbo + bytes <= valid) { + ; + } else if (vbo >= valid) { + flags |= FIEMAP_EXTENT_UNWRITTEN; + } else { + /* vbo < valid && valid < vbo + bytes */ + u64 dlen = valid - vbo; + + if (vbo + dlen >= end) + flags |= FIEMAP_EXTENT_LAST; + + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen, + flags); + if (err < 0) + break; + if (err == 1) { + err = 0; + break; + } + + vbo = valid; + bytes -= dlen; + if (!bytes) + continue; + + lbo += dlen; + flags |= FIEMAP_EXTENT_UNWRITTEN; + } + + if (vbo + bytes >= end) + flags |= FIEMAP_EXTENT_LAST; + + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags); + if (err < 0) + break; + if (err == 1) { + err = 0; + break; + } + + vbo += bytes; + } + + up_read(run_lock); + +out: + return err; +} + +/* + * ni_readpage_cmpr + * + * When decompressing, we typically obtain more than one page per reference. + * We inject the additional pages into the page cache. + */ +int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct address_space *mapping = page->mapping; + pgoff_t index = page->index; + u64 frame_vbo, vbo = (u64)index << PAGE_SHIFT; + struct page **pages = NULL; /* Array of at most 16 pages. stack? */ + u8 frame_bits; + CLST frame; + u32 i, idx, frame_size, pages_per_frame; + gfp_t gfp_mask; + struct page *pg; + + if (vbo >= ni->vfs_inode.i_size) { + SetPageUptodate(page); + err = 0; + goto out; + } + + if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) { + /* Xpress or LZX. */ + frame_bits = ni_ext_compress_bits(ni); + } else { + /* LZNT compression. */ + frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; + } + frame_size = 1u << frame_bits; + frame = vbo >> frame_bits; + frame_vbo = (u64)frame << frame_bits; + idx = (vbo - frame_vbo) >> PAGE_SHIFT; + + pages_per_frame = frame_size >> PAGE_SHIFT; + pages = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS); + if (!pages) { + err = -ENOMEM; + goto out; + } + + pages[idx] = page; + index = frame_vbo >> PAGE_SHIFT; + gfp_mask = mapping_gfp_mask(mapping); + + for (i = 0; i < pages_per_frame; i++, index++) { + if (i == idx) + continue; + + pg = find_or_create_page(mapping, index, gfp_mask); + if (!pg) { + err = -ENOMEM; + goto out1; + } + pages[i] = pg; + } + + err = ni_read_frame(ni, frame_vbo, pages, pages_per_frame); + +out1: + if (err) + SetPageError(page); + + for (i = 0; i < pages_per_frame; i++) { + pg = pages[i]; + if (i == idx || !pg) + continue; + unlock_page(pg); + put_page(pg); + } + +out: + /* At this point, err contains 0 or -EIO depending on the "critical" page. */ + kfree(pages); + unlock_page(page); + + return err; +} + +#ifdef CONFIG_NTFS3_LZX_XPRESS +/* + * ni_decompress_file - Decompress LZX/Xpress compressed file. + * + * Remove ATTR_DATA::WofCompressedData. + * Remove ATTR_REPARSE. + */ +int ni_decompress_file(struct ntfs_inode *ni) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct inode *inode = &ni->vfs_inode; + loff_t i_size = inode->i_size; + struct address_space *mapping = inode->i_mapping; + gfp_t gfp_mask = mapping_gfp_mask(mapping); + struct page **pages = NULL; + struct ATTR_LIST_ENTRY *le; + struct ATTRIB *attr; + CLST vcn, cend, lcn, clen, end; + pgoff_t index; + u64 vbo; + u8 frame_bits; + u32 i, frame_size, pages_per_frame, bytes; + struct mft_inode *mi; + int err; + + /* Clusters for decompressed data. */ + cend = bytes_to_cluster(sbi, i_size); + + if (!i_size) + goto remove_wof; + + /* Check in advance. */ + if (cend > wnd_zeroes(&sbi->used.bitmap)) { + err = -ENOSPC; + goto out; + } + + frame_bits = ni_ext_compress_bits(ni); + frame_size = 1u << frame_bits; + pages_per_frame = frame_size >> PAGE_SHIFT; + pages = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS); + if (!pages) { + err = -ENOMEM; + goto out; + } + + /* + * Step 1: Decompress data and copy to new allocated clusters. + */ + index = 0; + for (vbo = 0; vbo < i_size; vbo += bytes) { + u32 nr_pages; + bool new; + + if (vbo + frame_size > i_size) { + bytes = i_size - vbo; + nr_pages = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + nr_pages = pages_per_frame; + bytes = frame_size; + } + + end = bytes_to_cluster(sbi, vbo + bytes); + + for (vcn = vbo >> sbi->cluster_bits; vcn < end; vcn += clen) { + err = attr_data_get_block(ni, vcn, cend - vcn, &lcn, + &clen, &new); + if (err) + goto out; + } + + for (i = 0; i < pages_per_frame; i++, index++) { + struct page *pg; + + pg = find_or_create_page(mapping, index, gfp_mask); + if (!pg) { + while (i--) { + unlock_page(pages[i]); + put_page(pages[i]); + } + err = -ENOMEM; + goto out; + } + pages[i] = pg; + } + + err = ni_read_frame(ni, vbo, pages, pages_per_frame); + + if (!err) { + down_read(&ni->file.run_lock); + err = ntfs_bio_pages(sbi, &ni->file.run, pages, + nr_pages, vbo, bytes, + REQ_OP_WRITE); + up_read(&ni->file.run_lock); + } + + for (i = 0; i < pages_per_frame; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + + if (err) + goto out; + + cond_resched(); + } + +remove_wof: + /* + * Step 2: Deallocate attributes ATTR_DATA::WofCompressedData + * and ATTR_REPARSE. + */ + attr = NULL; + le = NULL; + while ((attr = ni_enum_attr_ex(ni, attr, &le, NULL))) { + CLST svcn, evcn; + u32 asize, roff; + + if (attr->type == ATTR_REPARSE) { + struct MFT_REF ref; + + mi_get_ref(&ni->mi, &ref); + ntfs_remove_reparse(sbi, 0, &ref); + } + + if (!attr->non_res) + continue; + + if (attr->type != ATTR_REPARSE && + (attr->type != ATTR_DATA || + attr->name_len != ARRAY_SIZE(WOF_NAME) || + memcmp(attr_name(attr), WOF_NAME, sizeof(WOF_NAME)))) + continue; + + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + + if (evcn + 1 <= svcn) + continue; + + asize = le32_to_cpu(attr->size); + roff = le16_to_cpu(attr->nres.run_off); + + if (roff > asize) { + err = -EINVAL; + goto out; + } + + /*run==1 Means unpack and deallocate. */ + run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn, + Add2Ptr(attr, roff), asize - roff); + } + + /* + * Step 3: Remove attribute ATTR_DATA::WofCompressedData. + */ + err = ni_remove_attr(ni, ATTR_DATA, WOF_NAME, ARRAY_SIZE(WOF_NAME), + false, NULL); + if (err) + goto out; + + /* + * Step 4: Remove ATTR_REPARSE. + */ + err = ni_remove_attr(ni, ATTR_REPARSE, NULL, 0, false, NULL); + if (err) + goto out; + + /* + * Step 5: Remove sparse flag from data attribute. + */ + attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + if (attr->non_res && is_attr_sparsed(attr)) { + /* Sparsed attribute header is 8 bytes bigger than normal. */ + struct MFT_REC *rec = mi->mrec; + u32 used = le32_to_cpu(rec->used); + u32 asize = le32_to_cpu(attr->size); + u16 roff = le16_to_cpu(attr->nres.run_off); + char *rbuf = Add2Ptr(attr, roff); + + memmove(rbuf - 8, rbuf, used - PtrOffset(rec, rbuf)); + attr->size = cpu_to_le32(asize - 8); + attr->flags &= ~ATTR_FLAG_SPARSED; + attr->nres.run_off = cpu_to_le16(roff - 8); + attr->nres.c_unit = 0; + rec->used = cpu_to_le32(used - 8); + mi->dirty = true; + ni->std_fa &= ~(FILE_ATTRIBUTE_SPARSE_FILE | + FILE_ATTRIBUTE_REPARSE_POINT); + + mark_inode_dirty(inode); + } + + /* Clear cached flag. */ + ni->ni_flags &= ~NI_FLAG_COMPRESSED_MASK; + if (ni->file.offs_page) { + put_page(ni->file.offs_page); + ni->file.offs_page = NULL; + } + mapping->a_ops = &ntfs_aops; + +out: + kfree(pages); + if (err) + _ntfs_bad_inode(inode); + + return err; +} + +/* + * decompress_lzx_xpress - External compression LZX/Xpress. + */ +static int decompress_lzx_xpress(struct ntfs_sb_info *sbi, const char *cmpr, + size_t cmpr_size, void *unc, size_t unc_size, + u32 frame_size) +{ + int err; + void *ctx; + + if (cmpr_size == unc_size) { + /* Frame not compressed. */ + memcpy(unc, cmpr, unc_size); + return 0; + } + + err = 0; + if (frame_size == 0x8000) { + mutex_lock(&sbi->compress.mtx_lzx); + /* LZX: Frame compressed. */ + ctx = sbi->compress.lzx; + if (!ctx) { + /* Lazy initialize LZX decompress context. */ + ctx = lzx_allocate_decompressor(); + if (!ctx) { + err = -ENOMEM; + goto out1; + } + + sbi->compress.lzx = ctx; + } + + if (lzx_decompress(ctx, cmpr, cmpr_size, unc, unc_size)) { + /* Treat all errors as "invalid argument". */ + err = -EINVAL; + } +out1: + mutex_unlock(&sbi->compress.mtx_lzx); + } else { + /* XPRESS: Frame compressed. */ + mutex_lock(&sbi->compress.mtx_xpress); + ctx = sbi->compress.xpress; + if (!ctx) { + /* Lazy initialize Xpress decompress context. */ + ctx = xpress_allocate_decompressor(); + if (!ctx) { + err = -ENOMEM; + goto out2; + } + + sbi->compress.xpress = ctx; + } + + if (xpress_decompress(ctx, cmpr, cmpr_size, unc, unc_size)) { + /* Treat all errors as "invalid argument". */ + err = -EINVAL; + } +out2: + mutex_unlock(&sbi->compress.mtx_xpress); + } + return err; +} +#endif + +/* + * ni_read_frame + * + * Pages - Array of locked pages. + */ +int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, + u32 pages_per_frame) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + u8 cluster_bits = sbi->cluster_bits; + char *frame_ondisk = NULL; + char *frame_mem = NULL; + struct page **pages_disk = NULL; + struct ATTR_LIST_ENTRY *le = NULL; + struct runs_tree *run = &ni->file.run; + u64 valid_size = ni->i_valid; + u64 vbo_disk; + size_t unc_size; + u32 frame_size, i, npages_disk, ondisk_size; + struct page *pg; + struct ATTRIB *attr; + CLST frame, clst_data; + + /* + * To simplify decompress algorithm do vmap for source + * and target pages. + */ + for (i = 0; i < pages_per_frame; i++) + kmap(pages[i]); + + frame_size = pages_per_frame << PAGE_SHIFT; + frame_mem = vmap(pages, pages_per_frame, VM_MAP, PAGE_KERNEL); + if (!frame_mem) { + err = -ENOMEM; + goto out; + } + + attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, NULL); + if (!attr) { + err = -ENOENT; + goto out1; + } + + if (!attr->non_res) { + u32 data_size = le32_to_cpu(attr->res.data_size); + + memset(frame_mem, 0, frame_size); + if (frame_vbo < data_size) { + ondisk_size = data_size - frame_vbo; + memcpy(frame_mem, resident_data(attr) + frame_vbo, + min(ondisk_size, frame_size)); + } + err = 0; + goto out1; + } + + if (frame_vbo >= valid_size) { + memset(frame_mem, 0, frame_size); + err = 0; + goto out1; + } + + if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) { +#ifndef CONFIG_NTFS3_LZX_XPRESS + err = -EOPNOTSUPP; + goto out1; +#else + u32 frame_bits = ni_ext_compress_bits(ni); + u64 frame64 = frame_vbo >> frame_bits; + u64 frames, vbo_data; + + if (frame_size != (1u << frame_bits)) { + err = -EINVAL; + goto out1; + } + switch (frame_size) { + case 0x1000: + case 0x2000: + case 0x4000: + case 0x8000: + break; + default: + /* Unknown compression. */ + err = -EOPNOTSUPP; + goto out1; + } + + attr = ni_find_attr(ni, attr, &le, ATTR_DATA, WOF_NAME, + ARRAY_SIZE(WOF_NAME), NULL, NULL); + if (!attr) { + ntfs_inode_err( + &ni->vfs_inode, + "external compressed file should contains data attribute \"WofCompressedData\""); + err = -EINVAL; + goto out1; + } + + if (!attr->non_res) { + run = NULL; + } else { + run = run_alloc(); + if (!run) { + err = -ENOMEM; + goto out1; + } + } + + frames = (ni->vfs_inode.i_size - 1) >> frame_bits; + + err = attr_wof_frame_info(ni, attr, run, frame64, frames, + frame_bits, &ondisk_size, &vbo_data); + if (err) + goto out2; + + if (frame64 == frames) { + unc_size = 1 + ((ni->vfs_inode.i_size - 1) & + (frame_size - 1)); + ondisk_size = attr_size(attr) - vbo_data; + } else { + unc_size = frame_size; + } + + if (ondisk_size > frame_size) { + err = -EINVAL; + goto out2; + } + + if (!attr->non_res) { + if (vbo_data + ondisk_size > + le32_to_cpu(attr->res.data_size)) { + err = -EINVAL; + goto out1; + } + + err = decompress_lzx_xpress( + sbi, Add2Ptr(resident_data(attr), vbo_data), + ondisk_size, frame_mem, unc_size, frame_size); + goto out1; + } + vbo_disk = vbo_data; + /* Load all runs to read [vbo_disk-vbo_to). */ + err = attr_load_runs_range(ni, ATTR_DATA, WOF_NAME, + ARRAY_SIZE(WOF_NAME), run, vbo_disk, + vbo_data + ondisk_size); + if (err) + goto out2; + npages_disk = (ondisk_size + (vbo_disk & (PAGE_SIZE - 1)) + + PAGE_SIZE - 1) >> + PAGE_SHIFT; +#endif + } else if (is_attr_compressed(attr)) { + /* LZNT compression. */ + if (sbi->cluster_size > NTFS_LZNT_MAX_CLUSTER) { + err = -EOPNOTSUPP; + goto out1; + } + + if (attr->nres.c_unit != NTFS_LZNT_CUNIT) { + err = -EOPNOTSUPP; + goto out1; + } + + down_write(&ni->file.run_lock); + run_truncate_around(run, le64_to_cpu(attr->nres.svcn)); + frame = frame_vbo >> (cluster_bits + NTFS_LZNT_CUNIT); + err = attr_is_frame_compressed(ni, attr, frame, &clst_data); + up_write(&ni->file.run_lock); + if (err) + goto out1; + + if (!clst_data) { + memset(frame_mem, 0, frame_size); + goto out1; + } + + frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT; + ondisk_size = clst_data << cluster_bits; + + if (clst_data >= NTFS_LZNT_CLUSTERS) { + /* Frame is not compressed. */ + down_read(&ni->file.run_lock); + err = ntfs_bio_pages(sbi, run, pages, pages_per_frame, + frame_vbo, ondisk_size, + REQ_OP_READ); + up_read(&ni->file.run_lock); + goto out1; + } + vbo_disk = frame_vbo; + npages_disk = (ondisk_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + __builtin_unreachable(); + err = -EINVAL; + goto out1; + } + + pages_disk = kzalloc(npages_disk * sizeof(struct page *), GFP_NOFS); + if (!pages_disk) { + err = -ENOMEM; + goto out2; + } + + for (i = 0; i < npages_disk; i++) { + pg = alloc_page(GFP_KERNEL); + if (!pg) { + err = -ENOMEM; + goto out3; + } + pages_disk[i] = pg; + lock_page(pg); + kmap(pg); + } + + /* Read 'ondisk_size' bytes from disk. */ + down_read(&ni->file.run_lock); + err = ntfs_bio_pages(sbi, run, pages_disk, npages_disk, vbo_disk, + ondisk_size, REQ_OP_READ); + up_read(&ni->file.run_lock); + if (err) + goto out3; + + /* + * To simplify decompress algorithm do vmap for source and target pages. + */ + frame_ondisk = vmap(pages_disk, npages_disk, VM_MAP, PAGE_KERNEL_RO); + if (!frame_ondisk) { + err = -ENOMEM; + goto out3; + } + + /* Decompress: Frame_ondisk -> frame_mem. */ +#ifdef CONFIG_NTFS3_LZX_XPRESS + if (run != &ni->file.run) { + /* LZX or XPRESS */ + err = decompress_lzx_xpress( + sbi, frame_ondisk + (vbo_disk & (PAGE_SIZE - 1)), + ondisk_size, frame_mem, unc_size, frame_size); + } else +#endif + { + /* LZNT - Native NTFS compression. */ + unc_size = decompress_lznt(frame_ondisk, ondisk_size, frame_mem, + frame_size); + if ((ssize_t)unc_size < 0) + err = unc_size; + else if (!unc_size || unc_size > frame_size) + err = -EINVAL; + } + if (!err && valid_size < frame_vbo + frame_size) { + size_t ok = valid_size - frame_vbo; + + memset(frame_mem + ok, 0, frame_size - ok); + } + + vunmap(frame_ondisk); + +out3: + for (i = 0; i < npages_disk; i++) { + pg = pages_disk[i]; + if (pg) { + kunmap(pg); + unlock_page(pg); + put_page(pg); + } + } + kfree(pages_disk); + +out2: +#ifdef CONFIG_NTFS3_LZX_XPRESS + if (run != &ni->file.run) + run_free(run); +#endif +out1: + vunmap(frame_mem); +out: + for (i = 0; i < pages_per_frame; i++) { + pg = pages[i]; + kunmap(pg); + ClearPageError(pg); + SetPageUptodate(pg); + } + + return err; +} + +/* + * ni_write_frame + * + * Pages - Array of locked pages. + */ +int ni_write_frame(struct ntfs_inode *ni, struct page **pages, + u32 pages_per_frame) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; + u32 frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT; + u64 frame_vbo = (u64)pages[0]->index << PAGE_SHIFT; + CLST frame = frame_vbo >> frame_bits; + char *frame_ondisk = NULL; + struct page **pages_disk = NULL; + struct ATTR_LIST_ENTRY *le = NULL; + char *frame_mem; + struct ATTRIB *attr; + struct mft_inode *mi; + u32 i; + struct page *pg; + size_t compr_size, ondisk_size; + struct lznt *lznt; + + attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi); + if (!attr) { + err = -ENOENT; + goto out; + } + + if (WARN_ON(!is_attr_compressed(attr))) { + err = -EINVAL; + goto out; + } + + if (sbi->cluster_size > NTFS_LZNT_MAX_CLUSTER) { + err = -EOPNOTSUPP; + goto out; + } + + if (!attr->non_res) { + down_write(&ni->file.run_lock); + err = attr_make_nonresident(ni, attr, le, mi, + le32_to_cpu(attr->res.data_size), + &ni->file.run, &attr, pages[0]); + up_write(&ni->file.run_lock); + if (err) + goto out; + } + + if (attr->nres.c_unit != NTFS_LZNT_CUNIT) { + err = -EOPNOTSUPP; + goto out; + } + + pages_disk = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS); + if (!pages_disk) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < pages_per_frame; i++) { + pg = alloc_page(GFP_KERNEL); + if (!pg) { + err = -ENOMEM; + goto out1; + } + pages_disk[i] = pg; + lock_page(pg); + kmap(pg); + } + + /* To simplify compress algorithm do vmap for source and target pages. */ + frame_ondisk = vmap(pages_disk, pages_per_frame, VM_MAP, PAGE_KERNEL); + if (!frame_ondisk) { + err = -ENOMEM; + goto out1; + } + + for (i = 0; i < pages_per_frame; i++) + kmap(pages[i]); + + /* Map in-memory frame for read-only. */ + frame_mem = vmap(pages, pages_per_frame, VM_MAP, PAGE_KERNEL_RO); + if (!frame_mem) { + err = -ENOMEM; + goto out2; + } + + mutex_lock(&sbi->compress.mtx_lznt); + lznt = NULL; + if (!sbi->compress.lznt) { + /* + * LZNT implements two levels of compression: + * 0 - Standard compression + * 1 - Best compression, requires a lot of cpu + * use mount option? + */ + lznt = get_lznt_ctx(0); + if (!lznt) { + mutex_unlock(&sbi->compress.mtx_lznt); + err = -ENOMEM; + goto out3; + } + + sbi->compress.lznt = lznt; + lznt = NULL; + } + + /* Compress: frame_mem -> frame_ondisk */ + compr_size = compress_lznt(frame_mem, frame_size, frame_ondisk, + frame_size, sbi->compress.lznt); + mutex_unlock(&sbi->compress.mtx_lznt); + kfree(lznt); + + if (compr_size + sbi->cluster_size > frame_size) { + /* Frame is not compressed. */ + compr_size = frame_size; + ondisk_size = frame_size; + } else if (compr_size) { + /* Frame is compressed. */ + ondisk_size = ntfs_up_cluster(sbi, compr_size); + memset(frame_ondisk + compr_size, 0, ondisk_size - compr_size); + } else { + /* Frame is sparsed. */ + ondisk_size = 0; + } + + down_write(&ni->file.run_lock); + run_truncate_around(&ni->file.run, le64_to_cpu(attr->nres.svcn)); + err = attr_allocate_frame(ni, frame, compr_size, ni->i_valid); + up_write(&ni->file.run_lock); + if (err) + goto out2; + + if (!ondisk_size) + goto out2; + + down_read(&ni->file.run_lock); + err = ntfs_bio_pages(sbi, &ni->file.run, + ondisk_size < frame_size ? pages_disk : pages, + pages_per_frame, frame_vbo, ondisk_size, + REQ_OP_WRITE); + up_read(&ni->file.run_lock); + +out3: + vunmap(frame_mem); + +out2: + for (i = 0; i < pages_per_frame; i++) + kunmap(pages[i]); + + vunmap(frame_ondisk); +out1: + for (i = 0; i < pages_per_frame; i++) { + pg = pages_disk[i]; + if (pg) { + kunmap(pg); + unlock_page(pg); + put_page(pg); + } + } + kfree(pages_disk); +out: + return err; +} + +/* + * ni_remove_name - Removes name 'de' from MFT and from directory. + * 'de2' and 'undo_step' are used to restore MFT/dir, if error occurs. + */ +int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1); + struct ATTR_FILE_NAME *fname; + struct ATTR_LIST_ENTRY *le; + struct mft_inode *mi; + u16 de_key_size = le16_to_cpu(de->key_size); + u8 name_type; + + *undo_step = 0; + + /* Find name in record. */ + mi_get_ref(&dir_ni->mi, &de_name->home); + + fname = ni_fname_name(ni, (struct cpu_str *)&de_name->name_len, + &de_name->home, &mi, &le); + if (!fname) + return -ENOENT; + + memcpy(&de_name->dup, &fname->dup, sizeof(struct NTFS_DUP_INFO)); + name_type = paired_name(fname->type); + + /* Mark ntfs as dirty. It will be cleared at umount. */ + ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); + + /* Step 1: Remove name from directory. */ + err = indx_delete_entry(&dir_ni->dir, dir_ni, fname, de_key_size, sbi); + if (err) + return err; + + /* Step 2: Remove name from MFT. */ + ni_remove_attr_le(ni, attr_from_name(fname), mi, le); + + *undo_step = 2; + + /* Get paired name. */ + fname = ni_fname_type(ni, name_type, &mi, &le); + if (fname) { + u16 de2_key_size = fname_full_size(fname); + + *de2 = Add2Ptr(de, 1024); + (*de2)->key_size = cpu_to_le16(de2_key_size); + + memcpy(*de2 + 1, fname, de2_key_size); + + /* Step 3: Remove paired name from directory. */ + err = indx_delete_entry(&dir_ni->dir, dir_ni, fname, + de2_key_size, sbi); + if (err) + return err; + + /* Step 4: Remove paired name from MFT. */ + ni_remove_attr_le(ni, attr_from_name(fname), mi, le); + + *undo_step = 4; + } + return 0; +} + +/* + * ni_remove_name_undo - Paired function for ni_remove_name. + * + * Return: True if ok + */ +bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de, struct NTFS_DE *de2, int undo_step) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr; + u16 de_key_size = de2 ? le16_to_cpu(de2->key_size) : 0; + + switch (undo_step) { + case 4: + if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, + &attr, NULL, NULL)) { + return false; + } + memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de2 + 1, de_key_size); + + mi_get_ref(&ni->mi, &de2->ref); + de2->size = cpu_to_le16(ALIGN(de_key_size, 8) + + sizeof(struct NTFS_DE)); + de2->flags = 0; + de2->res = 0; + + if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL, + 1)) { + return false; + } + fallthrough; + + case 2: + de_key_size = le16_to_cpu(de->key_size); + + if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, + &attr, NULL, NULL)) { + return false; + } + + memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size); + mi_get_ref(&ni->mi, &de->ref); + + if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) + return false; + } + + return true; +} + +/* + * ni_add_name - Add new name into MFT and into directory. + */ +int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de) +{ + int err; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + struct mft_inode *mi; + struct ATTR_FILE_NAME *fname; + struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1); + u16 de_key_size = le16_to_cpu(de->key_size); + + mi_get_ref(&ni->mi, &de->ref); + mi_get_ref(&dir_ni->mi, &de_name->home); + + /* Fill duplicate from any ATTR_NAME. */ + fname = ni_fname_name(ni, NULL, NULL, NULL, NULL); + if (fname) + memcpy(&de_name->dup, &fname->dup, sizeof(fname->dup)); + de_name->dup.fa = ni->std_fa; + + /* Insert new name into MFT. */ + err = ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, &attr, + &mi, &le); + if (err) + return err; + + memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size); + + /* Insert new name into directory. */ + err = indx_insert_entry(&dir_ni->dir, dir_ni, de, ni->mi.sbi, NULL, 0); + if (err) + ni_remove_attr_le(ni, attr, mi, le); + + return err; +} + +/* + * ni_rename - Remove one name and insert new name. + */ +int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, + struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de, + bool *is_bad) +{ + int err; + struct NTFS_DE *de2 = NULL; + int undo = 0; + + /* + * There are two possible ways to rename: + * 1) Add new name and remove old name. + * 2) Remove old name and add new name. + * + * In most cases (not all!) adding new name into MFT and into directory can + * allocate additional cluster(s). + * Second way may result to bad inode if we can't add new name + * and then can't restore (add) old name. + */ + + /* + * Way 1 - Add new + remove old. + */ + err = ni_add_name(new_dir_ni, ni, new_de); + if (!err) { + err = ni_remove_name(dir_ni, ni, de, &de2, &undo); + if (err && ni_remove_name(new_dir_ni, ni, new_de, &de2, &undo)) + *is_bad = true; + } + + /* + * Way 2 - Remove old + add new. + */ + /* + * err = ni_remove_name(dir_ni, ni, de, &de2, &undo); + * if (!err) { + * err = ni_add_name(new_dir_ni, ni, new_de); + * if (err && !ni_remove_name_undo(dir_ni, ni, de, de2, undo)) + * *is_bad = true; + * } + */ + + return err; +} + +/* + * ni_is_dirty - Return: True if 'ni' requires ni_write_inode. + */ +bool ni_is_dirty(struct inode *inode) +{ + struct ntfs_inode *ni = ntfs_i(inode); + struct rb_node *node; + + if (ni->mi.dirty || ni->attr_list.dirty || + (ni->ni_flags & NI_FLAG_UPDATE_PARENT)) + return true; + + for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { + if (rb_entry(node, struct mft_inode, node)->dirty) + return true; + } + + return false; +} + +/* + * ni_update_parent + * + * Update duplicate info of ATTR_FILE_NAME in MFT and in parent directories. + */ +static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, + int sync) +{ + struct ATTRIB *attr; + struct mft_inode *mi; + struct ATTR_LIST_ENTRY *le = NULL; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct super_block *sb = sbi->sb; + bool re_dirty = false; + + if (ni->mi.mrec->flags & RECORD_FLAG_DIR) { + dup->fa |= FILE_ATTRIBUTE_DIRECTORY; + attr = NULL; + dup->alloc_size = 0; + dup->data_size = 0; + } else { + dup->fa &= ~FILE_ATTRIBUTE_DIRECTORY; + + attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, + &mi); + if (!attr) { + dup->alloc_size = dup->data_size = 0; + } else if (!attr->non_res) { + u32 data_size = le32_to_cpu(attr->res.data_size); + + dup->alloc_size = cpu_to_le64(ALIGN(data_size, 8)); + dup->data_size = cpu_to_le64(data_size); + } else { + u64 new_valid = ni->i_valid; + u64 data_size = le64_to_cpu(attr->nres.data_size); + __le64 valid_le; + + dup->alloc_size = is_attr_ext(attr) + ? attr->nres.total_size + : attr->nres.alloc_size; + dup->data_size = attr->nres.data_size; + + if (new_valid > data_size) + new_valid = data_size; + + valid_le = cpu_to_le64(new_valid); + if (valid_le != attr->nres.valid_size) { + attr->nres.valid_size = valid_le; + mi->dirty = true; + } + } + } + + /* TODO: Fill reparse info. */ + dup->reparse = 0; + dup->ea_size = 0; + + if (ni->ni_flags & NI_FLAG_EA) { + attr = ni_find_attr(ni, attr, &le, ATTR_EA_INFO, NULL, 0, NULL, + NULL); + if (attr) { + const struct EA_INFO *info; + + info = resident_data_ex(attr, sizeof(struct EA_INFO)); + /* If ATTR_EA_INFO exists 'info' can't be NULL. */ + if (info) + dup->ea_size = info->size_pack; + } + } + + attr = NULL; + le = NULL; + + while ((attr = ni_find_attr(ni, attr, &le, ATTR_NAME, NULL, 0, NULL, + &mi))) { + struct inode *dir; + struct ATTR_FILE_NAME *fname; + + fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); + if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup))) + continue; + + /* Check simple case when parent inode equals current inode. */ + if (ino_get(&fname->home) == ni->vfs_inode.i_ino) { + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + continue; + } + + /* ntfs_iget5 may sleep. */ + dir = ntfs_iget5(sb, &fname->home, NULL); + if (IS_ERR(dir)) { + ntfs_inode_warn( + &ni->vfs_inode, + "failed to open parent directory r=%lx to update", + (long)ino_get(&fname->home)); + continue; + } + + if (!is_bad_inode(dir)) { + struct ntfs_inode *dir_ni = ntfs_i(dir); + + if (!ni_trylock(dir_ni)) { + re_dirty = true; + } else { + indx_update_dup(dir_ni, sbi, fname, dup, sync); + ni_unlock(dir_ni); + memcpy(&fname->dup, dup, sizeof(fname->dup)); + mi->dirty = true; + } + } + iput(dir); + } + + return re_dirty; +} + +/* + * ni_write_inode - Write MFT base record and all subrecords to disk. + */ +int ni_write_inode(struct inode *inode, int sync, const char *hint) +{ + int err = 0, err2; + struct ntfs_inode *ni = ntfs_i(inode); + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + bool re_dirty = false; + struct ATTR_STD_INFO *std; + struct rb_node *node, *next; + struct NTFS_DUP_INFO dup; + + if (is_bad_inode(inode) || sb_rdonly(sb)) + return 0; + + if (!ni_trylock(ni)) { + /* 'ni' is under modification, skip for now. */ + mark_inode_dirty_sync(inode); + return 0; + } + + if (is_rec_inuse(ni->mi.mrec) && + !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) { + bool modified = false; + + /* Update times in standard attribute. */ + std = ni_std(ni); + if (!std) { + err = -EINVAL; + goto out; + } + + /* Update the access times if they have changed. */ + dup.m_time = kernel2nt(&inode->i_mtime); + if (std->m_time != dup.m_time) { + std->m_time = dup.m_time; + modified = true; + } + + dup.c_time = kernel2nt(&inode->i_ctime); + if (std->c_time != dup.c_time) { + std->c_time = dup.c_time; + modified = true; + } + + dup.a_time = kernel2nt(&inode->i_atime); + if (std->a_time != dup.a_time) { + std->a_time = dup.a_time; + modified = true; + } + + dup.fa = ni->std_fa; + if (std->fa != dup.fa) { + std->fa = dup.fa; + modified = true; + } + + if (modified) + ni->mi.dirty = true; + + if (!ntfs_is_meta_file(sbi, inode->i_ino) && + (modified || (ni->ni_flags & NI_FLAG_UPDATE_PARENT)) + /* Avoid __wait_on_freeing_inode(inode). */ + && (sb->s_flags & SB_ACTIVE)) { + dup.cr_time = std->cr_time; + /* Not critical if this function fail. */ + re_dirty = ni_update_parent(ni, &dup, sync); + + if (re_dirty) + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + else + ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT; + } + + /* Update attribute list. */ + if (ni->attr_list.size && ni->attr_list.dirty) { + if (inode->i_ino != MFT_REC_MFT || sync) { + err = ni_try_remove_attr_list(ni); + if (err) + goto out; + } + + err = al_update(ni, sync); + if (err) + goto out; + } + } + + for (node = rb_first(&ni->mi_tree); node; node = next) { + struct mft_inode *mi = rb_entry(node, struct mft_inode, node); + bool is_empty; + + next = rb_next(node); + + if (!mi->dirty) + continue; + + is_empty = !mi_enum_attr(mi, NULL); + + if (is_empty) + clear_rec_inuse(mi->mrec); + + err2 = mi_write(mi, sync); + if (!err && err2) + err = err2; + + if (is_empty) { + ntfs_mark_rec_free(sbi, mi->rno, false); + rb_erase(node, &ni->mi_tree); + mi_put(mi); + } + } + + if (ni->mi.dirty) { + err2 = mi_write(&ni->mi, sync); + if (!err && err2) + err = err2; + } +out: + ni_unlock(ni); + + if (err) { + ntfs_err(sb, "%s r=%lx failed, %d.", hint, inode->i_ino, err); + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + return err; + } + + if (re_dirty) + mark_inode_dirty_sync(inode); + + return 0; +} diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c new file mode 100644 index 000000000..710cb5aa5 --- /dev/null +++ b/fs/ntfs3/fslog.c @@ -0,0 +1,5210 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/blkdev.h> +#include <linux/fs.h> +#include <linux/random.h> +#include <linux/slab.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * LOG FILE structs + */ + +// clang-format off + +#define MaxLogFileSize 0x100000000ull +#define DefaultLogPageSize 4096 +#define MinLogRecordPages 0x30 + +struct RESTART_HDR { + struct NTFS_RECORD_HEADER rhdr; // 'RSTR' + __le32 sys_page_size; // 0x10: Page size of the system which initialized the log. + __le32 page_size; // 0x14: Log page size used for this log file. + __le16 ra_off; // 0x18: + __le16 minor_ver; // 0x1A: + __le16 major_ver; // 0x1C: + __le16 fixups[]; +}; + +#define LFS_NO_CLIENT 0xffff +#define LFS_NO_CLIENT_LE cpu_to_le16(0xffff) + +struct CLIENT_REC { + __le64 oldest_lsn; + __le64 restart_lsn; // 0x08: + __le16 prev_client; // 0x10: + __le16 next_client; // 0x12: + __le16 seq_num; // 0x14: + u8 align[6]; // 0x16: + __le32 name_bytes; // 0x1C: In bytes. + __le16 name[32]; // 0x20: Name of client. +}; + +static_assert(sizeof(struct CLIENT_REC) == 0x60); + +/* Two copies of these will exist at the beginning of the log file */ +struct RESTART_AREA { + __le64 current_lsn; // 0x00: Current logical end of log file. + __le16 log_clients; // 0x08: Maximum number of clients. + __le16 client_idx[2]; // 0x0A: Free/use index into the client record arrays. + __le16 flags; // 0x0E: See RESTART_SINGLE_PAGE_IO. + __le32 seq_num_bits; // 0x10: The number of bits in sequence number. + __le16 ra_len; // 0x14: + __le16 client_off; // 0x16: + __le64 l_size; // 0x18: Usable log file size. + __le32 last_lsn_data_len; // 0x20: + __le16 rec_hdr_len; // 0x24: Log page data offset. + __le16 data_off; // 0x26: Log page data length. + __le32 open_log_count; // 0x28: + __le32 align[5]; // 0x2C: + struct CLIENT_REC clients[]; // 0x40: +}; + +struct LOG_REC_HDR { + __le16 redo_op; // 0x00: NTFS_LOG_OPERATION + __le16 undo_op; // 0x02: NTFS_LOG_OPERATION + __le16 redo_off; // 0x04: Offset to Redo record. + __le16 redo_len; // 0x06: Redo length. + __le16 undo_off; // 0x08: Offset to Undo record. + __le16 undo_len; // 0x0A: Undo length. + __le16 target_attr; // 0x0C: + __le16 lcns_follow; // 0x0E: + __le16 record_off; // 0x10: + __le16 attr_off; // 0x12: + __le16 cluster_off; // 0x14: + __le16 reserved; // 0x16: + __le64 target_vcn; // 0x18: + __le64 page_lcns[]; // 0x20: +}; + +static_assert(sizeof(struct LOG_REC_HDR) == 0x20); + +#define RESTART_ENTRY_ALLOCATED 0xFFFFFFFF +#define RESTART_ENTRY_ALLOCATED_LE cpu_to_le32(0xFFFFFFFF) + +struct RESTART_TABLE { + __le16 size; // 0x00: In bytes + __le16 used; // 0x02: Entries + __le16 total; // 0x04: Entries + __le16 res[3]; // 0x06: + __le32 free_goal; // 0x0C: + __le32 first_free; // 0x10: + __le32 last_free; // 0x14: + +}; + +static_assert(sizeof(struct RESTART_TABLE) == 0x18); + +struct ATTR_NAME_ENTRY { + __le16 off; // Offset in the Open attribute Table. + __le16 name_bytes; + __le16 name[]; +}; + +struct OPEN_ATTR_ENRTY { + __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated + __le32 bytes_per_index; // 0x04: + enum ATTR_TYPE type; // 0x08: + u8 is_dirty_pages; // 0x0C: + u8 is_attr_name; // 0x0B: Faked field to manage 'ptr' + u8 name_len; // 0x0C: Faked field to manage 'ptr' + u8 res; + struct MFT_REF ref; // 0x10: File Reference of file containing attribute + __le64 open_record_lsn; // 0x18: + void *ptr; // 0x20: +}; + +/* 32 bit version of 'struct OPEN_ATTR_ENRTY' */ +struct OPEN_ATTR_ENRTY_32 { + __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated + __le32 ptr; // 0x04: + struct MFT_REF ref; // 0x08: + __le64 open_record_lsn; // 0x10: + u8 is_dirty_pages; // 0x18: + u8 is_attr_name; // 0x19: + u8 res1[2]; + enum ATTR_TYPE type; // 0x1C: + u8 name_len; // 0x20: In wchar + u8 res2[3]; + __le32 AttributeName; // 0x24: + __le32 bytes_per_index; // 0x28: +}; + +#define SIZEOF_OPENATTRIBUTEENTRY0 0x2c +// static_assert( 0x2C == sizeof(struct OPEN_ATTR_ENRTY_32) ); +static_assert(sizeof(struct OPEN_ATTR_ENRTY) < SIZEOF_OPENATTRIBUTEENTRY0); + +/* + * One entry exists in the Dirty Pages Table for each page which is dirty at + * the time the Restart Area is written. + */ +struct DIR_PAGE_ENTRY { + __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated + __le32 target_attr; // 0x04: Index into the Open attribute Table + __le32 transfer_len; // 0x08: + __le32 lcns_follow; // 0x0C: + __le64 vcn; // 0x10: Vcn of dirty page + __le64 oldest_lsn; // 0x18: + __le64 page_lcns[]; // 0x20: +}; + +static_assert(sizeof(struct DIR_PAGE_ENTRY) == 0x20); + +/* 32 bit version of 'struct DIR_PAGE_ENTRY' */ +struct DIR_PAGE_ENTRY_32 { + __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated + __le32 target_attr; // 0x04: Index into the Open attribute Table + __le32 transfer_len; // 0x08: + __le32 lcns_follow; // 0x0C: + __le32 reserved; // 0x10: + __le32 vcn_low; // 0x14: Vcn of dirty page + __le32 vcn_hi; // 0x18: Vcn of dirty page + __le32 oldest_lsn_low; // 0x1C: + __le32 oldest_lsn_hi; // 0x1C: + __le32 page_lcns_low; // 0x24: + __le32 page_lcns_hi; // 0x24: +}; + +static_assert(offsetof(struct DIR_PAGE_ENTRY_32, vcn_low) == 0x14); +static_assert(sizeof(struct DIR_PAGE_ENTRY_32) == 0x2c); + +enum transact_state { + TransactionUninitialized = 0, + TransactionActive, + TransactionPrepared, + TransactionCommitted +}; + +struct TRANSACTION_ENTRY { + __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated + u8 transact_state; // 0x04: + u8 reserved[3]; // 0x05: + __le64 first_lsn; // 0x08: + __le64 prev_lsn; // 0x10: + __le64 undo_next_lsn; // 0x18: + __le32 undo_records; // 0x20: Number of undo log records pending abort + __le32 undo_len; // 0x24: Total undo size +}; + +static_assert(sizeof(struct TRANSACTION_ENTRY) == 0x28); + +struct NTFS_RESTART { + __le32 major_ver; // 0x00: + __le32 minor_ver; // 0x04: + __le64 check_point_start; // 0x08: + __le64 open_attr_table_lsn; // 0x10: + __le64 attr_names_lsn; // 0x18: + __le64 dirty_pages_table_lsn; // 0x20: + __le64 transact_table_lsn; // 0x28: + __le32 open_attr_len; // 0x30: In bytes + __le32 attr_names_len; // 0x34: In bytes + __le32 dirty_pages_len; // 0x38: In bytes + __le32 transact_table_len; // 0x3C: In bytes +}; + +static_assert(sizeof(struct NTFS_RESTART) == 0x40); + +struct NEW_ATTRIBUTE_SIZES { + __le64 alloc_size; + __le64 valid_size; + __le64 data_size; + __le64 total_size; +}; + +struct BITMAP_RANGE { + __le32 bitmap_off; + __le32 bits; +}; + +struct LCN_RANGE { + __le64 lcn; + __le64 len; +}; + +/* The following type defines the different log record types. */ +#define LfsClientRecord cpu_to_le32(1) +#define LfsClientRestart cpu_to_le32(2) + +/* This is used to uniquely identify a client for a particular log file. */ +struct CLIENT_ID { + __le16 seq_num; + __le16 client_idx; +}; + +/* This is the header that begins every Log Record in the log file. */ +struct LFS_RECORD_HDR { + __le64 this_lsn; // 0x00: + __le64 client_prev_lsn; // 0x08: + __le64 client_undo_next_lsn; // 0x10: + __le32 client_data_len; // 0x18: + struct CLIENT_ID client; // 0x1C: Owner of this log record. + __le32 record_type; // 0x20: LfsClientRecord or LfsClientRestart. + __le32 transact_id; // 0x24: + __le16 flags; // 0x28: LOG_RECORD_MULTI_PAGE + u8 align[6]; // 0x2A: +}; + +#define LOG_RECORD_MULTI_PAGE cpu_to_le16(1) + +static_assert(sizeof(struct LFS_RECORD_HDR) == 0x30); + +struct LFS_RECORD { + __le16 next_record_off; // 0x00: Offset of the free space in the page, + u8 align[6]; // 0x02: + __le64 last_end_lsn; // 0x08: lsn for the last log record which ends on the page, +}; + +static_assert(sizeof(struct LFS_RECORD) == 0x10); + +struct RECORD_PAGE_HDR { + struct NTFS_RECORD_HEADER rhdr; // 'RCRD' + __le32 rflags; // 0x10: See LOG_PAGE_LOG_RECORD_END + __le16 page_count; // 0x14: + __le16 page_pos; // 0x16: + struct LFS_RECORD record_hdr; // 0x18: + __le16 fixups[10]; // 0x28: + __le32 file_off; // 0x3c: Used when major version >= 2 +}; + +// clang-format on + +// Page contains the end of a log record. +#define LOG_PAGE_LOG_RECORD_END cpu_to_le32(0x00000001) + +static inline bool is_log_record_end(const struct RECORD_PAGE_HDR *hdr) +{ + return hdr->rflags & LOG_PAGE_LOG_RECORD_END; +} + +static_assert(offsetof(struct RECORD_PAGE_HDR, file_off) == 0x3c); + +/* + * END of NTFS LOG structures + */ + +/* Define some tuning parameters to keep the restart tables a reasonable size. */ +#define INITIAL_NUMBER_TRANSACTIONS 5 + +enum NTFS_LOG_OPERATION { + + Noop = 0x00, + CompensationLogRecord = 0x01, + InitializeFileRecordSegment = 0x02, + DeallocateFileRecordSegment = 0x03, + WriteEndOfFileRecordSegment = 0x04, + CreateAttribute = 0x05, + DeleteAttribute = 0x06, + UpdateResidentValue = 0x07, + UpdateNonresidentValue = 0x08, + UpdateMappingPairs = 0x09, + DeleteDirtyClusters = 0x0A, + SetNewAttributeSizes = 0x0B, + AddIndexEntryRoot = 0x0C, + DeleteIndexEntryRoot = 0x0D, + AddIndexEntryAllocation = 0x0E, + DeleteIndexEntryAllocation = 0x0F, + WriteEndOfIndexBuffer = 0x10, + SetIndexEntryVcnRoot = 0x11, + SetIndexEntryVcnAllocation = 0x12, + UpdateFileNameRoot = 0x13, + UpdateFileNameAllocation = 0x14, + SetBitsInNonresidentBitMap = 0x15, + ClearBitsInNonresidentBitMap = 0x16, + HotFix = 0x17, + EndTopLevelAction = 0x18, + PrepareTransaction = 0x19, + CommitTransaction = 0x1A, + ForgetTransaction = 0x1B, + OpenNonresidentAttribute = 0x1C, + OpenAttributeTableDump = 0x1D, + AttributeNamesDump = 0x1E, + DirtyPageTableDump = 0x1F, + TransactionTableDump = 0x20, + UpdateRecordDataRoot = 0x21, + UpdateRecordDataAllocation = 0x22, + + UpdateRelativeDataInIndex = + 0x23, // NtOfsRestartUpdateRelativeDataInIndex + UpdateRelativeDataInIndex2 = 0x24, + ZeroEndOfFileRecord = 0x25, +}; + +/* + * Array for log records which require a target attribute. + * A true indicates that the corresponding restart operation + * requires a target attribute. + */ +static const u8 AttributeRequired[] = { + 0xFC, 0xFB, 0xFF, 0x10, 0x06, +}; + +static inline bool is_target_required(u16 op) +{ + bool ret = op <= UpdateRecordDataAllocation && + (AttributeRequired[op >> 3] >> (op & 7) & 1); + return ret; +} + +static inline bool can_skip_action(enum NTFS_LOG_OPERATION op) +{ + switch (op) { + case Noop: + case DeleteDirtyClusters: + case HotFix: + case EndTopLevelAction: + case PrepareTransaction: + case CommitTransaction: + case ForgetTransaction: + case CompensationLogRecord: + case OpenNonresidentAttribute: + case OpenAttributeTableDump: + case AttributeNamesDump: + case DirtyPageTableDump: + case TransactionTableDump: + return true; + default: + return false; + } +} + +enum { lcb_ctx_undo_next, lcb_ctx_prev, lcb_ctx_next }; + +/* Bytes per restart table. */ +static inline u32 bytes_per_rt(const struct RESTART_TABLE *rt) +{ + return le16_to_cpu(rt->used) * le16_to_cpu(rt->size) + + sizeof(struct RESTART_TABLE); +} + +/* Log record length. */ +static inline u32 lrh_length(const struct LOG_REC_HDR *lr) +{ + u16 t16 = le16_to_cpu(lr->lcns_follow); + + return struct_size(lr, page_lcns, max_t(u16, 1, t16)); +} + +struct lcb { + struct LFS_RECORD_HDR *lrh; // Log record header of the current lsn. + struct LOG_REC_HDR *log_rec; + u32 ctx_mode; // lcb_ctx_undo_next/lcb_ctx_prev/lcb_ctx_next + struct CLIENT_ID client; + bool alloc; // If true the we should deallocate 'log_rec'. +}; + +static void lcb_put(struct lcb *lcb) +{ + if (lcb->alloc) + kfree(lcb->log_rec); + kfree(lcb->lrh); + kfree(lcb); +} + +/* Find the oldest lsn from active clients. */ +static inline void oldest_client_lsn(const struct CLIENT_REC *ca, + __le16 next_client, u64 *oldest_lsn) +{ + while (next_client != LFS_NO_CLIENT_LE) { + const struct CLIENT_REC *cr = ca + le16_to_cpu(next_client); + u64 lsn = le64_to_cpu(cr->oldest_lsn); + + /* Ignore this block if it's oldest lsn is 0. */ + if (lsn && lsn < *oldest_lsn) + *oldest_lsn = lsn; + + next_client = cr->next_client; + } +} + +static inline bool is_rst_page_hdr_valid(u32 file_off, + const struct RESTART_HDR *rhdr) +{ + u32 sys_page = le32_to_cpu(rhdr->sys_page_size); + u32 page_size = le32_to_cpu(rhdr->page_size); + u32 end_usa; + u16 ro; + + if (sys_page < SECTOR_SIZE || page_size < SECTOR_SIZE || + sys_page & (sys_page - 1) || page_size & (page_size - 1)) { + return false; + } + + /* Check that if the file offset isn't 0, it is the system page size. */ + if (file_off && file_off != sys_page) + return false; + + /* Check support version 1.1+. */ + if (le16_to_cpu(rhdr->major_ver) <= 1 && !rhdr->minor_ver) + return false; + + if (le16_to_cpu(rhdr->major_ver) > 2) + return false; + + ro = le16_to_cpu(rhdr->ra_off); + if (!IS_ALIGNED(ro, 8) || ro > sys_page) + return false; + + end_usa = ((sys_page >> SECTOR_SHIFT) + 1) * sizeof(short); + end_usa += le16_to_cpu(rhdr->rhdr.fix_off); + + if (ro < end_usa) + return false; + + return true; +} + +static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) +{ + const struct RESTART_AREA *ra; + u16 cl, fl, ul; + u32 off, l_size, file_dat_bits, file_size_round; + u16 ro = le16_to_cpu(rhdr->ra_off); + u32 sys_page = le32_to_cpu(rhdr->sys_page_size); + + if (ro + offsetof(struct RESTART_AREA, l_size) > + SECTOR_SIZE - sizeof(short)) + return false; + + ra = Add2Ptr(rhdr, ro); + cl = le16_to_cpu(ra->log_clients); + + if (cl > 1) + return false; + + off = le16_to_cpu(ra->client_off); + + if (!IS_ALIGNED(off, 8) || ro + off > SECTOR_SIZE - sizeof(short)) + return false; + + off += cl * sizeof(struct CLIENT_REC); + + if (off > sys_page) + return false; + + /* + * Check the restart length field and whether the entire + * restart area is contained that length. + */ + if (le16_to_cpu(rhdr->ra_off) + le16_to_cpu(ra->ra_len) > sys_page || + off > le16_to_cpu(ra->ra_len)) { + return false; + } + + /* + * As a final check make sure that the use list and the free list + * are either empty or point to a valid client. + */ + fl = le16_to_cpu(ra->client_idx[0]); + ul = le16_to_cpu(ra->client_idx[1]); + if ((fl != LFS_NO_CLIENT && fl >= cl) || + (ul != LFS_NO_CLIENT && ul >= cl)) + return false; + + /* Make sure the sequence number bits match the log file size. */ + l_size = le64_to_cpu(ra->l_size); + + file_dat_bits = sizeof(u64) * 8 - le32_to_cpu(ra->seq_num_bits); + file_size_round = 1u << (file_dat_bits + 3); + if (file_size_round != l_size && + (file_size_round < l_size || (file_size_round / 2) > l_size)) { + return false; + } + + /* The log page data offset and record header length must be quad-aligned. */ + if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) || + !IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8)) + return false; + + return true; +} + +static inline bool is_client_area_valid(const struct RESTART_HDR *rhdr, + bool usa_error) +{ + u16 ro = le16_to_cpu(rhdr->ra_off); + const struct RESTART_AREA *ra = Add2Ptr(rhdr, ro); + u16 ra_len = le16_to_cpu(ra->ra_len); + const struct CLIENT_REC *ca; + u32 i; + + if (usa_error && ra_len + ro > SECTOR_SIZE - sizeof(short)) + return false; + + /* Find the start of the client array. */ + ca = Add2Ptr(ra, le16_to_cpu(ra->client_off)); + + /* + * Start with the free list. + * Check that all the clients are valid and that there isn't a cycle. + * Do the in-use list on the second pass. + */ + for (i = 0; i < 2; i++) { + u16 client_idx = le16_to_cpu(ra->client_idx[i]); + bool first_client = true; + u16 clients = le16_to_cpu(ra->log_clients); + + while (client_idx != LFS_NO_CLIENT) { + const struct CLIENT_REC *cr; + + if (!clients || + client_idx >= le16_to_cpu(ra->log_clients)) + return false; + + clients -= 1; + cr = ca + client_idx; + + client_idx = le16_to_cpu(cr->next_client); + + if (first_client) { + first_client = false; + if (cr->prev_client != LFS_NO_CLIENT_LE) + return false; + } + } + } + + return true; +} + +/* + * remove_client + * + * Remove a client record from a client record list an restart area. + */ +static inline void remove_client(struct CLIENT_REC *ca, + const struct CLIENT_REC *cr, __le16 *head) +{ + if (cr->prev_client == LFS_NO_CLIENT_LE) + *head = cr->next_client; + else + ca[le16_to_cpu(cr->prev_client)].next_client = cr->next_client; + + if (cr->next_client != LFS_NO_CLIENT_LE) + ca[le16_to_cpu(cr->next_client)].prev_client = cr->prev_client; +} + +/* + * add_client - Add a client record to the start of a list. + */ +static inline void add_client(struct CLIENT_REC *ca, u16 index, __le16 *head) +{ + struct CLIENT_REC *cr = ca + index; + + cr->prev_client = LFS_NO_CLIENT_LE; + cr->next_client = *head; + + if (*head != LFS_NO_CLIENT_LE) + ca[le16_to_cpu(*head)].prev_client = cpu_to_le16(index); + + *head = cpu_to_le16(index); +} + +static inline void *enum_rstbl(struct RESTART_TABLE *t, void *c) +{ + __le32 *e; + u32 bprt; + u16 rsize = t ? le16_to_cpu(t->size) : 0; + + if (!c) { + if (!t || !t->total) + return NULL; + e = Add2Ptr(t, sizeof(struct RESTART_TABLE)); + } else { + e = Add2Ptr(c, rsize); + } + + /* Loop until we hit the first one allocated, or the end of the list. */ + for (bprt = bytes_per_rt(t); PtrOffset(t, e) < bprt; + e = Add2Ptr(e, rsize)) { + if (*e == RESTART_ENTRY_ALLOCATED_LE) + return e; + } + return NULL; +} + +/* + * find_dp - Search for a @vcn in Dirty Page Table. + */ +static inline struct DIR_PAGE_ENTRY *find_dp(struct RESTART_TABLE *dptbl, + u32 target_attr, u64 vcn) +{ + __le32 ta = cpu_to_le32(target_attr); + struct DIR_PAGE_ENTRY *dp = NULL; + + while ((dp = enum_rstbl(dptbl, dp))) { + u64 dp_vcn = le64_to_cpu(dp->vcn); + + if (dp->target_attr == ta && vcn >= dp_vcn && + vcn < dp_vcn + le32_to_cpu(dp->lcns_follow)) { + return dp; + } + } + return NULL; +} + +static inline u32 norm_file_page(u32 page_size, u32 *l_size, bool use_default) +{ + if (use_default) + page_size = DefaultLogPageSize; + + /* Round the file size down to a system page boundary. */ + *l_size &= ~(page_size - 1); + + /* File should contain at least 2 restart pages and MinLogRecordPages pages. */ + if (*l_size < (MinLogRecordPages + 2) * page_size) + return 0; + + return page_size; +} + +static bool check_log_rec(const struct LOG_REC_HDR *lr, u32 bytes, u32 tr, + u32 bytes_per_attr_entry) +{ + u16 t16; + + if (bytes < sizeof(struct LOG_REC_HDR)) + return false; + if (!tr) + return false; + + if ((tr - sizeof(struct RESTART_TABLE)) % + sizeof(struct TRANSACTION_ENTRY)) + return false; + + if (le16_to_cpu(lr->redo_off) & 7) + return false; + + if (le16_to_cpu(lr->undo_off) & 7) + return false; + + if (lr->target_attr) + goto check_lcns; + + if (is_target_required(le16_to_cpu(lr->redo_op))) + return false; + + if (is_target_required(le16_to_cpu(lr->undo_op))) + return false; + +check_lcns: + if (!lr->lcns_follow) + goto check_length; + + t16 = le16_to_cpu(lr->target_attr); + if ((t16 - sizeof(struct RESTART_TABLE)) % bytes_per_attr_entry) + return false; + +check_length: + if (bytes < lrh_length(lr)) + return false; + + return true; +} + +static bool check_rstbl(const struct RESTART_TABLE *rt, size_t bytes) +{ + u32 ts; + u32 i, off; + u16 rsize = le16_to_cpu(rt->size); + u16 ne = le16_to_cpu(rt->used); + u32 ff = le32_to_cpu(rt->first_free); + u32 lf = le32_to_cpu(rt->last_free); + + ts = rsize * ne + sizeof(struct RESTART_TABLE); + + if (!rsize || rsize > bytes || + rsize + sizeof(struct RESTART_TABLE) > bytes || bytes < ts || + le16_to_cpu(rt->total) > ne || ff > ts || lf > ts || + (ff && ff < sizeof(struct RESTART_TABLE)) || + (lf && lf < sizeof(struct RESTART_TABLE))) { + return false; + } + + /* + * Verify each entry is either allocated or points + * to a valid offset the table. + */ + for (i = 0; i < ne; i++) { + off = le32_to_cpu(*(__le32 *)Add2Ptr( + rt, i * rsize + sizeof(struct RESTART_TABLE))); + + if (off != RESTART_ENTRY_ALLOCATED && off && + (off < sizeof(struct RESTART_TABLE) || + ((off - sizeof(struct RESTART_TABLE)) % rsize))) { + return false; + } + } + + /* + * Walk through the list headed by the first entry to make + * sure none of the entries are currently being used. + */ + for (off = ff; off;) { + if (off == RESTART_ENTRY_ALLOCATED) + return false; + + off = le32_to_cpu(*(__le32 *)Add2Ptr(rt, off)); + } + + return true; +} + +/* + * free_rsttbl_idx - Free a previously allocated index a Restart Table. + */ +static inline void free_rsttbl_idx(struct RESTART_TABLE *rt, u32 off) +{ + __le32 *e; + u32 lf = le32_to_cpu(rt->last_free); + __le32 off_le = cpu_to_le32(off); + + e = Add2Ptr(rt, off); + + if (off < le32_to_cpu(rt->free_goal)) { + *e = rt->first_free; + rt->first_free = off_le; + if (!lf) + rt->last_free = off_le; + } else { + if (lf) + *(__le32 *)Add2Ptr(rt, lf) = off_le; + else + rt->first_free = off_le; + + rt->last_free = off_le; + *e = 0; + } + + le16_sub_cpu(&rt->total, 1); +} + +static inline struct RESTART_TABLE *init_rsttbl(u16 esize, u16 used) +{ + __le32 *e, *last_free; + u32 off; + u32 bytes = esize * used + sizeof(struct RESTART_TABLE); + u32 lf = sizeof(struct RESTART_TABLE) + (used - 1) * esize; + struct RESTART_TABLE *t = kzalloc(bytes, GFP_NOFS); + + if (!t) + return NULL; + + t->size = cpu_to_le16(esize); + t->used = cpu_to_le16(used); + t->free_goal = cpu_to_le32(~0u); + t->first_free = cpu_to_le32(sizeof(struct RESTART_TABLE)); + t->last_free = cpu_to_le32(lf); + + e = (__le32 *)(t + 1); + last_free = Add2Ptr(t, lf); + + for (off = sizeof(struct RESTART_TABLE) + esize; e < last_free; + e = Add2Ptr(e, esize), off += esize) { + *e = cpu_to_le32(off); + } + return t; +} + +static inline struct RESTART_TABLE *extend_rsttbl(struct RESTART_TABLE *tbl, + u32 add, u32 free_goal) +{ + u16 esize = le16_to_cpu(tbl->size); + __le32 osize = cpu_to_le32(bytes_per_rt(tbl)); + u32 used = le16_to_cpu(tbl->used); + struct RESTART_TABLE *rt; + + rt = init_rsttbl(esize, used + add); + if (!rt) + return NULL; + + memcpy(rt + 1, tbl + 1, esize * used); + + rt->free_goal = free_goal == ~0u + ? cpu_to_le32(~0u) + : cpu_to_le32(sizeof(struct RESTART_TABLE) + + free_goal * esize); + + if (tbl->first_free) { + rt->first_free = tbl->first_free; + *(__le32 *)Add2Ptr(rt, le32_to_cpu(tbl->last_free)) = osize; + } else { + rt->first_free = osize; + } + + rt->total = tbl->total; + + kfree(tbl); + return rt; +} + +/* + * alloc_rsttbl_idx + * + * Allocate an index from within a previously initialized Restart Table. + */ +static inline void *alloc_rsttbl_idx(struct RESTART_TABLE **tbl) +{ + u32 off; + __le32 *e; + struct RESTART_TABLE *t = *tbl; + + if (!t->first_free) { + *tbl = t = extend_rsttbl(t, 16, ~0u); + if (!t) + return NULL; + } + + off = le32_to_cpu(t->first_free); + + /* Dequeue this entry and zero it. */ + e = Add2Ptr(t, off); + + t->first_free = *e; + + memset(e, 0, le16_to_cpu(t->size)); + + *e = RESTART_ENTRY_ALLOCATED_LE; + + /* If list is going empty, then we fix the last_free as well. */ + if (!t->first_free) + t->last_free = 0; + + le16_add_cpu(&t->total, 1); + + return Add2Ptr(t, off); +} + +/* + * alloc_rsttbl_from_idx + * + * Allocate a specific index from within a previously initialized Restart Table. + */ +static inline void *alloc_rsttbl_from_idx(struct RESTART_TABLE **tbl, u32 vbo) +{ + u32 off; + __le32 *e; + struct RESTART_TABLE *rt = *tbl; + u32 bytes = bytes_per_rt(rt); + u16 esize = le16_to_cpu(rt->size); + + /* If the entry is not the table, we will have to extend the table. */ + if (vbo >= bytes) { + /* + * Extend the size by computing the number of entries between + * the existing size and the desired index and adding 1 to that. + */ + u32 bytes2idx = vbo - bytes; + + /* + * There should always be an integral number of entries + * being added. Now extend the table. + */ + *tbl = rt = extend_rsttbl(rt, bytes2idx / esize + 1, bytes); + if (!rt) + return NULL; + } + + /* See if the entry is already allocated, and just return if it is. */ + e = Add2Ptr(rt, vbo); + + if (*e == RESTART_ENTRY_ALLOCATED_LE) + return e; + + /* + * Walk through the table, looking for the entry we're + * interested and the previous entry. + */ + off = le32_to_cpu(rt->first_free); + e = Add2Ptr(rt, off); + + if (off == vbo) { + /* this is a match */ + rt->first_free = *e; + goto skip_looking; + } + + /* + * Need to walk through the list looking for the predecessor + * of our entry. + */ + for (;;) { + /* Remember the entry just found */ + u32 last_off = off; + __le32 *last_e = e; + + /* Should never run of entries. */ + + /* Lookup up the next entry the list. */ + off = le32_to_cpu(*last_e); + e = Add2Ptr(rt, off); + + /* If this is our match we are done. */ + if (off == vbo) { + *last_e = *e; + + /* + * If this was the last entry, we update that + * table as well. + */ + if (le32_to_cpu(rt->last_free) == off) + rt->last_free = cpu_to_le32(last_off); + break; + } + } + +skip_looking: + /* If the list is now empty, we fix the last_free as well. */ + if (!rt->first_free) + rt->last_free = 0; + + /* Zero this entry. */ + memset(e, 0, esize); + *e = RESTART_ENTRY_ALLOCATED_LE; + + le16_add_cpu(&rt->total, 1); + + return e; +} + +#define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001) + +#define NTFSLOG_WRAPPED 0x00000001 +#define NTFSLOG_MULTIPLE_PAGE_IO 0x00000002 +#define NTFSLOG_NO_LAST_LSN 0x00000004 +#define NTFSLOG_REUSE_TAIL 0x00000010 +#define NTFSLOG_NO_OLDEST_LSN 0x00000020 + +/* Helper struct to work with NTFS $LogFile. */ +struct ntfs_log { + struct ntfs_inode *ni; + + u32 l_size; + u32 sys_page_size; + u32 sys_page_mask; + u32 page_size; + u32 page_mask; // page_size - 1 + u8 page_bits; + struct RECORD_PAGE_HDR *one_page_buf; + + struct RESTART_TABLE *open_attr_tbl; + u32 transaction_id; + u32 clst_per_page; + + u32 first_page; + u32 next_page; + u32 ra_off; + u32 data_off; + u32 restart_size; + u32 data_size; + u16 record_header_len; + u64 seq_num; + u32 seq_num_bits; + u32 file_data_bits; + u32 seq_num_mask; /* (1 << file_data_bits) - 1 */ + + struct RESTART_AREA *ra; /* In-memory image of the next restart area. */ + u32 ra_size; /* The usable size of the restart area. */ + + /* + * If true, then the in-memory restart area is to be written + * to the first position on the disk. + */ + bool init_ra; + bool set_dirty; /* True if we need to set dirty flag. */ + + u64 oldest_lsn; + + u32 oldest_lsn_off; + u64 last_lsn; + + u32 total_avail; + u32 total_avail_pages; + u32 total_undo_commit; + u32 max_current_avail; + u32 current_avail; + u32 reserved; + + short major_ver; + short minor_ver; + + u32 l_flags; /* See NTFSLOG_XXX */ + u32 current_openlog_count; /* On-disk value for open_log_count. */ + + struct CLIENT_ID client_id; + u32 client_undo_commit; +}; + +static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn) +{ + u32 vbo = (lsn << log->seq_num_bits) >> (log->seq_num_bits - 3); + + return vbo; +} + +/* Compute the offset in the log file of the next log page. */ +static inline u32 next_page_off(struct ntfs_log *log, u32 off) +{ + off = (off & ~log->sys_page_mask) + log->page_size; + return off >= log->l_size ? log->first_page : off; +} + +static inline u32 lsn_to_page_off(struct ntfs_log *log, u64 lsn) +{ + return (((u32)lsn) << 3) & log->page_mask; +} + +static inline u64 vbo_to_lsn(struct ntfs_log *log, u32 off, u64 Seq) +{ + return (off >> 3) + (Seq << log->file_data_bits); +} + +static inline bool is_lsn_in_file(struct ntfs_log *log, u64 lsn) +{ + return lsn >= log->oldest_lsn && + lsn <= le64_to_cpu(log->ra->current_lsn); +} + +static inline u32 hdr_file_off(struct ntfs_log *log, + struct RECORD_PAGE_HDR *hdr) +{ + if (log->major_ver < 2) + return le64_to_cpu(hdr->rhdr.lsn); + + return le32_to_cpu(hdr->file_off); +} + +static inline u64 base_lsn(struct ntfs_log *log, + const struct RECORD_PAGE_HDR *hdr, u64 lsn) +{ + u64 h_lsn = le64_to_cpu(hdr->rhdr.lsn); + u64 ret = (((h_lsn >> log->file_data_bits) + + (lsn < (lsn_to_vbo(log, h_lsn) & ~log->page_mask) ? 1 : 0)) + << log->file_data_bits) + + ((((is_log_record_end(hdr) && + h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) + ? le16_to_cpu(hdr->record_hdr.next_record_off) + : log->page_size) + + lsn) >> + 3); + + return ret; +} + +static inline bool verify_client_lsn(struct ntfs_log *log, + const struct CLIENT_REC *client, u64 lsn) +{ + return lsn >= le64_to_cpu(client->oldest_lsn) && + lsn <= le64_to_cpu(log->ra->current_lsn) && lsn; +} + +struct restart_info { + u64 last_lsn; + struct RESTART_HDR *r_page; + u32 vbo; + bool chkdsk_was_run; + bool valid_page; + bool initialized; + bool restart; +}; + +static int read_log_page(struct ntfs_log *log, u32 vbo, + struct RECORD_PAGE_HDR **buffer, bool *usa_error) +{ + int err = 0; + u32 page_idx = vbo >> log->page_bits; + u32 page_off = vbo & log->page_mask; + u32 bytes = log->page_size - page_off; + void *to_free = NULL; + u32 page_vbo = page_idx << log->page_bits; + struct RECORD_PAGE_HDR *page_buf; + struct ntfs_inode *ni = log->ni; + bool bBAAD; + + if (vbo >= log->l_size) + return -EINVAL; + + if (!*buffer) { + to_free = kmalloc(log->page_size, GFP_NOFS); + if (!to_free) + return -ENOMEM; + *buffer = to_free; + } + + page_buf = page_off ? log->one_page_buf : *buffer; + + err = ntfs_read_run_nb(ni->mi.sbi, &ni->file.run, page_vbo, page_buf, + log->page_size, NULL); + if (err) + goto out; + + if (page_buf->rhdr.sign != NTFS_FFFF_SIGNATURE) + ntfs_fix_post_read(&page_buf->rhdr, PAGE_SIZE, false); + + if (page_buf != *buffer) + memcpy(*buffer, Add2Ptr(page_buf, page_off), bytes); + + bBAAD = page_buf->rhdr.sign == NTFS_BAAD_SIGNATURE; + + if (usa_error) + *usa_error = bBAAD; + /* Check that the update sequence array for this page is valid */ + /* If we don't allow errors, raise an error status */ + else if (bBAAD) + err = -EINVAL; + +out: + if (err && to_free) { + kfree(to_free); + *buffer = NULL; + } + + return err; +} + +/* + * log_read_rst + * + * It walks through 512 blocks of the file looking for a valid + * restart page header. It will stop the first time we find a + * valid page header. + */ +static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, + struct restart_info *info) +{ + u32 skip, vbo; + struct RESTART_HDR *r_page = NULL; + + /* Determine which restart area we are looking for. */ + if (first) { + vbo = 0; + skip = 512; + } else { + vbo = 512; + skip = 0; + } + + /* Loop continuously until we succeed. */ + for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) { + bool usa_error; + bool brst, bchk; + struct RESTART_AREA *ra; + + /* Read a page header at the current offset. */ + if (read_log_page(log, vbo, (struct RECORD_PAGE_HDR **)&r_page, + &usa_error)) { + /* Ignore any errors. */ + continue; + } + + /* Exit if the signature is a log record page. */ + if (r_page->rhdr.sign == NTFS_RCRD_SIGNATURE) { + info->initialized = true; + break; + } + + brst = r_page->rhdr.sign == NTFS_RSTR_SIGNATURE; + bchk = r_page->rhdr.sign == NTFS_CHKD_SIGNATURE; + + if (!bchk && !brst) { + if (r_page->rhdr.sign != NTFS_FFFF_SIGNATURE) { + /* + * Remember if the signature does not + * indicate uninitialized file. + */ + info->initialized = true; + } + continue; + } + + ra = NULL; + info->valid_page = false; + info->initialized = true; + info->vbo = vbo; + + /* Let's check the restart area if this is a valid page. */ + if (!is_rst_page_hdr_valid(vbo, r_page)) + goto check_result; + ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off)); + + if (!is_rst_area_valid(r_page)) + goto check_result; + + /* + * We have a valid restart page header and restart area. + * If chkdsk was run or we have no clients then we have + * no more checking to do. + */ + if (bchk || ra->client_idx[1] == LFS_NO_CLIENT_LE) { + info->valid_page = true; + goto check_result; + } + + if (is_client_area_valid(r_page, usa_error)) { + info->valid_page = true; + ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off)); + } + +check_result: + /* + * If chkdsk was run then update the caller's + * values and return. + */ + if (r_page->rhdr.sign == NTFS_CHKD_SIGNATURE) { + info->chkdsk_was_run = true; + info->last_lsn = le64_to_cpu(r_page->rhdr.lsn); + info->restart = true; + info->r_page = r_page; + return 0; + } + + /* + * If we have a valid page then copy the values + * we need from it. + */ + if (info->valid_page) { + info->last_lsn = le64_to_cpu(ra->current_lsn); + info->restart = true; + info->r_page = r_page; + return 0; + } + } + + kfree(r_page); + + return 0; +} + +/* + * Ilog_init_pg_hdr - Init @log from restart page header. + */ +static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, + u32 page_size, u16 major_ver, u16 minor_ver) +{ + log->sys_page_size = sys_page_size; + log->sys_page_mask = sys_page_size - 1; + log->page_size = page_size; + log->page_mask = page_size - 1; + log->page_bits = blksize_bits(page_size); + + log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits; + if (!log->clst_per_page) + log->clst_per_page = 1; + + log->first_page = major_ver >= 2 + ? 0x22 * page_size + : ((sys_page_size << 1) + (page_size << 1)); + log->major_ver = major_ver; + log->minor_ver = minor_ver; +} + +/* + * log_create - Init @log in cases when we don't have a restart area to use. + */ +static void log_create(struct ntfs_log *log, u32 l_size, const u64 last_lsn, + u32 open_log_count, bool wrapped, bool use_multi_page) +{ + log->l_size = l_size; + /* All file offsets must be quadword aligned. */ + log->file_data_bits = blksize_bits(l_size) - 3; + log->seq_num_mask = (8 << log->file_data_bits) - 1; + log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits; + log->seq_num = (last_lsn >> log->file_data_bits) + 2; + log->next_page = log->first_page; + log->oldest_lsn = log->seq_num << log->file_data_bits; + log->oldest_lsn_off = 0; + log->last_lsn = log->oldest_lsn; + + log->l_flags |= NTFSLOG_NO_LAST_LSN | NTFSLOG_NO_OLDEST_LSN; + + /* Set the correct flags for the I/O and indicate if we have wrapped. */ + if (wrapped) + log->l_flags |= NTFSLOG_WRAPPED; + + if (use_multi_page) + log->l_flags |= NTFSLOG_MULTIPLE_PAGE_IO; + + /* Compute the log page values. */ + log->data_off = ALIGN( + offsetof(struct RECORD_PAGE_HDR, fixups) + + sizeof(short) * ((log->page_size >> SECTOR_SHIFT) + 1), + 8); + log->data_size = log->page_size - log->data_off; + log->record_header_len = sizeof(struct LFS_RECORD_HDR); + + /* Remember the different page sizes for reservation. */ + log->reserved = log->data_size - log->record_header_len; + + /* Compute the restart page values. */ + log->ra_off = ALIGN( + offsetof(struct RESTART_HDR, fixups) + + sizeof(short) * + ((log->sys_page_size >> SECTOR_SHIFT) + 1), + 8); + log->restart_size = log->sys_page_size - log->ra_off; + log->ra_size = struct_size(log->ra, clients, 1); + log->current_openlog_count = open_log_count; + + /* + * The total available log file space is the number of + * log file pages times the space available on each page. + */ + log->total_avail_pages = log->l_size - log->first_page; + log->total_avail = log->total_avail_pages >> log->page_bits; + + /* + * We assume that we can't use the end of the page less than + * the file record size. + * Then we won't need to reserve more than the caller asks for. + */ + log->max_current_avail = log->total_avail * log->reserved; + log->total_avail = log->total_avail * log->data_size; + log->current_avail = log->max_current_avail; +} + +/* + * log_create_ra - Fill a restart area from the values stored in @log. + */ +static struct RESTART_AREA *log_create_ra(struct ntfs_log *log) +{ + struct CLIENT_REC *cr; + struct RESTART_AREA *ra = kzalloc(log->restart_size, GFP_NOFS); + + if (!ra) + return NULL; + + ra->current_lsn = cpu_to_le64(log->last_lsn); + ra->log_clients = cpu_to_le16(1); + ra->client_idx[1] = LFS_NO_CLIENT_LE; + if (log->l_flags & NTFSLOG_MULTIPLE_PAGE_IO) + ra->flags = RESTART_SINGLE_PAGE_IO; + ra->seq_num_bits = cpu_to_le32(log->seq_num_bits); + ra->ra_len = cpu_to_le16(log->ra_size); + ra->client_off = cpu_to_le16(offsetof(struct RESTART_AREA, clients)); + ra->l_size = cpu_to_le64(log->l_size); + ra->rec_hdr_len = cpu_to_le16(log->record_header_len); + ra->data_off = cpu_to_le16(log->data_off); + ra->open_log_count = cpu_to_le32(log->current_openlog_count + 1); + + cr = ra->clients; + + cr->prev_client = LFS_NO_CLIENT_LE; + cr->next_client = LFS_NO_CLIENT_LE; + + return ra; +} + +static u32 final_log_off(struct ntfs_log *log, u64 lsn, u32 data_len) +{ + u32 base_vbo = lsn << 3; + u32 final_log_off = (base_vbo & log->seq_num_mask) & ~log->page_mask; + u32 page_off = base_vbo & log->page_mask; + u32 tail = log->page_size - page_off; + + page_off -= 1; + + /* Add the length of the header. */ + data_len += log->record_header_len; + + /* + * If this lsn is contained this log page we are done. + * Otherwise we need to walk through several log pages. + */ + if (data_len > tail) { + data_len -= tail; + tail = log->data_size; + page_off = log->data_off - 1; + + for (;;) { + final_log_off = next_page_off(log, final_log_off); + + /* + * We are done if the remaining bytes + * fit on this page. + */ + if (data_len <= tail) + break; + data_len -= tail; + } + } + + /* + * We add the remaining bytes to our starting position on this page + * and then add that value to the file offset of this log page. + */ + return final_log_off + data_len + page_off; +} + +static int next_log_lsn(struct ntfs_log *log, const struct LFS_RECORD_HDR *rh, + u64 *lsn) +{ + int err; + u64 this_lsn = le64_to_cpu(rh->this_lsn); + u32 vbo = lsn_to_vbo(log, this_lsn); + u32 end = + final_log_off(log, this_lsn, le32_to_cpu(rh->client_data_len)); + u32 hdr_off = end & ~log->sys_page_mask; + u64 seq = this_lsn >> log->file_data_bits; + struct RECORD_PAGE_HDR *page = NULL; + + /* Remember if we wrapped. */ + if (end <= vbo) + seq += 1; + + /* Log page header for this page. */ + err = read_log_page(log, hdr_off, &page, NULL); + if (err) + return err; + + /* + * If the lsn we were given was not the last lsn on this page, + * then the starting offset for the next lsn is on a quad word + * boundary following the last file offset for the current lsn. + * Otherwise the file offset is the start of the data on the next page. + */ + if (this_lsn == le64_to_cpu(page->rhdr.lsn)) { + /* If we wrapped, we need to increment the sequence number. */ + hdr_off = next_page_off(log, hdr_off); + if (hdr_off == log->first_page) + seq += 1; + + vbo = hdr_off + log->data_off; + } else { + vbo = ALIGN(end, 8); + } + + /* Compute the lsn based on the file offset and the sequence count. */ + *lsn = vbo_to_lsn(log, vbo, seq); + + /* + * If this lsn is within the legal range for the file, we return true. + * Otherwise false indicates that there are no more lsn's. + */ + if (!is_lsn_in_file(log, *lsn)) + *lsn = 0; + + kfree(page); + + return 0; +} + +/* + * current_log_avail - Calculate the number of bytes available for log records. + */ +static u32 current_log_avail(struct ntfs_log *log) +{ + u32 oldest_off, next_free_off, free_bytes; + + if (log->l_flags & NTFSLOG_NO_LAST_LSN) { + /* The entire file is available. */ + return log->max_current_avail; + } + + /* + * If there is a last lsn the restart area then we know that we will + * have to compute the free range. + * If there is no oldest lsn then start at the first page of the file. + */ + oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) + ? log->first_page + : (log->oldest_lsn_off & ~log->sys_page_mask); + + /* + * We will use the next log page offset to compute the next free page. + * If we are going to reuse this page go to the next page. + * If we are at the first page then use the end of the file. + */ + next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) + ? log->next_page + log->page_size + : log->next_page == log->first_page + ? log->l_size + : log->next_page; + + /* If the two offsets are the same then there is no available space. */ + if (oldest_off == next_free_off) + return 0; + /* + * If the free offset follows the oldest offset then subtract + * this range from the total available pages. + */ + free_bytes = + oldest_off < next_free_off + ? log->total_avail_pages - (next_free_off - oldest_off) + : oldest_off - next_free_off; + + free_bytes >>= log->page_bits; + return free_bytes * log->reserved; +} + +static bool check_subseq_log_page(struct ntfs_log *log, + const struct RECORD_PAGE_HDR *rp, u32 vbo, + u64 seq) +{ + u64 lsn_seq; + const struct NTFS_RECORD_HEADER *rhdr = &rp->rhdr; + u64 lsn = le64_to_cpu(rhdr->lsn); + + if (rhdr->sign == NTFS_FFFF_SIGNATURE || !rhdr->sign) + return false; + + /* + * If the last lsn on the page occurs was written after the page + * that caused the original error then we have a fatal error. + */ + lsn_seq = lsn >> log->file_data_bits; + + /* + * If the sequence number for the lsn the page is equal or greater + * than lsn we expect, then this is a subsequent write. + */ + return lsn_seq >= seq || + (lsn_seq == seq - 1 && log->first_page == vbo && + vbo != (lsn_to_vbo(log, lsn) & ~log->page_mask)); +} + +/* + * last_log_lsn + * + * Walks through the log pages for a file, searching for the + * last log page written to the file. + */ +static int last_log_lsn(struct ntfs_log *log) +{ + int err; + bool usa_error = false; + bool replace_page = false; + bool reuse_page = log->l_flags & NTFSLOG_REUSE_TAIL; + bool wrapped_file, wrapped; + + u32 page_cnt = 1, page_pos = 1; + u32 page_off = 0, page_off1 = 0, saved_off = 0; + u32 final_off, second_off, final_off_prev = 0, second_off_prev = 0; + u32 first_file_off = 0, second_file_off = 0; + u32 part_io_count = 0; + u32 tails = 0; + u32 this_off, curpage_off, nextpage_off, remain_pages; + + u64 expected_seq, seq_base = 0, lsn_base = 0; + u64 best_lsn, best_lsn1, best_lsn2; + u64 lsn_cur, lsn1, lsn2; + u64 last_ok_lsn = reuse_page ? log->last_lsn : 0; + + u16 cur_pos, best_page_pos; + + struct RECORD_PAGE_HDR *page = NULL; + struct RECORD_PAGE_HDR *tst_page = NULL; + struct RECORD_PAGE_HDR *first_tail = NULL; + struct RECORD_PAGE_HDR *second_tail = NULL; + struct RECORD_PAGE_HDR *tail_page = NULL; + struct RECORD_PAGE_HDR *second_tail_prev = NULL; + struct RECORD_PAGE_HDR *first_tail_prev = NULL; + struct RECORD_PAGE_HDR *page_bufs = NULL; + struct RECORD_PAGE_HDR *best_page; + + if (log->major_ver >= 2) { + final_off = 0x02 * log->page_size; + second_off = 0x12 * log->page_size; + + // 0x10 == 0x12 - 0x2 + page_bufs = kmalloc(log->page_size * 0x10, GFP_NOFS); + if (!page_bufs) + return -ENOMEM; + } else { + second_off = log->first_page - log->page_size; + final_off = second_off - log->page_size; + } + +next_tail: + /* Read second tail page (at pos 3/0x12000). */ + if (read_log_page(log, second_off, &second_tail, &usa_error) || + usa_error || second_tail->rhdr.sign != NTFS_RCRD_SIGNATURE) { + kfree(second_tail); + second_tail = NULL; + second_file_off = 0; + lsn2 = 0; + } else { + second_file_off = hdr_file_off(log, second_tail); + lsn2 = le64_to_cpu(second_tail->record_hdr.last_end_lsn); + } + + /* Read first tail page (at pos 2/0x2000). */ + if (read_log_page(log, final_off, &first_tail, &usa_error) || + usa_error || first_tail->rhdr.sign != NTFS_RCRD_SIGNATURE) { + kfree(first_tail); + first_tail = NULL; + first_file_off = 0; + lsn1 = 0; + } else { + first_file_off = hdr_file_off(log, first_tail); + lsn1 = le64_to_cpu(first_tail->record_hdr.last_end_lsn); + } + + if (log->major_ver < 2) { + int best_page; + + first_tail_prev = first_tail; + final_off_prev = first_file_off; + second_tail_prev = second_tail; + second_off_prev = second_file_off; + tails = 1; + + if (!first_tail && !second_tail) + goto tail_read; + + if (first_tail && second_tail) + best_page = lsn1 < lsn2 ? 1 : 0; + else if (first_tail) + best_page = 0; + else + best_page = 1; + + page_off = best_page ? second_file_off : first_file_off; + seq_base = (best_page ? lsn2 : lsn1) >> log->file_data_bits; + goto tail_read; + } + + best_lsn1 = first_tail ? base_lsn(log, first_tail, first_file_off) : 0; + best_lsn2 = + second_tail ? base_lsn(log, second_tail, second_file_off) : 0; + + if (first_tail && second_tail) { + if (best_lsn1 > best_lsn2) { + best_lsn = best_lsn1; + best_page = first_tail; + this_off = first_file_off; + } else { + best_lsn = best_lsn2; + best_page = second_tail; + this_off = second_file_off; + } + } else if (first_tail) { + best_lsn = best_lsn1; + best_page = first_tail; + this_off = first_file_off; + } else if (second_tail) { + best_lsn = best_lsn2; + best_page = second_tail; + this_off = second_file_off; + } else { + goto tail_read; + } + + best_page_pos = le16_to_cpu(best_page->page_pos); + + if (!tails) { + if (best_page_pos == page_pos) { + seq_base = best_lsn >> log->file_data_bits; + saved_off = page_off = le32_to_cpu(best_page->file_off); + lsn_base = best_lsn; + + memmove(page_bufs, best_page, log->page_size); + + page_cnt = le16_to_cpu(best_page->page_count); + if (page_cnt > 1) + page_pos += 1; + + tails = 1; + } + } else if (seq_base == (best_lsn >> log->file_data_bits) && + saved_off + log->page_size == this_off && + lsn_base < best_lsn && + (page_pos != page_cnt || best_page_pos == page_pos || + best_page_pos == 1) && + (page_pos >= page_cnt || best_page_pos == page_pos)) { + u16 bppc = le16_to_cpu(best_page->page_count); + + saved_off += log->page_size; + lsn_base = best_lsn; + + memmove(Add2Ptr(page_bufs, tails * log->page_size), best_page, + log->page_size); + + tails += 1; + + if (best_page_pos != bppc) { + page_cnt = bppc; + page_pos = best_page_pos; + + if (page_cnt > 1) + page_pos += 1; + } else { + page_pos = page_cnt = 1; + } + } else { + kfree(first_tail); + kfree(second_tail); + goto tail_read; + } + + kfree(first_tail_prev); + first_tail_prev = first_tail; + final_off_prev = first_file_off; + first_tail = NULL; + + kfree(second_tail_prev); + second_tail_prev = second_tail; + second_off_prev = second_file_off; + second_tail = NULL; + + final_off += log->page_size; + second_off += log->page_size; + + if (tails < 0x10) + goto next_tail; +tail_read: + first_tail = first_tail_prev; + final_off = final_off_prev; + + second_tail = second_tail_prev; + second_off = second_off_prev; + + page_cnt = page_pos = 1; + + curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) + : log->next_page; + + wrapped_file = + curpage_off == log->first_page && + !(log->l_flags & (NTFSLOG_NO_LAST_LSN | NTFSLOG_REUSE_TAIL)); + + expected_seq = wrapped_file ? (log->seq_num + 1) : log->seq_num; + + nextpage_off = curpage_off; + +next_page: + tail_page = NULL; + /* Read the next log page. */ + err = read_log_page(log, curpage_off, &page, &usa_error); + + /* Compute the next log page offset the file. */ + nextpage_off = next_page_off(log, curpage_off); + wrapped = nextpage_off == log->first_page; + + if (tails > 1) { + struct RECORD_PAGE_HDR *cur_page = + Add2Ptr(page_bufs, curpage_off - page_off); + + if (curpage_off == saved_off) { + tail_page = cur_page; + goto use_tail_page; + } + + if (page_off > curpage_off || curpage_off >= saved_off) + goto use_tail_page; + + if (page_off1) + goto use_cur_page; + + if (!err && !usa_error && + page->rhdr.sign == NTFS_RCRD_SIGNATURE && + cur_page->rhdr.lsn == page->rhdr.lsn && + cur_page->record_hdr.next_record_off == + page->record_hdr.next_record_off && + ((page_pos == page_cnt && + le16_to_cpu(page->page_pos) == 1) || + (page_pos != page_cnt && + le16_to_cpu(page->page_pos) == page_pos + 1 && + le16_to_cpu(page->page_count) == page_cnt))) { + cur_page = NULL; + goto use_tail_page; + } + + page_off1 = page_off; + +use_cur_page: + + lsn_cur = le64_to_cpu(cur_page->rhdr.lsn); + + if (last_ok_lsn != + le64_to_cpu(cur_page->record_hdr.last_end_lsn) && + ((lsn_cur >> log->file_data_bits) + + ((curpage_off < + (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) + ? 1 + : 0)) != expected_seq) { + goto check_tail; + } + + if (!is_log_record_end(cur_page)) { + tail_page = NULL; + last_ok_lsn = lsn_cur; + goto next_page_1; + } + + log->seq_num = expected_seq; + log->l_flags &= ~NTFSLOG_NO_LAST_LSN; + log->last_lsn = le64_to_cpu(cur_page->record_hdr.last_end_lsn); + log->ra->current_lsn = cur_page->record_hdr.last_end_lsn; + + if (log->record_header_len <= + log->page_size - + le16_to_cpu(cur_page->record_hdr.next_record_off)) { + log->l_flags |= NTFSLOG_REUSE_TAIL; + log->next_page = curpage_off; + } else { + log->l_flags &= ~NTFSLOG_REUSE_TAIL; + log->next_page = nextpage_off; + } + + if (wrapped_file) + log->l_flags |= NTFSLOG_WRAPPED; + + last_ok_lsn = le64_to_cpu(cur_page->record_hdr.last_end_lsn); + goto next_page_1; + } + + /* + * If we are at the expected first page of a transfer check to see + * if either tail copy is at this offset. + * If this page is the last page of a transfer, check if we wrote + * a subsequent tail copy. + */ + if (page_cnt == page_pos || page_cnt == page_pos + 1) { + /* + * Check if the offset matches either the first or second + * tail copy. It is possible it will match both. + */ + if (curpage_off == final_off) + tail_page = first_tail; + + /* + * If we already matched on the first page then + * check the ending lsn's. + */ + if (curpage_off == second_off) { + if (!tail_page || + (second_tail && + le64_to_cpu(second_tail->record_hdr.last_end_lsn) > + le64_to_cpu(first_tail->record_hdr + .last_end_lsn))) { + tail_page = second_tail; + } + } + } + +use_tail_page: + if (tail_page) { + /* We have a candidate for a tail copy. */ + lsn_cur = le64_to_cpu(tail_page->record_hdr.last_end_lsn); + + if (last_ok_lsn < lsn_cur) { + /* + * If the sequence number is not expected, + * then don't use the tail copy. + */ + if (expected_seq != (lsn_cur >> log->file_data_bits)) + tail_page = NULL; + } else if (last_ok_lsn > lsn_cur) { + /* + * If the last lsn is greater than the one on + * this page then forget this tail. + */ + tail_page = NULL; + } + } + + /* + *If we have an error on the current page, + * we will break of this loop. + */ + if (err || usa_error) + goto check_tail; + + /* + * Done if the last lsn on this page doesn't match the previous known + * last lsn or the sequence number is not expected. + */ + lsn_cur = le64_to_cpu(page->rhdr.lsn); + if (last_ok_lsn != lsn_cur && + expected_seq != (lsn_cur >> log->file_data_bits)) { + goto check_tail; + } + + /* + * Check that the page position and page count values are correct. + * If this is the first page of a transfer the position must be 1 + * and the count will be unknown. + */ + if (page_cnt == page_pos) { + if (page->page_pos != cpu_to_le16(1) && + (!reuse_page || page->page_pos != page->page_count)) { + /* + * If the current page is the first page we are + * looking at and we are reusing this page then + * it can be either the first or last page of a + * transfer. Otherwise it can only be the first. + */ + goto check_tail; + } + } else if (le16_to_cpu(page->page_count) != page_cnt || + le16_to_cpu(page->page_pos) != page_pos + 1) { + /* + * The page position better be 1 more than the last page + * position and the page count better match. + */ + goto check_tail; + } + + /* + * We have a valid page the file and may have a valid page + * the tail copy area. + * If the tail page was written after the page the file then + * break of the loop. + */ + if (tail_page && + le64_to_cpu(tail_page->record_hdr.last_end_lsn) > lsn_cur) { + /* Remember if we will replace the page. */ + replace_page = true; + goto check_tail; + } + + tail_page = NULL; + + if (is_log_record_end(page)) { + /* + * Since we have read this page we know the sequence number + * is the same as our expected value. + */ + log->seq_num = expected_seq; + log->last_lsn = le64_to_cpu(page->record_hdr.last_end_lsn); + log->ra->current_lsn = page->record_hdr.last_end_lsn; + log->l_flags &= ~NTFSLOG_NO_LAST_LSN; + + /* + * If there is room on this page for another header then + * remember we want to reuse the page. + */ + if (log->record_header_len <= + log->page_size - + le16_to_cpu(page->record_hdr.next_record_off)) { + log->l_flags |= NTFSLOG_REUSE_TAIL; + log->next_page = curpage_off; + } else { + log->l_flags &= ~NTFSLOG_REUSE_TAIL; + log->next_page = nextpage_off; + } + + /* Remember if we wrapped the log file. */ + if (wrapped_file) + log->l_flags |= NTFSLOG_WRAPPED; + } + + /* + * Remember the last page count and position. + * Also remember the last known lsn. + */ + page_cnt = le16_to_cpu(page->page_count); + page_pos = le16_to_cpu(page->page_pos); + last_ok_lsn = le64_to_cpu(page->rhdr.lsn); + +next_page_1: + + if (wrapped) { + expected_seq += 1; + wrapped_file = 1; + } + + curpage_off = nextpage_off; + kfree(page); + page = NULL; + reuse_page = 0; + goto next_page; + +check_tail: + if (tail_page) { + log->seq_num = expected_seq; + log->last_lsn = le64_to_cpu(tail_page->record_hdr.last_end_lsn); + log->ra->current_lsn = tail_page->record_hdr.last_end_lsn; + log->l_flags &= ~NTFSLOG_NO_LAST_LSN; + + if (log->page_size - + le16_to_cpu( + tail_page->record_hdr.next_record_off) >= + log->record_header_len) { + log->l_flags |= NTFSLOG_REUSE_TAIL; + log->next_page = curpage_off; + } else { + log->l_flags &= ~NTFSLOG_REUSE_TAIL; + log->next_page = nextpage_off; + } + + if (wrapped) + log->l_flags |= NTFSLOG_WRAPPED; + } + + /* Remember that the partial IO will start at the next page. */ + second_off = nextpage_off; + + /* + * If the next page is the first page of the file then update + * the sequence number for log records which begon the next page. + */ + if (wrapped) + expected_seq += 1; + + /* + * If we have a tail copy or are performing single page I/O we can + * immediately look at the next page. + */ + if (replace_page || (log->ra->flags & RESTART_SINGLE_PAGE_IO)) { + page_cnt = 2; + page_pos = 1; + goto check_valid; + } + + if (page_pos != page_cnt) + goto check_valid; + /* + * If the next page causes us to wrap to the beginning of the log + * file then we know which page to check next. + */ + if (wrapped) { + page_cnt = 2; + page_pos = 1; + goto check_valid; + } + + cur_pos = 2; + +next_test_page: + kfree(tst_page); + tst_page = NULL; + + /* Walk through the file, reading log pages. */ + err = read_log_page(log, nextpage_off, &tst_page, &usa_error); + + /* + * If we get a USA error then assume that we correctly found + * the end of the original transfer. + */ + if (usa_error) + goto file_is_valid; + + /* + * If we were able to read the page, we examine it to see if it + * is the same or different Io block. + */ + if (err) + goto next_test_page_1; + + if (le16_to_cpu(tst_page->page_pos) == cur_pos && + check_subseq_log_page(log, tst_page, nextpage_off, expected_seq)) { + page_cnt = le16_to_cpu(tst_page->page_count) + 1; + page_pos = le16_to_cpu(tst_page->page_pos); + goto check_valid; + } else { + goto file_is_valid; + } + +next_test_page_1: + + nextpage_off = next_page_off(log, curpage_off); + wrapped = nextpage_off == log->first_page; + + if (wrapped) { + expected_seq += 1; + page_cnt = 2; + page_pos = 1; + } + + cur_pos += 1; + part_io_count += 1; + if (!wrapped) + goto next_test_page; + +check_valid: + /* Skip over the remaining pages this transfer. */ + remain_pages = page_cnt - page_pos - 1; + part_io_count += remain_pages; + + while (remain_pages--) { + nextpage_off = next_page_off(log, curpage_off); + wrapped = nextpage_off == log->first_page; + + if (wrapped) + expected_seq += 1; + } + + /* Call our routine to check this log page. */ + kfree(tst_page); + tst_page = NULL; + + err = read_log_page(log, nextpage_off, &tst_page, &usa_error); + if (!err && !usa_error && + check_subseq_log_page(log, tst_page, nextpage_off, expected_seq)) { + err = -EINVAL; + goto out; + } + +file_is_valid: + + /* We have a valid file. */ + if (page_off1 || tail_page) { + struct RECORD_PAGE_HDR *tmp_page; + + if (sb_rdonly(log->ni->mi.sbi->sb)) { + err = -EROFS; + goto out; + } + + if (page_off1) { + tmp_page = Add2Ptr(page_bufs, page_off1 - page_off); + tails -= (page_off1 - page_off) / log->page_size; + if (!tail_page) + tails -= 1; + } else { + tmp_page = tail_page; + tails = 1; + } + + while (tails--) { + u64 off = hdr_file_off(log, tmp_page); + + if (!page) { + page = kmalloc(log->page_size, GFP_NOFS); + if (!page) { + err = -ENOMEM; + goto out; + } + } + + /* + * Correct page and copy the data from this page + * into it and flush it to disk. + */ + memcpy(page, tmp_page, log->page_size); + + /* Fill last flushed lsn value flush the page. */ + if (log->major_ver < 2) + page->rhdr.lsn = page->record_hdr.last_end_lsn; + else + page->file_off = 0; + + page->page_pos = page->page_count = cpu_to_le16(1); + + ntfs_fix_pre_write(&page->rhdr, log->page_size); + + err = ntfs_sb_write_run(log->ni->mi.sbi, + &log->ni->file.run, off, page, + log->page_size, 0); + + if (err) + goto out; + + if (part_io_count && second_off == off) { + second_off += log->page_size; + part_io_count -= 1; + } + + tmp_page = Add2Ptr(tmp_page, log->page_size); + } + } + + if (part_io_count) { + if (sb_rdonly(log->ni->mi.sbi->sb)) { + err = -EROFS; + goto out; + } + } + +out: + kfree(second_tail); + kfree(first_tail); + kfree(page); + kfree(tst_page); + kfree(page_bufs); + + return err; +} + +/* + * read_log_rec_buf - Copy a log record from the file to a buffer. + * + * The log record may span several log pages and may even wrap the file. + */ +static int read_log_rec_buf(struct ntfs_log *log, + const struct LFS_RECORD_HDR *rh, void *buffer) +{ + int err; + struct RECORD_PAGE_HDR *ph = NULL; + u64 lsn = le64_to_cpu(rh->this_lsn); + u32 vbo = lsn_to_vbo(log, lsn) & ~log->page_mask; + u32 off = lsn_to_page_off(log, lsn) + log->record_header_len; + u32 data_len = le32_to_cpu(rh->client_data_len); + + /* + * While there are more bytes to transfer, + * we continue to attempt to perform the read. + */ + for (;;) { + bool usa_error; + u32 tail = log->page_size - off; + + if (tail >= data_len) + tail = data_len; + + data_len -= tail; + + err = read_log_page(log, vbo, &ph, &usa_error); + if (err) + goto out; + + /* + * The last lsn on this page better be greater or equal + * to the lsn we are copying. + */ + if (lsn > le64_to_cpu(ph->rhdr.lsn)) { + err = -EINVAL; + goto out; + } + + memcpy(buffer, Add2Ptr(ph, off), tail); + + /* If there are no more bytes to transfer, we exit the loop. */ + if (!data_len) { + if (!is_log_record_end(ph) || + lsn > le64_to_cpu(ph->record_hdr.last_end_lsn)) { + err = -EINVAL; + goto out; + } + break; + } + + if (ph->rhdr.lsn == ph->record_hdr.last_end_lsn || + lsn > le64_to_cpu(ph->rhdr.lsn)) { + err = -EINVAL; + goto out; + } + + vbo = next_page_off(log, vbo); + off = log->data_off; + + /* + * Adjust our pointer the user's buffer to transfer + * the next block to. + */ + buffer = Add2Ptr(buffer, tail); + } + +out: + kfree(ph); + return err; +} + +static int read_rst_area(struct ntfs_log *log, struct NTFS_RESTART **rst_, + u64 *lsn) +{ + int err; + struct LFS_RECORD_HDR *rh = NULL; + const struct CLIENT_REC *cr = + Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)); + u64 lsnr, lsnc = le64_to_cpu(cr->restart_lsn); + u32 len; + struct NTFS_RESTART *rst; + + *lsn = 0; + *rst_ = NULL; + + /* If the client doesn't have a restart area, go ahead and exit now. */ + if (!lsnc) + return 0; + + err = read_log_page(log, lsn_to_vbo(log, lsnc), + (struct RECORD_PAGE_HDR **)&rh, NULL); + if (err) + return err; + + rst = NULL; + lsnr = le64_to_cpu(rh->this_lsn); + + if (lsnc != lsnr) { + /* If the lsn values don't match, then the disk is corrupt. */ + err = -EINVAL; + goto out; + } + + *lsn = lsnr; + len = le32_to_cpu(rh->client_data_len); + + if (!len) { + err = 0; + goto out; + } + + if (len < sizeof(struct NTFS_RESTART)) { + err = -EINVAL; + goto out; + } + + rst = kmalloc(len, GFP_NOFS); + if (!rst) { + err = -ENOMEM; + goto out; + } + + /* Copy the data into the 'rst' buffer. */ + err = read_log_rec_buf(log, rh, rst); + if (err) + goto out; + + *rst_ = rst; + rst = NULL; + +out: + kfree(rh); + kfree(rst); + + return err; +} + +static int find_log_rec(struct ntfs_log *log, u64 lsn, struct lcb *lcb) +{ + int err; + struct LFS_RECORD_HDR *rh = lcb->lrh; + u32 rec_len, len; + + /* Read the record header for this lsn. */ + if (!rh) { + err = read_log_page(log, lsn_to_vbo(log, lsn), + (struct RECORD_PAGE_HDR **)&rh, NULL); + + lcb->lrh = rh; + if (err) + return err; + } + + /* + * If the lsn the log record doesn't match the desired + * lsn then the disk is corrupt. + */ + if (lsn != le64_to_cpu(rh->this_lsn)) + return -EINVAL; + + len = le32_to_cpu(rh->client_data_len); + + /* + * Check that the length field isn't greater than the total + * available space the log file. + */ + rec_len = len + log->record_header_len; + if (rec_len >= log->total_avail) + return -EINVAL; + + /* + * If the entire log record is on this log page, + * put a pointer to the log record the context block. + */ + if (rh->flags & LOG_RECORD_MULTI_PAGE) { + void *lr = kmalloc(len, GFP_NOFS); + + if (!lr) + return -ENOMEM; + + lcb->log_rec = lr; + lcb->alloc = true; + + /* Copy the data into the buffer returned. */ + err = read_log_rec_buf(log, rh, lr); + if (err) + return err; + } else { + /* If beyond the end of the current page -> an error. */ + u32 page_off = lsn_to_page_off(log, lsn); + + if (page_off + len + log->record_header_len > log->page_size) + return -EINVAL; + + lcb->log_rec = Add2Ptr(rh, sizeof(struct LFS_RECORD_HDR)); + lcb->alloc = false; + } + + return 0; +} + +/* + * read_log_rec_lcb - Init the query operation. + */ +static int read_log_rec_lcb(struct ntfs_log *log, u64 lsn, u32 ctx_mode, + struct lcb **lcb_) +{ + int err; + const struct CLIENT_REC *cr; + struct lcb *lcb; + + switch (ctx_mode) { + case lcb_ctx_undo_next: + case lcb_ctx_prev: + case lcb_ctx_next: + break; + default: + return -EINVAL; + } + + /* Check that the given lsn is the legal range for this client. */ + cr = Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)); + + if (!verify_client_lsn(log, cr, lsn)) + return -EINVAL; + + lcb = kzalloc(sizeof(struct lcb), GFP_NOFS); + if (!lcb) + return -ENOMEM; + lcb->client = log->client_id; + lcb->ctx_mode = ctx_mode; + + /* Find the log record indicated by the given lsn. */ + err = find_log_rec(log, lsn, lcb); + if (err) + goto out; + + *lcb_ = lcb; + return 0; + +out: + lcb_put(lcb); + *lcb_ = NULL; + return err; +} + +/* + * find_client_next_lsn + * + * Attempt to find the next lsn to return to a client based on the context mode. + */ +static int find_client_next_lsn(struct ntfs_log *log, struct lcb *lcb, u64 *lsn) +{ + int err; + u64 next_lsn; + struct LFS_RECORD_HDR *hdr; + + hdr = lcb->lrh; + *lsn = 0; + + if (lcb_ctx_next != lcb->ctx_mode) + goto check_undo_next; + + /* Loop as long as another lsn can be found. */ + for (;;) { + u64 current_lsn; + + err = next_log_lsn(log, hdr, ¤t_lsn); + if (err) + goto out; + + if (!current_lsn) + break; + + if (hdr != lcb->lrh) + kfree(hdr); + + hdr = NULL; + err = read_log_page(log, lsn_to_vbo(log, current_lsn), + (struct RECORD_PAGE_HDR **)&hdr, NULL); + if (err) + goto out; + + if (memcmp(&hdr->client, &lcb->client, + sizeof(struct CLIENT_ID))) { + /*err = -EINVAL; */ + } else if (LfsClientRecord == hdr->record_type) { + kfree(lcb->lrh); + lcb->lrh = hdr; + *lsn = current_lsn; + return 0; + } + } + +out: + if (hdr != lcb->lrh) + kfree(hdr); + return err; + +check_undo_next: + if (lcb_ctx_undo_next == lcb->ctx_mode) + next_lsn = le64_to_cpu(hdr->client_undo_next_lsn); + else if (lcb_ctx_prev == lcb->ctx_mode) + next_lsn = le64_to_cpu(hdr->client_prev_lsn); + else + return 0; + + if (!next_lsn) + return 0; + + if (!verify_client_lsn( + log, Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)), + next_lsn)) + return 0; + + hdr = NULL; + err = read_log_page(log, lsn_to_vbo(log, next_lsn), + (struct RECORD_PAGE_HDR **)&hdr, NULL); + if (err) + return err; + kfree(lcb->lrh); + lcb->lrh = hdr; + + *lsn = next_lsn; + + return 0; +} + +static int read_next_log_rec(struct ntfs_log *log, struct lcb *lcb, u64 *lsn) +{ + int err; + + err = find_client_next_lsn(log, lcb, lsn); + if (err) + return err; + + if (!*lsn) + return 0; + + if (lcb->alloc) + kfree(lcb->log_rec); + + lcb->log_rec = NULL; + lcb->alloc = false; + kfree(lcb->lrh); + lcb->lrh = NULL; + + return find_log_rec(log, *lsn, lcb); +} + +bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes) +{ + __le16 mask; + u32 min_de, de_off, used, total; + const struct NTFS_DE *e; + + if (hdr_has_subnode(hdr)) { + min_de = sizeof(struct NTFS_DE) + sizeof(u64); + mask = NTFS_IE_HAS_SUBNODES; + } else { + min_de = sizeof(struct NTFS_DE); + mask = 0; + } + + de_off = le32_to_cpu(hdr->de_off); + used = le32_to_cpu(hdr->used); + total = le32_to_cpu(hdr->total); + + if (de_off > bytes - min_de || used > bytes || total > bytes || + de_off + min_de > used || used > total) { + return false; + } + + e = Add2Ptr(hdr, de_off); + for (;;) { + u16 esize = le16_to_cpu(e->size); + struct NTFS_DE *next = Add2Ptr(e, esize); + + if (esize < min_de || PtrOffset(hdr, next) > used || + (e->flags & NTFS_IE_HAS_SUBNODES) != mask) { + return false; + } + + if (de_is_last(e)) + break; + + e = next; + } + + return true; +} + +static inline bool check_index_buffer(const struct INDEX_BUFFER *ib, u32 bytes) +{ + u16 fo; + const struct NTFS_RECORD_HEADER *r = &ib->rhdr; + + if (r->sign != NTFS_INDX_SIGNATURE) + return false; + + fo = (SECTOR_SIZE - ((bytes >> SECTOR_SHIFT) + 1) * sizeof(short)); + + if (le16_to_cpu(r->fix_off) > fo) + return false; + + if ((le16_to_cpu(r->fix_num) - 1) * SECTOR_SIZE != bytes) + return false; + + return check_index_header(&ib->ihdr, + bytes - offsetof(struct INDEX_BUFFER, ihdr)); +} + +static inline bool check_index_root(const struct ATTRIB *attr, + struct ntfs_sb_info *sbi) +{ + bool ret; + const struct INDEX_ROOT *root = resident_data(attr); + u8 index_bits = le32_to_cpu(root->index_block_size) >= sbi->cluster_size + ? sbi->cluster_bits + : SECTOR_SHIFT; + u8 block_clst = root->index_block_clst; + + if (le32_to_cpu(attr->res.data_size) < sizeof(struct INDEX_ROOT) || + (root->type != ATTR_NAME && root->type != ATTR_ZERO) || + (root->type == ATTR_NAME && + root->rule != NTFS_COLLATION_TYPE_FILENAME) || + (le32_to_cpu(root->index_block_size) != + (block_clst << index_bits)) || + (block_clst != 1 && block_clst != 2 && block_clst != 4 && + block_clst != 8 && block_clst != 0x10 && block_clst != 0x20 && + block_clst != 0x40 && block_clst != 0x80)) { + return false; + } + + ret = check_index_header(&root->ihdr, + le32_to_cpu(attr->res.data_size) - + offsetof(struct INDEX_ROOT, ihdr)); + return ret; +} + +static inline bool check_attr(const struct MFT_REC *rec, + const struct ATTRIB *attr, + struct ntfs_sb_info *sbi) +{ + u32 asize = le32_to_cpu(attr->size); + u32 rsize = 0; + u64 dsize, svcn, evcn; + u16 run_off; + + /* Check the fixed part of the attribute record header. */ + if (asize >= sbi->record_size || + asize + PtrOffset(rec, attr) >= sbi->record_size || + (attr->name_len && + le16_to_cpu(attr->name_off) + attr->name_len * sizeof(short) > + asize)) { + return false; + } + + /* Check the attribute fields. */ + switch (attr->non_res) { + case 0: + rsize = le32_to_cpu(attr->res.data_size); + if (rsize >= asize || + le16_to_cpu(attr->res.data_off) + rsize > asize) { + return false; + } + break; + + case 1: + dsize = le64_to_cpu(attr->nres.data_size); + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + run_off = le16_to_cpu(attr->nres.run_off); + + if (svcn > evcn + 1 || run_off >= asize || + le64_to_cpu(attr->nres.valid_size) > dsize || + dsize > le64_to_cpu(attr->nres.alloc_size)) { + return false; + } + + if (run_off > asize) + return false; + + if (run_unpack(NULL, sbi, 0, svcn, evcn, svcn, + Add2Ptr(attr, run_off), asize - run_off) < 0) { + return false; + } + + return true; + + default: + return false; + } + + switch (attr->type) { + case ATTR_NAME: + if (fname_full_size(Add2Ptr( + attr, le16_to_cpu(attr->res.data_off))) > asize) { + return false; + } + break; + + case ATTR_ROOT: + return check_index_root(attr, sbi); + + case ATTR_STD: + if (rsize < sizeof(struct ATTR_STD_INFO5) && + rsize != sizeof(struct ATTR_STD_INFO)) { + return false; + } + break; + + case ATTR_LIST: + case ATTR_ID: + case ATTR_SECURE: + case ATTR_LABEL: + case ATTR_VOL_INFO: + case ATTR_DATA: + case ATTR_ALLOC: + case ATTR_BITMAP: + case ATTR_REPARSE: + case ATTR_EA_INFO: + case ATTR_EA: + case ATTR_PROPERTYSET: + case ATTR_LOGGED_UTILITY_STREAM: + break; + + default: + return false; + } + + return true; +} + +static inline bool check_file_record(const struct MFT_REC *rec, + const struct MFT_REC *rec2, + struct ntfs_sb_info *sbi) +{ + const struct ATTRIB *attr; + u16 fo = le16_to_cpu(rec->rhdr.fix_off); + u16 fn = le16_to_cpu(rec->rhdr.fix_num); + u16 ao = le16_to_cpu(rec->attr_off); + u32 rs = sbi->record_size; + + /* Check the file record header for consistency. */ + if (rec->rhdr.sign != NTFS_FILE_SIGNATURE || + fo > (SECTOR_SIZE - ((rs >> SECTOR_SHIFT) + 1) * sizeof(short)) || + (fn - 1) * SECTOR_SIZE != rs || ao < MFTRECORD_FIXUP_OFFSET_1 || + ao > sbi->record_size - SIZEOF_RESIDENT || !is_rec_inuse(rec) || + le32_to_cpu(rec->total) != rs) { + return false; + } + + /* Loop to check all of the attributes. */ + for (attr = Add2Ptr(rec, ao); attr->type != ATTR_END; + attr = Add2Ptr(attr, le32_to_cpu(attr->size))) { + if (check_attr(rec, attr, sbi)) + continue; + return false; + } + + return true; +} + +static inline int check_lsn(const struct NTFS_RECORD_HEADER *hdr, + const u64 *rlsn) +{ + u64 lsn; + + if (!rlsn) + return true; + + lsn = le64_to_cpu(hdr->lsn); + + if (hdr->sign == NTFS_HOLE_SIGNATURE) + return false; + + if (*rlsn > lsn) + return true; + + return false; +} + +static inline bool check_if_attr(const struct MFT_REC *rec, + const struct LOG_REC_HDR *lrh) +{ + u16 ro = le16_to_cpu(lrh->record_off); + u16 o = le16_to_cpu(rec->attr_off); + const struct ATTRIB *attr = Add2Ptr(rec, o); + + while (o < ro) { + u32 asize; + + if (attr->type == ATTR_END) + break; + + asize = le32_to_cpu(attr->size); + if (!asize) + break; + + o += asize; + attr = Add2Ptr(attr, asize); + } + + return o == ro; +} + +static inline bool check_if_index_root(const struct MFT_REC *rec, + const struct LOG_REC_HDR *lrh) +{ + u16 ro = le16_to_cpu(lrh->record_off); + u16 o = le16_to_cpu(rec->attr_off); + const struct ATTRIB *attr = Add2Ptr(rec, o); + + while (o < ro) { + u32 asize; + + if (attr->type == ATTR_END) + break; + + asize = le32_to_cpu(attr->size); + if (!asize) + break; + + o += asize; + attr = Add2Ptr(attr, asize); + } + + return o == ro && attr->type == ATTR_ROOT; +} + +static inline bool check_if_root_index(const struct ATTRIB *attr, + const struct INDEX_HDR *hdr, + const struct LOG_REC_HDR *lrh) +{ + u16 ao = le16_to_cpu(lrh->attr_off); + u32 de_off = le32_to_cpu(hdr->de_off); + u32 o = PtrOffset(attr, hdr) + de_off; + const struct NTFS_DE *e = Add2Ptr(hdr, de_off); + u32 asize = le32_to_cpu(attr->size); + + while (o < ao) { + u16 esize; + + if (o >= asize) + break; + + esize = le16_to_cpu(e->size); + if (!esize) + break; + + o += esize; + e = Add2Ptr(e, esize); + } + + return o == ao; +} + +static inline bool check_if_alloc_index(const struct INDEX_HDR *hdr, + u32 attr_off) +{ + u32 de_off = le32_to_cpu(hdr->de_off); + u32 o = offsetof(struct INDEX_BUFFER, ihdr) + de_off; + const struct NTFS_DE *e = Add2Ptr(hdr, de_off); + u32 used = le32_to_cpu(hdr->used); + + while (o < attr_off) { + u16 esize; + + if (de_off >= used) + break; + + esize = le16_to_cpu(e->size); + if (!esize) + break; + + o += esize; + de_off += esize; + e = Add2Ptr(e, esize); + } + + return o == attr_off; +} + +static inline void change_attr_size(struct MFT_REC *rec, struct ATTRIB *attr, + u32 nsize) +{ + u32 asize = le32_to_cpu(attr->size); + int dsize = nsize - asize; + u8 *next = Add2Ptr(attr, asize); + u32 used = le32_to_cpu(rec->used); + + memmove(Add2Ptr(attr, nsize), next, used - PtrOffset(rec, next)); + + rec->used = cpu_to_le32(used + dsize); + attr->size = cpu_to_le32(nsize); +} + +struct OpenAttr { + struct ATTRIB *attr; + struct runs_tree *run1; + struct runs_tree run0; + struct ntfs_inode *ni; + // CLST rno; +}; + +/* + * cmp_type_and_name + * + * Return: 0 if 'attr' has the same type and name. + */ +static inline int cmp_type_and_name(const struct ATTRIB *a1, + const struct ATTRIB *a2) +{ + return a1->type != a2->type || a1->name_len != a2->name_len || + (a1->name_len && memcmp(attr_name(a1), attr_name(a2), + a1->name_len * sizeof(short))); +} + +static struct OpenAttr *find_loaded_attr(struct ntfs_log *log, + const struct ATTRIB *attr, CLST rno) +{ + struct OPEN_ATTR_ENRTY *oe = NULL; + + while ((oe = enum_rstbl(log->open_attr_tbl, oe))) { + struct OpenAttr *op_attr; + + if (ino_get(&oe->ref) != rno) + continue; + + op_attr = (struct OpenAttr *)oe->ptr; + if (!cmp_type_and_name(op_attr->attr, attr)) + return op_attr; + } + return NULL; +} + +static struct ATTRIB *attr_create_nonres_log(struct ntfs_sb_info *sbi, + enum ATTR_TYPE type, u64 size, + const u16 *name, size_t name_len, + __le16 flags) +{ + struct ATTRIB *attr; + u32 name_size = ALIGN(name_len * sizeof(short), 8); + bool is_ext = flags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED); + u32 asize = name_size + + (is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT); + + attr = kzalloc(asize, GFP_NOFS); + if (!attr) + return NULL; + + attr->type = type; + attr->size = cpu_to_le32(asize); + attr->flags = flags; + attr->non_res = 1; + attr->name_len = name_len; + + attr->nres.evcn = cpu_to_le64((u64)bytes_to_cluster(sbi, size) - 1); + attr->nres.alloc_size = cpu_to_le64(ntfs_up_cluster(sbi, size)); + attr->nres.data_size = cpu_to_le64(size); + attr->nres.valid_size = attr->nres.data_size; + if (is_ext) { + attr->name_off = SIZEOF_NONRESIDENT_EX_LE; + if (is_attr_compressed(attr)) + attr->nres.c_unit = COMPRESSION_UNIT; + + attr->nres.run_off = + cpu_to_le16(SIZEOF_NONRESIDENT_EX + name_size); + memcpy(Add2Ptr(attr, SIZEOF_NONRESIDENT_EX), name, + name_len * sizeof(short)); + } else { + attr->name_off = SIZEOF_NONRESIDENT_LE; + attr->nres.run_off = + cpu_to_le16(SIZEOF_NONRESIDENT + name_size); + memcpy(Add2Ptr(attr, SIZEOF_NONRESIDENT), name, + name_len * sizeof(short)); + } + + return attr; +} + +/* + * do_action - Common routine for the Redo and Undo Passes. + * @rlsn: If it is NULL then undo. + */ +static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, + const struct LOG_REC_HDR *lrh, u32 op, void *data, + u32 dlen, u32 rec_len, const u64 *rlsn) +{ + int err = 0; + struct ntfs_sb_info *sbi = log->ni->mi.sbi; + struct inode *inode = NULL, *inode_parent; + struct mft_inode *mi = NULL, *mi2_child = NULL; + CLST rno = 0, rno_base = 0; + struct INDEX_BUFFER *ib = NULL; + struct MFT_REC *rec = NULL; + struct ATTRIB *attr = NULL, *attr2; + struct INDEX_HDR *hdr; + struct INDEX_ROOT *root; + struct NTFS_DE *e, *e1, *e2; + struct NEW_ATTRIBUTE_SIZES *new_sz; + struct ATTR_FILE_NAME *fname; + struct OpenAttr *oa, *oa2; + u32 nsize, t32, asize, used, esize, bmp_off, bmp_bits; + u16 id, id2; + u32 record_size = sbi->record_size; + u64 t64; + u16 roff = le16_to_cpu(lrh->record_off); + u16 aoff = le16_to_cpu(lrh->attr_off); + u64 lco = 0; + u64 cbo = (u64)le16_to_cpu(lrh->cluster_off) << SECTOR_SHIFT; + u64 tvo = le64_to_cpu(lrh->target_vcn) << sbi->cluster_bits; + u64 vbo = cbo + tvo; + void *buffer_le = NULL; + u32 bytes = 0; + bool a_dirty = false; + u16 data_off; + + oa = oe->ptr; + + /* Big switch to prepare. */ + switch (op) { + /* ============================================================ + * Process MFT records, as described by the current log record. + * ============================================================ + */ + case InitializeFileRecordSegment: + case DeallocateFileRecordSegment: + case WriteEndOfFileRecordSegment: + case CreateAttribute: + case DeleteAttribute: + case UpdateResidentValue: + case UpdateMappingPairs: + case SetNewAttributeSizes: + case AddIndexEntryRoot: + case DeleteIndexEntryRoot: + case SetIndexEntryVcnRoot: + case UpdateFileNameRoot: + case UpdateRecordDataRoot: + case ZeroEndOfFileRecord: + rno = vbo >> sbi->record_bits; + inode = ilookup(sbi->sb, rno); + if (inode) { + mi = &ntfs_i(inode)->mi; + } else if (op == InitializeFileRecordSegment) { + mi = kzalloc(sizeof(struct mft_inode), GFP_NOFS); + if (!mi) + return -ENOMEM; + err = mi_format_new(mi, sbi, rno, 0, false); + if (err) + goto out; + } else { + /* Read from disk. */ + err = mi_get(sbi, rno, &mi); + if (err) + return err; + } + rec = mi->mrec; + + if (op == DeallocateFileRecordSegment) + goto skip_load_parent; + + if (InitializeFileRecordSegment != op) { + if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE) + goto dirty_vol; + if (!check_lsn(&rec->rhdr, rlsn)) + goto out; + if (!check_file_record(rec, NULL, sbi)) + goto dirty_vol; + attr = Add2Ptr(rec, roff); + } + + if (is_rec_base(rec) || InitializeFileRecordSegment == op) { + rno_base = rno; + goto skip_load_parent; + } + + rno_base = ino_get(&rec->parent_ref); + inode_parent = ntfs_iget5(sbi->sb, &rec->parent_ref, NULL); + if (IS_ERR(inode_parent)) + goto skip_load_parent; + + if (is_bad_inode(inode_parent)) { + iput(inode_parent); + goto skip_load_parent; + } + + if (ni_load_mi_ex(ntfs_i(inode_parent), rno, &mi2_child)) { + iput(inode_parent); + } else { + if (mi2_child->mrec != mi->mrec) + memcpy(mi2_child->mrec, mi->mrec, + sbi->record_size); + + if (inode) + iput(inode); + else if (mi) + mi_put(mi); + + inode = inode_parent; + mi = mi2_child; + rec = mi2_child->mrec; + attr = Add2Ptr(rec, roff); + } + +skip_load_parent: + inode_parent = NULL; + break; + + /* + * Process attributes, as described by the current log record. + */ + case UpdateNonresidentValue: + case AddIndexEntryAllocation: + case DeleteIndexEntryAllocation: + case WriteEndOfIndexBuffer: + case SetIndexEntryVcnAllocation: + case UpdateFileNameAllocation: + case SetBitsInNonresidentBitMap: + case ClearBitsInNonresidentBitMap: + case UpdateRecordDataAllocation: + attr = oa->attr; + bytes = UpdateNonresidentValue == op ? dlen : 0; + lco = (u64)le16_to_cpu(lrh->lcns_follow) << sbi->cluster_bits; + + if (attr->type == ATTR_ALLOC) { + t32 = le32_to_cpu(oe->bytes_per_index); + if (bytes < t32) + bytes = t32; + } + + if (!bytes) + bytes = lco - cbo; + + bytes += roff; + if (attr->type == ATTR_ALLOC) + bytes = (bytes + 511) & ~511; // align + + buffer_le = kmalloc(bytes, GFP_NOFS); + if (!buffer_le) + return -ENOMEM; + + err = ntfs_read_run_nb(sbi, oa->run1, vbo, buffer_le, bytes, + NULL); + if (err) + goto out; + + if (attr->type == ATTR_ALLOC && *(int *)buffer_le) + ntfs_fix_post_read(buffer_le, bytes, false); + break; + + default: + WARN_ON(1); + } + + /* Big switch to do operation. */ + switch (op) { + case InitializeFileRecordSegment: + if (roff + dlen > record_size) + goto dirty_vol; + + memcpy(Add2Ptr(rec, roff), data, dlen); + mi->dirty = true; + break; + + case DeallocateFileRecordSegment: + clear_rec_inuse(rec); + le16_add_cpu(&rec->seq, 1); + mi->dirty = true; + break; + + case WriteEndOfFileRecordSegment: + attr2 = (struct ATTRIB *)data; + if (!check_if_attr(rec, lrh) || roff + dlen > record_size) + goto dirty_vol; + + memmove(attr, attr2, dlen); + rec->used = cpu_to_le32(ALIGN(roff + dlen, 8)); + + mi->dirty = true; + break; + + case CreateAttribute: + attr2 = (struct ATTRIB *)data; + asize = le32_to_cpu(attr2->size); + used = le32_to_cpu(rec->used); + + if (!check_if_attr(rec, lrh) || dlen < SIZEOF_RESIDENT || + !IS_ALIGNED(asize, 8) || + Add2Ptr(attr2, asize) > Add2Ptr(lrh, rec_len) || + dlen > record_size - used) { + goto dirty_vol; + } + + memmove(Add2Ptr(attr, asize), attr, used - roff); + memcpy(attr, attr2, asize); + + rec->used = cpu_to_le32(used + asize); + id = le16_to_cpu(rec->next_attr_id); + id2 = le16_to_cpu(attr2->id); + if (id <= id2) + rec->next_attr_id = cpu_to_le16(id2 + 1); + if (is_attr_indexed(attr)) + le16_add_cpu(&rec->hard_links, 1); + + oa2 = find_loaded_attr(log, attr, rno_base); + if (oa2) { + void *p2 = kmemdup(attr, le32_to_cpu(attr->size), + GFP_NOFS); + if (p2) { + // run_close(oa2->run1); + kfree(oa2->attr); + oa2->attr = p2; + } + } + + mi->dirty = true; + break; + + case DeleteAttribute: + asize = le32_to_cpu(attr->size); + used = le32_to_cpu(rec->used); + + if (!check_if_attr(rec, lrh)) + goto dirty_vol; + + rec->used = cpu_to_le32(used - asize); + if (is_attr_indexed(attr)) + le16_add_cpu(&rec->hard_links, -1); + + memmove(attr, Add2Ptr(attr, asize), used - asize - roff); + + mi->dirty = true; + break; + + case UpdateResidentValue: + nsize = aoff + dlen; + + if (!check_if_attr(rec, lrh)) + goto dirty_vol; + + asize = le32_to_cpu(attr->size); + used = le32_to_cpu(rec->used); + + if (lrh->redo_len == lrh->undo_len) { + if (nsize > asize) + goto dirty_vol; + goto move_data; + } + + if (nsize > asize && nsize - asize > record_size - used) + goto dirty_vol; + + nsize = ALIGN(nsize, 8); + data_off = le16_to_cpu(attr->res.data_off); + + if (nsize < asize) { + memmove(Add2Ptr(attr, aoff), data, dlen); + data = NULL; // To skip below memmove(). + } + + memmove(Add2Ptr(attr, nsize), Add2Ptr(attr, asize), + used - le16_to_cpu(lrh->record_off) - asize); + + rec->used = cpu_to_le32(used + nsize - asize); + attr->size = cpu_to_le32(nsize); + attr->res.data_size = cpu_to_le32(aoff + dlen - data_off); + +move_data: + if (data) + memmove(Add2Ptr(attr, aoff), data, dlen); + + oa2 = find_loaded_attr(log, attr, rno_base); + if (oa2) { + void *p2 = kmemdup(attr, le32_to_cpu(attr->size), + GFP_NOFS); + if (p2) { + // run_close(&oa2->run0); + oa2->run1 = &oa2->run0; + kfree(oa2->attr); + oa2->attr = p2; + } + } + + mi->dirty = true; + break; + + case UpdateMappingPairs: + nsize = aoff + dlen; + asize = le32_to_cpu(attr->size); + used = le32_to_cpu(rec->used); + + if (!check_if_attr(rec, lrh) || !attr->non_res || + aoff < le16_to_cpu(attr->nres.run_off) || aoff > asize || + (nsize > asize && nsize - asize > record_size - used)) { + goto dirty_vol; + } + + nsize = ALIGN(nsize, 8); + + memmove(Add2Ptr(attr, nsize), Add2Ptr(attr, asize), + used - le16_to_cpu(lrh->record_off) - asize); + rec->used = cpu_to_le32(used + nsize - asize); + attr->size = cpu_to_le32(nsize); + memmove(Add2Ptr(attr, aoff), data, dlen); + + if (run_get_highest_vcn(le64_to_cpu(attr->nres.svcn), + attr_run(attr), &t64)) { + goto dirty_vol; + } + + attr->nres.evcn = cpu_to_le64(t64); + oa2 = find_loaded_attr(log, attr, rno_base); + if (oa2 && oa2->attr->non_res) + oa2->attr->nres.evcn = attr->nres.evcn; + + mi->dirty = true; + break; + + case SetNewAttributeSizes: + new_sz = data; + if (!check_if_attr(rec, lrh) || !attr->non_res) + goto dirty_vol; + + attr->nres.alloc_size = new_sz->alloc_size; + attr->nres.data_size = new_sz->data_size; + attr->nres.valid_size = new_sz->valid_size; + + if (dlen >= sizeof(struct NEW_ATTRIBUTE_SIZES)) + attr->nres.total_size = new_sz->total_size; + + oa2 = find_loaded_attr(log, attr, rno_base); + if (oa2) { + void *p2 = kmemdup(attr, le32_to_cpu(attr->size), + GFP_NOFS); + if (p2) { + kfree(oa2->attr); + oa2->attr = p2; + } + } + mi->dirty = true; + break; + + case AddIndexEntryRoot: + e = (struct NTFS_DE *)data; + esize = le16_to_cpu(e->size); + root = resident_data(attr); + hdr = &root->ihdr; + used = le32_to_cpu(hdr->used); + + if (!check_if_index_root(rec, lrh) || + !check_if_root_index(attr, hdr, lrh) || + Add2Ptr(data, esize) > Add2Ptr(lrh, rec_len) || + esize > le32_to_cpu(rec->total) - le32_to_cpu(rec->used)) { + goto dirty_vol; + } + + e1 = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); + + change_attr_size(rec, attr, le32_to_cpu(attr->size) + esize); + + memmove(Add2Ptr(e1, esize), e1, + PtrOffset(e1, Add2Ptr(hdr, used))); + memmove(e1, e, esize); + + le32_add_cpu(&attr->res.data_size, esize); + hdr->used = cpu_to_le32(used + esize); + le32_add_cpu(&hdr->total, esize); + + mi->dirty = true; + break; + + case DeleteIndexEntryRoot: + root = resident_data(attr); + hdr = &root->ihdr; + used = le32_to_cpu(hdr->used); + + if (!check_if_index_root(rec, lrh) || + !check_if_root_index(attr, hdr, lrh)) { + goto dirty_vol; + } + + e1 = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); + esize = le16_to_cpu(e1->size); + e2 = Add2Ptr(e1, esize); + + memmove(e1, e2, PtrOffset(e2, Add2Ptr(hdr, used))); + + le32_sub_cpu(&attr->res.data_size, esize); + hdr->used = cpu_to_le32(used - esize); + le32_sub_cpu(&hdr->total, esize); + + change_attr_size(rec, attr, le32_to_cpu(attr->size) - esize); + + mi->dirty = true; + break; + + case SetIndexEntryVcnRoot: + root = resident_data(attr); + hdr = &root->ihdr; + + if (!check_if_index_root(rec, lrh) || + !check_if_root_index(attr, hdr, lrh)) { + goto dirty_vol; + } + + e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); + + de_set_vbn_le(e, *(__le64 *)data); + mi->dirty = true; + break; + + case UpdateFileNameRoot: + root = resident_data(attr); + hdr = &root->ihdr; + + if (!check_if_index_root(rec, lrh) || + !check_if_root_index(attr, hdr, lrh)) { + goto dirty_vol; + } + + e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); + fname = (struct ATTR_FILE_NAME *)(e + 1); + memmove(&fname->dup, data, sizeof(fname->dup)); // + mi->dirty = true; + break; + + case UpdateRecordDataRoot: + root = resident_data(attr); + hdr = &root->ihdr; + + if (!check_if_index_root(rec, lrh) || + !check_if_root_index(attr, hdr, lrh)) { + goto dirty_vol; + } + + e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); + + memmove(Add2Ptr(e, le16_to_cpu(e->view.data_off)), data, dlen); + + mi->dirty = true; + break; + + case ZeroEndOfFileRecord: + if (roff + dlen > record_size) + goto dirty_vol; + + memset(attr, 0, dlen); + mi->dirty = true; + break; + + case UpdateNonresidentValue: + if (lco < cbo + roff + dlen) + goto dirty_vol; + + memcpy(Add2Ptr(buffer_le, roff), data, dlen); + + a_dirty = true; + if (attr->type == ATTR_ALLOC) + ntfs_fix_pre_write(buffer_le, bytes); + break; + + case AddIndexEntryAllocation: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = data; + esize = le16_to_cpu(e->size); + e1 = Add2Ptr(ib, aoff); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + + used = le32_to_cpu(hdr->used); + + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff) || + Add2Ptr(e, esize) > Add2Ptr(lrh, rec_len) || + used + esize > le32_to_cpu(hdr->total)) { + goto dirty_vol; + } + + memmove(Add2Ptr(e1, esize), e1, + PtrOffset(e1, Add2Ptr(hdr, used))); + memcpy(e1, e, esize); + + hdr->used = cpu_to_le32(used + esize); + + a_dirty = true; + + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + case DeleteIndexEntryAllocation: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = Add2Ptr(ib, aoff); + esize = le16_to_cpu(e->size); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff)) { + goto dirty_vol; + } + + e1 = Add2Ptr(e, esize); + nsize = esize; + used = le32_to_cpu(hdr->used); + + memmove(e, e1, PtrOffset(e1, Add2Ptr(hdr, used))); + + hdr->used = cpu_to_le32(used - nsize); + + a_dirty = true; + + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + case WriteEndOfIndexBuffer: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = Add2Ptr(ib, aoff); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff) || + aoff + dlen > offsetof(struct INDEX_BUFFER, ihdr) + + le32_to_cpu(hdr->total)) { + goto dirty_vol; + } + + hdr->used = cpu_to_le32(dlen + PtrOffset(hdr, e)); + memmove(e, data, dlen); + + a_dirty = true; + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + case SetIndexEntryVcnAllocation: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = Add2Ptr(ib, aoff); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff)) { + goto dirty_vol; + } + + de_set_vbn_le(e, *(__le64 *)data); + + a_dirty = true; + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + case UpdateFileNameAllocation: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = Add2Ptr(ib, aoff); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff)) { + goto dirty_vol; + } + + fname = (struct ATTR_FILE_NAME *)(e + 1); + memmove(&fname->dup, data, sizeof(fname->dup)); + + a_dirty = true; + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + case SetBitsInNonresidentBitMap: + bmp_off = + le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off); + bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits); + + if (cbo + (bmp_off + 7) / 8 > lco || + cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) { + goto dirty_vol; + } + + __bitmap_set(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits); + a_dirty = true; + break; + + case ClearBitsInNonresidentBitMap: + bmp_off = + le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off); + bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits); + + if (cbo + (bmp_off + 7) / 8 > lco || + cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) { + goto dirty_vol; + } + + __bitmap_clear(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits); + a_dirty = true; + break; + + case UpdateRecordDataAllocation: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = Add2Ptr(ib, aoff); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff)) { + goto dirty_vol; + } + + memmove(Add2Ptr(e, le16_to_cpu(e->view.data_off)), data, dlen); + + a_dirty = true; + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + default: + WARN_ON(1); + } + + if (rlsn) { + __le64 t64 = cpu_to_le64(*rlsn); + + if (rec) + rec->rhdr.lsn = t64; + if (ib) + ib->rhdr.lsn = t64; + } + + if (mi && mi->dirty) { + err = mi_write(mi, 0); + if (err) + goto out; + } + + if (a_dirty) { + attr = oa->attr; + err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0); + if (err) + goto out; + } + +out: + + if (inode) + iput(inode); + else if (mi != mi2_child) + mi_put(mi); + + kfree(buffer_le); + + return err; + +dirty_vol: + log->set_dirty = true; + goto out; +} + +/* + * log_replay - Replays log and empties it. + * + * This function is called during mount operation. + * It replays log and empties it. + * Initialized is set false if logfile contains '-1'. + */ +int log_replay(struct ntfs_inode *ni, bool *initialized) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ntfs_log *log; + + struct restart_info rst_info, rst_info2; + u64 rec_lsn, ra_lsn, checkpt_lsn = 0, rlsn = 0; + struct ATTR_NAME_ENTRY *attr_names = NULL; + struct ATTR_NAME_ENTRY *ane; + struct RESTART_TABLE *dptbl = NULL; + struct RESTART_TABLE *trtbl = NULL; + const struct RESTART_TABLE *rt; + struct RESTART_TABLE *oatbl = NULL; + struct inode *inode; + struct OpenAttr *oa; + struct ntfs_inode *ni_oe; + struct ATTRIB *attr = NULL; + u64 size, vcn, undo_next_lsn; + CLST rno, lcn, lcn0, len0, clen; + void *data; + struct NTFS_RESTART *rst = NULL; + struct lcb *lcb = NULL; + struct OPEN_ATTR_ENRTY *oe; + struct TRANSACTION_ENTRY *tr; + struct DIR_PAGE_ENTRY *dp; + u32 i, bytes_per_attr_entry; + u32 l_size = ni->vfs_inode.i_size; + u32 orig_file_size = l_size; + u32 page_size, vbo, tail, off, dlen; + u32 saved_len, rec_len, transact_id; + bool use_second_page; + struct RESTART_AREA *ra2, *ra = NULL; + struct CLIENT_REC *ca, *cr; + __le16 client; + struct RESTART_HDR *rh; + const struct LFS_RECORD_HDR *frh; + const struct LOG_REC_HDR *lrh; + bool is_mapped; + bool is_ro = sb_rdonly(sbi->sb); + u64 t64; + u16 t16; + u32 t32; + + /* Get the size of page. NOTE: To replay we can use default page. */ +#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 + page_size = norm_file_page(PAGE_SIZE, &l_size, true); +#else + page_size = norm_file_page(PAGE_SIZE, &l_size, false); +#endif + if (!page_size) + return -EINVAL; + + log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS); + if (!log) + return -ENOMEM; + + memset(&rst_info, 0, sizeof(struct restart_info)); + + log->ni = ni; + log->l_size = l_size; + log->one_page_buf = kmalloc(page_size, GFP_NOFS); + if (!log->one_page_buf) { + err = -ENOMEM; + goto out; + } + + log->page_size = page_size; + log->page_mask = page_size - 1; + log->page_bits = blksize_bits(page_size); + + /* Look for a restart area on the disk. */ + err = log_read_rst(log, l_size, true, &rst_info); + if (err) + goto out; + + /* remember 'initialized' */ + *initialized = rst_info.initialized; + + if (!rst_info.restart) { + if (rst_info.initialized) { + /* No restart area but the file is not initialized. */ + err = -EINVAL; + goto out; + } + + log_init_pg_hdr(log, page_size, page_size, 1, 1); + log_create(log, l_size, 0, get_random_u32(), false, false); + + log->ra = ra; + + ra = log_create_ra(log); + if (!ra) { + err = -ENOMEM; + goto out; + } + log->ra = ra; + log->init_ra = true; + + goto process_log; + } + + /* + * If the restart offset above wasn't zero then we won't + * look for a second restart. + */ + if (rst_info.vbo) + goto check_restart_area; + + memset(&rst_info2, 0, sizeof(struct restart_info)); + err = log_read_rst(log, l_size, false, &rst_info2); + if (err) + goto out; + + /* Determine which restart area to use. */ + if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn) + goto use_first_page; + + use_second_page = true; + + if (rst_info.chkdsk_was_run && page_size != rst_info.vbo) { + struct RECORD_PAGE_HDR *sp = NULL; + bool usa_error; + + if (!read_log_page(log, page_size, &sp, &usa_error) && + sp->rhdr.sign == NTFS_CHKD_SIGNATURE) { + use_second_page = false; + } + kfree(sp); + } + + if (use_second_page) { + kfree(rst_info.r_page); + memcpy(&rst_info, &rst_info2, sizeof(struct restart_info)); + rst_info2.r_page = NULL; + } + +use_first_page: + kfree(rst_info2.r_page); + +check_restart_area: + /* + * If the restart area is at offset 0, we want + * to write the second restart area first. + */ + log->init_ra = !!rst_info.vbo; + + /* If we have a valid page then grab a pointer to the restart area. */ + ra2 = rst_info.valid_page + ? Add2Ptr(rst_info.r_page, + le16_to_cpu(rst_info.r_page->ra_off)) + : NULL; + + if (rst_info.chkdsk_was_run || + (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { + bool wrapped = false; + bool use_multi_page = false; + u32 open_log_count; + + /* Do some checks based on whether we have a valid log page. */ + if (!rst_info.valid_page) { + open_log_count = get_random_u32(); + goto init_log_instance; + } + open_log_count = le32_to_cpu(ra2->open_log_count); + + /* + * If the restart page size isn't changing then we want to + * check how much work we need to do. + */ + if (page_size != le32_to_cpu(rst_info.r_page->sys_page_size)) + goto init_log_instance; + +init_log_instance: + log_init_pg_hdr(log, page_size, page_size, 1, 1); + + log_create(log, l_size, rst_info.last_lsn, open_log_count, + wrapped, use_multi_page); + + ra = log_create_ra(log); + if (!ra) { + err = -ENOMEM; + goto out; + } + log->ra = ra; + + /* Put the restart areas and initialize + * the log file as required. + */ + goto process_log; + } + + if (!ra2) { + err = -EINVAL; + goto out; + } + + /* + * If the log page or the system page sizes have changed, we can't + * use the log file. We must use the system page size instead of the + * default size if there is not a clean shutdown. + */ + t32 = le32_to_cpu(rst_info.r_page->sys_page_size); + if (page_size != t32) { + l_size = orig_file_size; + page_size = + norm_file_page(t32, &l_size, t32 == DefaultLogPageSize); + } + + if (page_size != t32 || + page_size != le32_to_cpu(rst_info.r_page->page_size)) { + err = -EINVAL; + goto out; + } + + /* If the file size has shrunk then we won't mount it. */ + if (l_size < le64_to_cpu(ra2->l_size)) { + err = -EINVAL; + goto out; + } + + log_init_pg_hdr(log, page_size, page_size, + le16_to_cpu(rst_info.r_page->major_ver), + le16_to_cpu(rst_info.r_page->minor_ver)); + + log->l_size = le64_to_cpu(ra2->l_size); + log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits); + log->file_data_bits = sizeof(u64) * 8 - log->seq_num_bits; + log->seq_num_mask = (8 << log->file_data_bits) - 1; + log->last_lsn = le64_to_cpu(ra2->current_lsn); + log->seq_num = log->last_lsn >> log->file_data_bits; + log->ra_off = le16_to_cpu(rst_info.r_page->ra_off); + log->restart_size = log->sys_page_size - log->ra_off; + log->record_header_len = le16_to_cpu(ra2->rec_hdr_len); + log->ra_size = le16_to_cpu(ra2->ra_len); + log->data_off = le16_to_cpu(ra2->data_off); + log->data_size = log->page_size - log->data_off; + log->reserved = log->data_size - log->record_header_len; + + vbo = lsn_to_vbo(log, log->last_lsn); + + if (vbo < log->first_page) { + /* This is a pseudo lsn. */ + log->l_flags |= NTFSLOG_NO_LAST_LSN; + log->next_page = log->first_page; + goto find_oldest; + } + + /* Find the end of this log record. */ + off = final_log_off(log, log->last_lsn, + le32_to_cpu(ra2->last_lsn_data_len)); + + /* If we wrapped the file then increment the sequence number. */ + if (off <= vbo) { + log->seq_num += 1; + log->l_flags |= NTFSLOG_WRAPPED; + } + + /* Now compute the next log page to use. */ + vbo &= ~log->sys_page_mask; + tail = log->page_size - (off & log->page_mask) - 1; + + /* + *If we can fit another log record on the page, + * move back a page the log file. + */ + if (tail >= log->record_header_len) { + log->l_flags |= NTFSLOG_REUSE_TAIL; + log->next_page = vbo; + } else { + log->next_page = next_page_off(log, vbo); + } + +find_oldest: + /* + * Find the oldest client lsn. Use the last + * flushed lsn as a starting point. + */ + log->oldest_lsn = log->last_lsn; + oldest_client_lsn(Add2Ptr(ra2, le16_to_cpu(ra2->client_off)), + ra2->client_idx[1], &log->oldest_lsn); + log->oldest_lsn_off = lsn_to_vbo(log, log->oldest_lsn); + + if (log->oldest_lsn_off < log->first_page) + log->l_flags |= NTFSLOG_NO_OLDEST_LSN; + + if (!(ra2->flags & RESTART_SINGLE_PAGE_IO)) + log->l_flags |= NTFSLOG_WRAPPED | NTFSLOG_MULTIPLE_PAGE_IO; + + log->current_openlog_count = le32_to_cpu(ra2->open_log_count); + log->total_avail_pages = log->l_size - log->first_page; + log->total_avail = log->total_avail_pages >> log->page_bits; + log->max_current_avail = log->total_avail * log->reserved; + log->total_avail = log->total_avail * log->data_size; + + log->current_avail = current_log_avail(log); + + ra = kzalloc(log->restart_size, GFP_NOFS); + if (!ra) { + err = -ENOMEM; + goto out; + } + log->ra = ra; + + t16 = le16_to_cpu(ra2->client_off); + if (t16 == offsetof(struct RESTART_AREA, clients)) { + memcpy(ra, ra2, log->ra_size); + } else { + memcpy(ra, ra2, offsetof(struct RESTART_AREA, clients)); + memcpy(ra->clients, Add2Ptr(ra2, t16), + le16_to_cpu(ra2->ra_len) - t16); + + log->current_openlog_count = get_random_u32(); + ra->open_log_count = cpu_to_le32(log->current_openlog_count); + log->ra_size = offsetof(struct RESTART_AREA, clients) + + sizeof(struct CLIENT_REC); + ra->client_off = + cpu_to_le16(offsetof(struct RESTART_AREA, clients)); + ra->ra_len = cpu_to_le16(log->ra_size); + } + + le32_add_cpu(&ra->open_log_count, 1); + + /* Now we need to walk through looking for the last lsn. */ + err = last_log_lsn(log); + if (err) + goto out; + + log->current_avail = current_log_avail(log); + + /* Remember which restart area to write first. */ + log->init_ra = rst_info.vbo; + +process_log: + /* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */ + switch ((log->major_ver << 16) + log->minor_ver) { + case 0x10000: + case 0x10001: + case 0x20000: + break; + default: + ntfs_warn(sbi->sb, "\x24LogFile version %d.%d is not supported", + log->major_ver, log->minor_ver); + err = -EOPNOTSUPP; + log->set_dirty = true; + goto out; + } + + /* One client "NTFS" per logfile. */ + ca = Add2Ptr(ra, le16_to_cpu(ra->client_off)); + + for (client = ra->client_idx[1];; client = cr->next_client) { + if (client == LFS_NO_CLIENT_LE) { + /* Insert "NTFS" client LogFile. */ + client = ra->client_idx[0]; + if (client == LFS_NO_CLIENT_LE) { + err = -EINVAL; + goto out; + } + + t16 = le16_to_cpu(client); + cr = ca + t16; + + remove_client(ca, cr, &ra->client_idx[0]); + + cr->restart_lsn = 0; + cr->oldest_lsn = cpu_to_le64(log->oldest_lsn); + cr->name_bytes = cpu_to_le32(8); + cr->name[0] = cpu_to_le16('N'); + cr->name[1] = cpu_to_le16('T'); + cr->name[2] = cpu_to_le16('F'); + cr->name[3] = cpu_to_le16('S'); + + add_client(ca, t16, &ra->client_idx[1]); + break; + } + + cr = ca + le16_to_cpu(client); + + if (cpu_to_le32(8) == cr->name_bytes && + cpu_to_le16('N') == cr->name[0] && + cpu_to_le16('T') == cr->name[1] && + cpu_to_le16('F') == cr->name[2] && + cpu_to_le16('S') == cr->name[3]) + break; + } + + /* Update the client handle with the client block information. */ + log->client_id.seq_num = cr->seq_num; + log->client_id.client_idx = client; + + err = read_rst_area(log, &rst, &ra_lsn); + if (err) + goto out; + + if (!rst) + goto out; + + bytes_per_attr_entry = !rst->major_ver ? 0x2C : 0x28; + + checkpt_lsn = le64_to_cpu(rst->check_point_start); + if (!checkpt_lsn) + checkpt_lsn = ra_lsn; + + /* Allocate and Read the Transaction Table. */ + if (!rst->transact_table_len) + goto check_dirty_page_table; + + t64 = le64_to_cpu(rst->transact_table_lsn); + err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); + if (err) + goto out; + + lrh = lcb->log_rec; + frh = lcb->lrh; + rec_len = le32_to_cpu(frh->client_data_len); + + if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), + bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + t16 = le16_to_cpu(lrh->redo_off); + + rt = Add2Ptr(lrh, t16); + t32 = rec_len - t16; + + /* Now check that this is a valid restart table. */ + if (!check_rstbl(rt, t32)) { + err = -EINVAL; + goto out; + } + + trtbl = kmemdup(rt, t32, GFP_NOFS); + if (!trtbl) { + err = -ENOMEM; + goto out; + } + + lcb_put(lcb); + lcb = NULL; + +check_dirty_page_table: + /* The next record back should be the Dirty Pages Table. */ + if (!rst->dirty_pages_len) + goto check_attribute_names; + + t64 = le64_to_cpu(rst->dirty_pages_table_lsn); + err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); + if (err) + goto out; + + lrh = lcb->log_rec; + frh = lcb->lrh; + rec_len = le32_to_cpu(frh->client_data_len); + + if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), + bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + t16 = le16_to_cpu(lrh->redo_off); + + rt = Add2Ptr(lrh, t16); + t32 = rec_len - t16; + + /* Now check that this is a valid restart table. */ + if (!check_rstbl(rt, t32)) { + err = -EINVAL; + goto out; + } + + dptbl = kmemdup(rt, t32, GFP_NOFS); + if (!dptbl) { + err = -ENOMEM; + goto out; + } + + /* Convert Ra version '0' into version '1'. */ + if (rst->major_ver) + goto end_conv_1; + + dp = NULL; + while ((dp = enum_rstbl(dptbl, dp))) { + struct DIR_PAGE_ENTRY_32 *dp0 = (struct DIR_PAGE_ENTRY_32 *)dp; + // NOTE: Danger. Check for of boundary. + memmove(&dp->vcn, &dp0->vcn_low, + 2 * sizeof(u64) + + le32_to_cpu(dp->lcns_follow) * sizeof(u64)); + } + +end_conv_1: + lcb_put(lcb); + lcb = NULL; + + /* + * Go through the table and remove the duplicates, + * remembering the oldest lsn values. + */ + if (sbi->cluster_size <= log->page_size) + goto trace_dp_table; + + dp = NULL; + while ((dp = enum_rstbl(dptbl, dp))) { + struct DIR_PAGE_ENTRY *next = dp; + + while ((next = enum_rstbl(dptbl, next))) { + if (next->target_attr == dp->target_attr && + next->vcn == dp->vcn) { + if (le64_to_cpu(next->oldest_lsn) < + le64_to_cpu(dp->oldest_lsn)) { + dp->oldest_lsn = next->oldest_lsn; + } + + free_rsttbl_idx(dptbl, PtrOffset(dptbl, next)); + } + } + } +trace_dp_table: +check_attribute_names: + /* The next record should be the Attribute Names. */ + if (!rst->attr_names_len) + goto check_attr_table; + + t64 = le64_to_cpu(rst->attr_names_lsn); + err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); + if (err) + goto out; + + lrh = lcb->log_rec; + frh = lcb->lrh; + rec_len = le32_to_cpu(frh->client_data_len); + + if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), + bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + t32 = lrh_length(lrh); + rec_len -= t32; + + attr_names = kmemdup(Add2Ptr(lrh, t32), rec_len, GFP_NOFS); + if (!attr_names) { + err = -ENOMEM; + goto out; + } + + lcb_put(lcb); + lcb = NULL; + +check_attr_table: + /* The next record should be the attribute Table. */ + if (!rst->open_attr_len) + goto check_attribute_names2; + + t64 = le64_to_cpu(rst->open_attr_table_lsn); + err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); + if (err) + goto out; + + lrh = lcb->log_rec; + frh = lcb->lrh; + rec_len = le32_to_cpu(frh->client_data_len); + + if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), + bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + t16 = le16_to_cpu(lrh->redo_off); + + rt = Add2Ptr(lrh, t16); + t32 = rec_len - t16; + + if (!check_rstbl(rt, t32)) { + err = -EINVAL; + goto out; + } + + oatbl = kmemdup(rt, t32, GFP_NOFS); + if (!oatbl) { + err = -ENOMEM; + goto out; + } + + log->open_attr_tbl = oatbl; + + /* Clear all of the Attr pointers. */ + oe = NULL; + while ((oe = enum_rstbl(oatbl, oe))) { + if (!rst->major_ver) { + struct OPEN_ATTR_ENRTY_32 oe0; + + /* Really 'oe' points to OPEN_ATTR_ENRTY_32. */ + memcpy(&oe0, oe, SIZEOF_OPENATTRIBUTEENTRY0); + + oe->bytes_per_index = oe0.bytes_per_index; + oe->type = oe0.type; + oe->is_dirty_pages = oe0.is_dirty_pages; + oe->name_len = 0; + oe->ref = oe0.ref; + oe->open_record_lsn = oe0.open_record_lsn; + } + + oe->is_attr_name = 0; + oe->ptr = NULL; + } + + lcb_put(lcb); + lcb = NULL; + +check_attribute_names2: + if (!rst->attr_names_len) + goto trace_attribute_table; + + ane = attr_names; + if (!oatbl) + goto trace_attribute_table; + while (ane->off) { + /* TODO: Clear table on exit! */ + oe = Add2Ptr(oatbl, le16_to_cpu(ane->off)); + t16 = le16_to_cpu(ane->name_bytes); + oe->name_len = t16 / sizeof(short); + oe->ptr = ane->name; + oe->is_attr_name = 2; + ane = Add2Ptr(ane, sizeof(struct ATTR_NAME_ENTRY) + t16); + } + +trace_attribute_table: + /* + * If the checkpt_lsn is zero, then this is a freshly + * formatted disk and we have no work to do. + */ + if (!checkpt_lsn) { + err = 0; + goto out; + } + + if (!oatbl) { + oatbl = init_rsttbl(bytes_per_attr_entry, 8); + if (!oatbl) { + err = -ENOMEM; + goto out; + } + } + + log->open_attr_tbl = oatbl; + + /* Start the analysis pass from the Checkpoint lsn. */ + rec_lsn = checkpt_lsn; + + /* Read the first lsn. */ + err = read_log_rec_lcb(log, checkpt_lsn, lcb_ctx_next, &lcb); + if (err) + goto out; + + /* Loop to read all subsequent records to the end of the log file. */ +next_log_record_analyze: + err = read_next_log_rec(log, lcb, &rec_lsn); + if (err) + goto out; + + if (!rec_lsn) + goto end_log_records_enumerate; + + frh = lcb->lrh; + transact_id = le32_to_cpu(frh->transact_id); + rec_len = le32_to_cpu(frh->client_data_len); + lrh = lcb->log_rec; + + if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + /* + * The first lsn after the previous lsn remembered + * the checkpoint is the first candidate for the rlsn. + */ + if (!rlsn) + rlsn = rec_lsn; + + if (LfsClientRecord != frh->record_type) + goto next_log_record_analyze; + + /* + * Now update the Transaction Table for this transaction. If there + * is no entry present or it is unallocated we allocate the entry. + */ + if (!trtbl) { + trtbl = init_rsttbl(sizeof(struct TRANSACTION_ENTRY), + INITIAL_NUMBER_TRANSACTIONS); + if (!trtbl) { + err = -ENOMEM; + goto out; + } + } + + tr = Add2Ptr(trtbl, transact_id); + + if (transact_id >= bytes_per_rt(trtbl) || + tr->next != RESTART_ENTRY_ALLOCATED_LE) { + tr = alloc_rsttbl_from_idx(&trtbl, transact_id); + if (!tr) { + err = -ENOMEM; + goto out; + } + tr->transact_state = TransactionActive; + tr->first_lsn = cpu_to_le64(rec_lsn); + } + + tr->prev_lsn = tr->undo_next_lsn = cpu_to_le64(rec_lsn); + + /* + * If this is a compensation log record, then change + * the undo_next_lsn to be the undo_next_lsn of this record. + */ + if (lrh->undo_op == cpu_to_le16(CompensationLogRecord)) + tr->undo_next_lsn = frh->client_undo_next_lsn; + + /* Dispatch to handle log record depending on type. */ + switch (le16_to_cpu(lrh->redo_op)) { + case InitializeFileRecordSegment: + case DeallocateFileRecordSegment: + case WriteEndOfFileRecordSegment: + case CreateAttribute: + case DeleteAttribute: + case UpdateResidentValue: + case UpdateNonresidentValue: + case UpdateMappingPairs: + case SetNewAttributeSizes: + case AddIndexEntryRoot: + case DeleteIndexEntryRoot: + case AddIndexEntryAllocation: + case DeleteIndexEntryAllocation: + case WriteEndOfIndexBuffer: + case SetIndexEntryVcnRoot: + case SetIndexEntryVcnAllocation: + case UpdateFileNameRoot: + case UpdateFileNameAllocation: + case SetBitsInNonresidentBitMap: + case ClearBitsInNonresidentBitMap: + case UpdateRecordDataRoot: + case UpdateRecordDataAllocation: + case ZeroEndOfFileRecord: + t16 = le16_to_cpu(lrh->target_attr); + t64 = le64_to_cpu(lrh->target_vcn); + dp = find_dp(dptbl, t16, t64); + + if (dp) + goto copy_lcns; + + /* + * Calculate the number of clusters per page the system + * which wrote the checkpoint, possibly creating the table. + */ + if (dptbl) { + t32 = (le16_to_cpu(dptbl->size) - + sizeof(struct DIR_PAGE_ENTRY)) / + sizeof(u64); + } else { + t32 = log->clst_per_page; + kfree(dptbl); + dptbl = init_rsttbl(struct_size(dp, page_lcns, t32), + 32); + if (!dptbl) { + err = -ENOMEM; + goto out; + } + } + + dp = alloc_rsttbl_idx(&dptbl); + if (!dp) { + err = -ENOMEM; + goto out; + } + dp->target_attr = cpu_to_le32(t16); + dp->transfer_len = cpu_to_le32(t32 << sbi->cluster_bits); + dp->lcns_follow = cpu_to_le32(t32); + dp->vcn = cpu_to_le64(t64 & ~((u64)t32 - 1)); + dp->oldest_lsn = cpu_to_le64(rec_lsn); + +copy_lcns: + /* + * Copy the Lcns from the log record into the Dirty Page Entry. + * TODO: For different page size support, must somehow make + * whole routine a loop, case Lcns do not fit below. + */ + t16 = le16_to_cpu(lrh->lcns_follow); + for (i = 0; i < t16; i++) { + size_t j = (size_t)(le64_to_cpu(lrh->target_vcn) - + le64_to_cpu(dp->vcn)); + dp->page_lcns[j + i] = lrh->page_lcns[i]; + } + + goto next_log_record_analyze; + + case DeleteDirtyClusters: { + u32 range_count = + le16_to_cpu(lrh->redo_len) / sizeof(struct LCN_RANGE); + const struct LCN_RANGE *r = + Add2Ptr(lrh, le16_to_cpu(lrh->redo_off)); + + /* Loop through all of the Lcn ranges this log record. */ + for (i = 0; i < range_count; i++, r++) { + u64 lcn0 = le64_to_cpu(r->lcn); + u64 lcn_e = lcn0 + le64_to_cpu(r->len) - 1; + + dp = NULL; + while ((dp = enum_rstbl(dptbl, dp))) { + u32 j; + + t32 = le32_to_cpu(dp->lcns_follow); + for (j = 0; j < t32; j++) { + t64 = le64_to_cpu(dp->page_lcns[j]); + if (t64 >= lcn0 && t64 <= lcn_e) + dp->page_lcns[j] = 0; + } + } + } + goto next_log_record_analyze; + ; + } + + case OpenNonresidentAttribute: + t16 = le16_to_cpu(lrh->target_attr); + if (t16 >= bytes_per_rt(oatbl)) { + /* + * Compute how big the table needs to be. + * Add 10 extra entries for some cushion. + */ + u32 new_e = t16 / le16_to_cpu(oatbl->size); + + new_e += 10 - le16_to_cpu(oatbl->used); + + oatbl = extend_rsttbl(oatbl, new_e, ~0u); + log->open_attr_tbl = oatbl; + if (!oatbl) { + err = -ENOMEM; + goto out; + } + } + + /* Point to the entry being opened. */ + oe = alloc_rsttbl_from_idx(&oatbl, t16); + log->open_attr_tbl = oatbl; + if (!oe) { + err = -ENOMEM; + goto out; + } + + /* Initialize this entry from the log record. */ + t16 = le16_to_cpu(lrh->redo_off); + if (!rst->major_ver) { + /* Convert version '0' into version '1'. */ + struct OPEN_ATTR_ENRTY_32 *oe0 = Add2Ptr(lrh, t16); + + oe->bytes_per_index = oe0->bytes_per_index; + oe->type = oe0->type; + oe->is_dirty_pages = oe0->is_dirty_pages; + oe->name_len = 0; //oe0.name_len; + oe->ref = oe0->ref; + oe->open_record_lsn = oe0->open_record_lsn; + } else { + memcpy(oe, Add2Ptr(lrh, t16), bytes_per_attr_entry); + } + + t16 = le16_to_cpu(lrh->undo_len); + if (t16) { + oe->ptr = kmalloc(t16, GFP_NOFS); + if (!oe->ptr) { + err = -ENOMEM; + goto out; + } + oe->name_len = t16 / sizeof(short); + memcpy(oe->ptr, + Add2Ptr(lrh, le16_to_cpu(lrh->undo_off)), t16); + oe->is_attr_name = 1; + } else { + oe->ptr = NULL; + oe->is_attr_name = 0; + } + + goto next_log_record_analyze; + + case HotFix: + t16 = le16_to_cpu(lrh->target_attr); + t64 = le64_to_cpu(lrh->target_vcn); + dp = find_dp(dptbl, t16, t64); + if (dp) { + size_t j = le64_to_cpu(lrh->target_vcn) - + le64_to_cpu(dp->vcn); + if (dp->page_lcns[j]) + dp->page_lcns[j] = lrh->page_lcns[0]; + } + goto next_log_record_analyze; + + case EndTopLevelAction: + tr = Add2Ptr(trtbl, transact_id); + tr->prev_lsn = cpu_to_le64(rec_lsn); + tr->undo_next_lsn = frh->client_undo_next_lsn; + goto next_log_record_analyze; + + case PrepareTransaction: + tr = Add2Ptr(trtbl, transact_id); + tr->transact_state = TransactionPrepared; + goto next_log_record_analyze; + + case CommitTransaction: + tr = Add2Ptr(trtbl, transact_id); + tr->transact_state = TransactionCommitted; + goto next_log_record_analyze; + + case ForgetTransaction: + free_rsttbl_idx(trtbl, transact_id); + goto next_log_record_analyze; + + case Noop: + case OpenAttributeTableDump: + case AttributeNamesDump: + case DirtyPageTableDump: + case TransactionTableDump: + /* The following cases require no action the Analysis Pass. */ + goto next_log_record_analyze; + + default: + /* + * All codes will be explicitly handled. + * If we see a code we do not expect, then we are trouble. + */ + goto next_log_record_analyze; + } + +end_log_records_enumerate: + lcb_put(lcb); + lcb = NULL; + + /* + * Scan the Dirty Page Table and Transaction Table for + * the lowest lsn, and return it as the Redo lsn. + */ + dp = NULL; + while ((dp = enum_rstbl(dptbl, dp))) { + t64 = le64_to_cpu(dp->oldest_lsn); + if (t64 && t64 < rlsn) + rlsn = t64; + } + + tr = NULL; + while ((tr = enum_rstbl(trtbl, tr))) { + t64 = le64_to_cpu(tr->first_lsn); + if (t64 && t64 < rlsn) + rlsn = t64; + } + + /* + * Only proceed if the Dirty Page Table or Transaction + * table are not empty. + */ + if ((!dptbl || !dptbl->total) && (!trtbl || !trtbl->total)) + goto end_reply; + + sbi->flags |= NTFS_FLAGS_NEED_REPLAY; + if (is_ro) + goto out; + + /* Reopen all of the attributes with dirty pages. */ + oe = NULL; +next_open_attribute: + + oe = enum_rstbl(oatbl, oe); + if (!oe) { + err = 0; + dp = NULL; + goto next_dirty_page; + } + + oa = kzalloc(sizeof(struct OpenAttr), GFP_NOFS); + if (!oa) { + err = -ENOMEM; + goto out; + } + + inode = ntfs_iget5(sbi->sb, &oe->ref, NULL); + if (IS_ERR(inode)) + goto fake_attr; + + if (is_bad_inode(inode)) { + iput(inode); +fake_attr: + if (oa->ni) { + iput(&oa->ni->vfs_inode); + oa->ni = NULL; + } + + attr = attr_create_nonres_log(sbi, oe->type, 0, oe->ptr, + oe->name_len, 0); + if (!attr) { + kfree(oa); + err = -ENOMEM; + goto out; + } + oa->attr = attr; + oa->run1 = &oa->run0; + goto final_oe; + } + + ni_oe = ntfs_i(inode); + oa->ni = ni_oe; + + attr = ni_find_attr(ni_oe, NULL, NULL, oe->type, oe->ptr, oe->name_len, + NULL, NULL); + + if (!attr) + goto fake_attr; + + t32 = le32_to_cpu(attr->size); + oa->attr = kmemdup(attr, t32, GFP_NOFS); + if (!oa->attr) + goto fake_attr; + + if (!S_ISDIR(inode->i_mode)) { + if (attr->type == ATTR_DATA && !attr->name_len) { + oa->run1 = &ni_oe->file.run; + goto final_oe; + } + } else { + if (attr->type == ATTR_ALLOC && + attr->name_len == ARRAY_SIZE(I30_NAME) && + !memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) { + oa->run1 = &ni_oe->dir.alloc_run; + goto final_oe; + } + } + + if (attr->non_res) { + u16 roff = le16_to_cpu(attr->nres.run_off); + CLST svcn = le64_to_cpu(attr->nres.svcn); + + if (roff > t32) { + kfree(oa->attr); + oa->attr = NULL; + goto fake_attr; + } + + err = run_unpack(&oa->run0, sbi, inode->i_ino, svcn, + le64_to_cpu(attr->nres.evcn), svcn, + Add2Ptr(attr, roff), t32 - roff); + if (err < 0) { + kfree(oa->attr); + oa->attr = NULL; + goto fake_attr; + } + err = 0; + } + oa->run1 = &oa->run0; + attr = oa->attr; + +final_oe: + if (oe->is_attr_name == 1) + kfree(oe->ptr); + oe->is_attr_name = 0; + oe->ptr = oa; + oe->name_len = attr->name_len; + + goto next_open_attribute; + + /* + * Now loop through the dirty page table to extract all of the Vcn/Lcn. + * Mapping that we have, and insert it into the appropriate run. + */ +next_dirty_page: + dp = enum_rstbl(dptbl, dp); + if (!dp) + goto do_redo_1; + + oe = Add2Ptr(oatbl, le32_to_cpu(dp->target_attr)); + + if (oe->next != RESTART_ENTRY_ALLOCATED_LE) + goto next_dirty_page; + + oa = oe->ptr; + if (!oa) + goto next_dirty_page; + + i = -1; +next_dirty_page_vcn: + i += 1; + if (i >= le32_to_cpu(dp->lcns_follow)) + goto next_dirty_page; + + vcn = le64_to_cpu(dp->vcn) + i; + size = (vcn + 1) << sbi->cluster_bits; + + if (!dp->page_lcns[i]) + goto next_dirty_page_vcn; + + rno = ino_get(&oe->ref); + if (rno <= MFT_REC_MIRR && + size < (MFT_REC_VOL + 1) * sbi->record_size && + oe->type == ATTR_DATA) { + goto next_dirty_page_vcn; + } + + lcn = le64_to_cpu(dp->page_lcns[i]); + + if ((!run_lookup_entry(oa->run1, vcn, &lcn0, &len0, NULL) || + lcn0 != lcn) && + !run_add_entry(oa->run1, vcn, lcn, 1, false)) { + err = -ENOMEM; + goto out; + } + attr = oa->attr; + t64 = le64_to_cpu(attr->nres.alloc_size); + if (size > t64) { + attr->nres.valid_size = attr->nres.data_size = + attr->nres.alloc_size = cpu_to_le64(size); + } + goto next_dirty_page_vcn; + +do_redo_1: + /* + * Perform the Redo Pass, to restore all of the dirty pages to the same + * contents that they had immediately before the crash. If the dirty + * page table is empty, then we can skip the entire Redo Pass. + */ + if (!dptbl || !dptbl->total) + goto do_undo_action; + + rec_lsn = rlsn; + + /* + * Read the record at the Redo lsn, before falling + * into common code to handle each record. + */ + err = read_log_rec_lcb(log, rlsn, lcb_ctx_next, &lcb); + if (err) + goto out; + + /* + * Now loop to read all of our log records forwards, until + * we hit the end of the file, cleaning up at the end. + */ +do_action_next: + frh = lcb->lrh; + + if (LfsClientRecord != frh->record_type) + goto read_next_log_do_action; + + transact_id = le32_to_cpu(frh->transact_id); + rec_len = le32_to_cpu(frh->client_data_len); + lrh = lcb->log_rec; + + if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + /* Ignore log records that do not update pages. */ + if (lrh->lcns_follow) + goto find_dirty_page; + + goto read_next_log_do_action; + +find_dirty_page: + t16 = le16_to_cpu(lrh->target_attr); + t64 = le64_to_cpu(lrh->target_vcn); + dp = find_dp(dptbl, t16, t64); + + if (!dp) + goto read_next_log_do_action; + + if (rec_lsn < le64_to_cpu(dp->oldest_lsn)) + goto read_next_log_do_action; + + t16 = le16_to_cpu(lrh->target_attr); + if (t16 >= bytes_per_rt(oatbl)) { + err = -EINVAL; + goto out; + } + + oe = Add2Ptr(oatbl, t16); + + if (oe->next != RESTART_ENTRY_ALLOCATED_LE) { + err = -EINVAL; + goto out; + } + + oa = oe->ptr; + + if (!oa) { + err = -EINVAL; + goto out; + } + attr = oa->attr; + + vcn = le64_to_cpu(lrh->target_vcn); + + if (!run_lookup_entry(oa->run1, vcn, &lcn, NULL, NULL) || + lcn == SPARSE_LCN) { + goto read_next_log_do_action; + } + + /* Point to the Redo data and get its length. */ + data = Add2Ptr(lrh, le16_to_cpu(lrh->redo_off)); + dlen = le16_to_cpu(lrh->redo_len); + + /* Shorten length by any Lcns which were deleted. */ + saved_len = dlen; + + for (i = le16_to_cpu(lrh->lcns_follow); i; i--) { + size_t j; + u32 alen, voff; + + voff = le16_to_cpu(lrh->record_off) + + le16_to_cpu(lrh->attr_off); + voff += le16_to_cpu(lrh->cluster_off) << SECTOR_SHIFT; + + /* If the Vcn question is allocated, we can just get out. */ + j = le64_to_cpu(lrh->target_vcn) - le64_to_cpu(dp->vcn); + if (dp->page_lcns[j + i - 1]) + break; + + if (!saved_len) + saved_len = 1; + + /* + * Calculate the allocated space left relative to the + * log record Vcn, after removing this unallocated Vcn. + */ + alen = (i - 1) << sbi->cluster_bits; + + /* + * If the update described this log record goes beyond + * the allocated space, then we will have to reduce the length. + */ + if (voff >= alen) + dlen = 0; + else if (voff + dlen > alen) + dlen = alen - voff; + } + + /* + * If the resulting dlen from above is now zero, + * we can skip this log record. + */ + if (!dlen && saved_len) + goto read_next_log_do_action; + + t16 = le16_to_cpu(lrh->redo_op); + if (can_skip_action(t16)) + goto read_next_log_do_action; + + /* Apply the Redo operation a common routine. */ + err = do_action(log, oe, lrh, t16, data, dlen, rec_len, &rec_lsn); + if (err) + goto out; + + /* Keep reading and looping back until end of file. */ +read_next_log_do_action: + err = read_next_log_rec(log, lcb, &rec_lsn); + if (!err && rec_lsn) + goto do_action_next; + + lcb_put(lcb); + lcb = NULL; + +do_undo_action: + /* Scan Transaction Table. */ + tr = NULL; +transaction_table_next: + tr = enum_rstbl(trtbl, tr); + if (!tr) + goto undo_action_done; + + if (TransactionActive != tr->transact_state || !tr->undo_next_lsn) { + free_rsttbl_idx(trtbl, PtrOffset(trtbl, tr)); + goto transaction_table_next; + } + + log->transaction_id = PtrOffset(trtbl, tr); + undo_next_lsn = le64_to_cpu(tr->undo_next_lsn); + + /* + * We only have to do anything if the transaction has + * something its undo_next_lsn field. + */ + if (!undo_next_lsn) + goto commit_undo; + + /* Read the first record to be undone by this transaction. */ + err = read_log_rec_lcb(log, undo_next_lsn, lcb_ctx_undo_next, &lcb); + if (err) + goto out; + + /* + * Now loop to read all of our log records forwards, + * until we hit the end of the file, cleaning up at the end. + */ +undo_action_next: + + lrh = lcb->log_rec; + frh = lcb->lrh; + transact_id = le32_to_cpu(frh->transact_id); + rec_len = le32_to_cpu(frh->client_data_len); + + if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + if (lrh->undo_op == cpu_to_le16(Noop)) + goto read_next_log_undo_action; + + oe = Add2Ptr(oatbl, le16_to_cpu(lrh->target_attr)); + oa = oe->ptr; + + t16 = le16_to_cpu(lrh->lcns_follow); + if (!t16) + goto add_allocated_vcns; + + is_mapped = run_lookup_entry(oa->run1, le64_to_cpu(lrh->target_vcn), + &lcn, &clen, NULL); + + /* + * If the mapping isn't already the table or the mapping + * corresponds to a hole the mapping, we need to make sure + * there is no partial page already memory. + */ + if (is_mapped && lcn != SPARSE_LCN && clen >= t16) + goto add_allocated_vcns; + + vcn = le64_to_cpu(lrh->target_vcn); + vcn &= ~(u64)(log->clst_per_page - 1); + +add_allocated_vcns: + for (i = 0, vcn = le64_to_cpu(lrh->target_vcn), + size = (vcn + 1) << sbi->cluster_bits; + i < t16; i++, vcn += 1, size += sbi->cluster_size) { + attr = oa->attr; + if (!attr->non_res) { + if (size > le32_to_cpu(attr->res.data_size)) + attr->res.data_size = cpu_to_le32(size); + } else { + if (size > le64_to_cpu(attr->nres.data_size)) + attr->nres.valid_size = attr->nres.data_size = + attr->nres.alloc_size = + cpu_to_le64(size); + } + } + + t16 = le16_to_cpu(lrh->undo_op); + if (can_skip_action(t16)) + goto read_next_log_undo_action; + + /* Point to the Redo data and get its length. */ + data = Add2Ptr(lrh, le16_to_cpu(lrh->undo_off)); + dlen = le16_to_cpu(lrh->undo_len); + + /* It is time to apply the undo action. */ + err = do_action(log, oe, lrh, t16, data, dlen, rec_len, NULL); + +read_next_log_undo_action: + /* + * Keep reading and looping back until we have read the + * last record for this transaction. + */ + err = read_next_log_rec(log, lcb, &rec_lsn); + if (err) + goto out; + + if (rec_lsn) + goto undo_action_next; + + lcb_put(lcb); + lcb = NULL; + +commit_undo: + free_rsttbl_idx(trtbl, log->transaction_id); + + log->transaction_id = 0; + + goto transaction_table_next; + +undo_action_done: + + ntfs_update_mftmirr(sbi, 0); + + sbi->flags &= ~NTFS_FLAGS_NEED_REPLAY; + +end_reply: + + err = 0; + if (is_ro) + goto out; + + rh = kzalloc(log->page_size, GFP_NOFS); + if (!rh) { + err = -ENOMEM; + goto out; + } + + rh->rhdr.sign = NTFS_RSTR_SIGNATURE; + rh->rhdr.fix_off = cpu_to_le16(offsetof(struct RESTART_HDR, fixups)); + t16 = (log->page_size >> SECTOR_SHIFT) + 1; + rh->rhdr.fix_num = cpu_to_le16(t16); + rh->sys_page_size = cpu_to_le32(log->page_size); + rh->page_size = cpu_to_le32(log->page_size); + + t16 = ALIGN(offsetof(struct RESTART_HDR, fixups) + sizeof(short) * t16, + 8); + rh->ra_off = cpu_to_le16(t16); + rh->minor_ver = cpu_to_le16(1); // 0x1A: + rh->major_ver = cpu_to_le16(1); // 0x1C: + + ra2 = Add2Ptr(rh, t16); + memcpy(ra2, ra, sizeof(struct RESTART_AREA)); + + ra2->client_idx[0] = 0; + ra2->client_idx[1] = LFS_NO_CLIENT_LE; + ra2->flags = cpu_to_le16(2); + + le32_add_cpu(&ra2->open_log_count, 1); + + ntfs_fix_pre_write(&rh->rhdr, log->page_size); + + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size, 0); + if (!err) + err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size, + rh, log->page_size, 0); + + kfree(rh); + if (err) + goto out; + +out: + kfree(rst); + if (lcb) + lcb_put(lcb); + + /* + * Scan the Open Attribute Table to close all of + * the open attributes. + */ + oe = NULL; + while ((oe = enum_rstbl(oatbl, oe))) { + rno = ino_get(&oe->ref); + + if (oe->is_attr_name == 1) { + kfree(oe->ptr); + oe->ptr = NULL; + continue; + } + + if (oe->is_attr_name) + continue; + + oa = oe->ptr; + if (!oa) + continue; + + run_close(&oa->run0); + kfree(oa->attr); + if (oa->ni) + iput(&oa->ni->vfs_inode); + kfree(oa); + } + + kfree(trtbl); + kfree(oatbl); + kfree(dptbl); + kfree(attr_names); + kfree(rst_info.r_page); + + kfree(ra); + kfree(log->one_page_buf); + + if (err) + sbi->flags |= NTFS_FLAGS_NEED_REPLAY; + + if (err == -EROFS) + err = 0; + else if (log->set_dirty) + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + + kfree(log); + + return err; +} diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c new file mode 100644 index 000000000..873b1434a --- /dev/null +++ b/fs/ntfs3/fsntfs.c @@ -0,0 +1,2503 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/kernel.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +// clang-format off +const struct cpu_str NAME_MFT = { + 4, 0, { '$', 'M', 'F', 'T' }, +}; +const struct cpu_str NAME_MIRROR = { + 8, 0, { '$', 'M', 'F', 'T', 'M', 'i', 'r', 'r' }, +}; +const struct cpu_str NAME_LOGFILE = { + 8, 0, { '$', 'L', 'o', 'g', 'F', 'i', 'l', 'e' }, +}; +const struct cpu_str NAME_VOLUME = { + 7, 0, { '$', 'V', 'o', 'l', 'u', 'm', 'e' }, +}; +const struct cpu_str NAME_ATTRDEF = { + 8, 0, { '$', 'A', 't', 't', 'r', 'D', 'e', 'f' }, +}; +const struct cpu_str NAME_ROOT = { + 1, 0, { '.' }, +}; +const struct cpu_str NAME_BITMAP = { + 7, 0, { '$', 'B', 'i', 't', 'm', 'a', 'p' }, +}; +const struct cpu_str NAME_BOOT = { + 5, 0, { '$', 'B', 'o', 'o', 't' }, +}; +const struct cpu_str NAME_BADCLUS = { + 8, 0, { '$', 'B', 'a', 'd', 'C', 'l', 'u', 's' }, +}; +const struct cpu_str NAME_QUOTA = { + 6, 0, { '$', 'Q', 'u', 'o', 't', 'a' }, +}; +const struct cpu_str NAME_SECURE = { + 7, 0, { '$', 'S', 'e', 'c', 'u', 'r', 'e' }, +}; +const struct cpu_str NAME_UPCASE = { + 7, 0, { '$', 'U', 'p', 'C', 'a', 's', 'e' }, +}; +const struct cpu_str NAME_EXTEND = { + 7, 0, { '$', 'E', 'x', 't', 'e', 'n', 'd' }, +}; +const struct cpu_str NAME_OBJID = { + 6, 0, { '$', 'O', 'b', 'j', 'I', 'd' }, +}; +const struct cpu_str NAME_REPARSE = { + 8, 0, { '$', 'R', 'e', 'p', 'a', 'r', 's', 'e' }, +}; +const struct cpu_str NAME_USNJRNL = { + 8, 0, { '$', 'U', 's', 'n', 'J', 'r', 'n', 'l' }, +}; +const __le16 BAD_NAME[4] = { + cpu_to_le16('$'), cpu_to_le16('B'), cpu_to_le16('a'), cpu_to_le16('d'), +}; +const __le16 I30_NAME[4] = { + cpu_to_le16('$'), cpu_to_le16('I'), cpu_to_le16('3'), cpu_to_le16('0'), +}; +const __le16 SII_NAME[4] = { + cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('I'), cpu_to_le16('I'), +}; +const __le16 SDH_NAME[4] = { + cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('D'), cpu_to_le16('H'), +}; +const __le16 SDS_NAME[4] = { + cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('D'), cpu_to_le16('S'), +}; +const __le16 SO_NAME[2] = { + cpu_to_le16('$'), cpu_to_le16('O'), +}; +const __le16 SQ_NAME[2] = { + cpu_to_le16('$'), cpu_to_le16('Q'), +}; +const __le16 SR_NAME[2] = { + cpu_to_le16('$'), cpu_to_le16('R'), +}; + +#ifdef CONFIG_NTFS3_LZX_XPRESS +const __le16 WOF_NAME[17] = { + cpu_to_le16('W'), cpu_to_le16('o'), cpu_to_le16('f'), cpu_to_le16('C'), + cpu_to_le16('o'), cpu_to_le16('m'), cpu_to_le16('p'), cpu_to_le16('r'), + cpu_to_le16('e'), cpu_to_le16('s'), cpu_to_le16('s'), cpu_to_le16('e'), + cpu_to_le16('d'), cpu_to_le16('D'), cpu_to_le16('a'), cpu_to_le16('t'), + cpu_to_le16('a'), +}; +#endif + +// clang-format on + +/* + * ntfs_fix_pre_write - Insert fixups into @rhdr before writing to disk. + */ +bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes) +{ + u16 *fixup, *ptr; + u16 sample; + u16 fo = le16_to_cpu(rhdr->fix_off); + u16 fn = le16_to_cpu(rhdr->fix_num); + + if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || + fn * SECTOR_SIZE > bytes) { + return false; + } + + /* Get fixup pointer. */ + fixup = Add2Ptr(rhdr, fo); + + if (*fixup >= 0x7FFF) + *fixup = 1; + else + *fixup += 1; + + sample = *fixup; + + ptr = Add2Ptr(rhdr, SECTOR_SIZE - sizeof(short)); + + while (fn--) { + *++fixup = *ptr; + *ptr = sample; + ptr += SECTOR_SIZE / sizeof(short); + } + return true; +} + +/* + * ntfs_fix_post_read - Remove fixups after reading from disk. + * + * Return: < 0 if error, 0 if ok, 1 if need to update fixups. + */ +int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, + bool simple) +{ + int ret; + u16 *fixup, *ptr; + u16 sample, fo, fn; + + fo = le16_to_cpu(rhdr->fix_off); + fn = simple ? ((bytes >> SECTOR_SHIFT) + 1) + : le16_to_cpu(rhdr->fix_num); + + /* Check errors. */ + if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || + fn * SECTOR_SIZE > bytes) { + return -E_NTFS_CORRUPT; + } + + /* Get fixup pointer. */ + fixup = Add2Ptr(rhdr, fo); + sample = *fixup; + ptr = Add2Ptr(rhdr, SECTOR_SIZE - sizeof(short)); + ret = 0; + + while (fn--) { + /* Test current word. */ + if (*ptr != sample) { + /* Fixup does not match! Is it serious error? */ + ret = -E_NTFS_FIXUP; + } + + /* Replace fixup. */ + *ptr = *++fixup; + ptr += SECTOR_SIZE / sizeof(short); + } + + return ret; +} + +/* + * ntfs_extend_init - Load $Extend file. + */ +int ntfs_extend_init(struct ntfs_sb_info *sbi) +{ + int err; + struct super_block *sb = sbi->sb; + struct inode *inode, *inode2; + struct MFT_REF ref; + + if (sbi->volume.major_ver < 3) { + ntfs_notice(sb, "Skip $Extend 'cause NTFS version"); + return 0; + } + + ref.low = cpu_to_le32(MFT_REC_EXTEND); + ref.high = 0; + ref.seq = cpu_to_le16(MFT_REC_EXTEND); + inode = ntfs_iget5(sb, &ref, &NAME_EXTEND); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $Extend."); + inode = NULL; + goto out; + } + + /* If ntfs_iget5() reads from disk it never returns bad inode. */ + if (!S_ISDIR(inode->i_mode)) { + err = -EINVAL; + goto out; + } + + /* Try to find $ObjId */ + inode2 = dir_search_u(inode, &NAME_OBJID, NULL); + if (inode2 && !IS_ERR(inode2)) { + if (is_bad_inode(inode2)) { + iput(inode2); + } else { + sbi->objid.ni = ntfs_i(inode2); + sbi->objid_no = inode2->i_ino; + } + } + + /* Try to find $Quota */ + inode2 = dir_search_u(inode, &NAME_QUOTA, NULL); + if (inode2 && !IS_ERR(inode2)) { + sbi->quota_no = inode2->i_ino; + iput(inode2); + } + + /* Try to find $Reparse */ + inode2 = dir_search_u(inode, &NAME_REPARSE, NULL); + if (inode2 && !IS_ERR(inode2)) { + sbi->reparse.ni = ntfs_i(inode2); + sbi->reparse_no = inode2->i_ino; + } + + /* Try to find $UsnJrnl */ + inode2 = dir_search_u(inode, &NAME_USNJRNL, NULL); + if (inode2 && !IS_ERR(inode2)) { + sbi->usn_jrnl_no = inode2->i_ino; + iput(inode2); + } + + err = 0; +out: + iput(inode); + return err; +} + +int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi) +{ + int err = 0; + struct super_block *sb = sbi->sb; + bool initialized = false; + struct MFT_REF ref; + struct inode *inode; + + /* Check for 4GB. */ + if (ni->vfs_inode.i_size >= 0x100000000ull) { + ntfs_err(sb, "\x24LogFile is too big"); + err = -EINVAL; + goto out; + } + + sbi->flags |= NTFS_FLAGS_LOG_REPLAYING; + + ref.low = cpu_to_le32(MFT_REC_MFT); + ref.high = 0; + ref.seq = cpu_to_le16(1); + + inode = ntfs_iget5(sb, &ref, NULL); + + if (IS_ERR(inode)) + inode = NULL; + + if (!inode) { + /* Try to use MFT copy. */ + u64 t64 = sbi->mft.lbo; + + sbi->mft.lbo = sbi->mft.lbo2; + inode = ntfs_iget5(sb, &ref, NULL); + sbi->mft.lbo = t64; + if (IS_ERR(inode)) + inode = NULL; + } + + if (!inode) { + err = -EINVAL; + ntfs_err(sb, "Failed to load $MFT."); + goto out; + } + + sbi->mft.ni = ntfs_i(inode); + + /* LogFile should not contains attribute list. */ + err = ni_load_all_mi(sbi->mft.ni); + if (!err) + err = log_replay(ni, &initialized); + + iput(inode); + sbi->mft.ni = NULL; + + sync_blockdev(sb->s_bdev); + invalidate_bdev(sb->s_bdev); + + if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) { + err = 0; + goto out; + } + + if (sb_rdonly(sb) || !initialized) + goto out; + + /* Fill LogFile by '-1' if it is initialized. */ + err = ntfs_bio_fill_1(sbi, &ni->file.run); + +out: + sbi->flags &= ~NTFS_FLAGS_LOG_REPLAYING; + + return err; +} + +/* + * ntfs_query_def + * + * Return: Current ATTR_DEF_ENTRY for given attribute type. + */ +const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi, + enum ATTR_TYPE type) +{ + int type_in = le32_to_cpu(type); + size_t min_idx = 0; + size_t max_idx = sbi->def_entries - 1; + + while (min_idx <= max_idx) { + size_t i = min_idx + ((max_idx - min_idx) >> 1); + const struct ATTR_DEF_ENTRY *entry = sbi->def_table + i; + int diff = le32_to_cpu(entry->type) - type_in; + + if (!diff) + return entry; + if (diff < 0) + min_idx = i + 1; + else if (i) + max_idx = i - 1; + else + return NULL; + } + return NULL; +} + +/* + * ntfs_look_for_free_space - Look for a free space in bitmap. + */ +int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, + CLST *new_lcn, CLST *new_len, + enum ALLOCATE_OPT opt) +{ + int err; + CLST alen; + struct super_block *sb = sbi->sb; + size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen; + struct wnd_bitmap *wnd = &sbi->used.bitmap; + + down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); + if (opt & ALLOCATE_MFT) { + zlen = wnd_zone_len(wnd); + + if (!zlen) { + err = ntfs_refresh_zone(sbi); + if (err) + goto up_write; + + zlen = wnd_zone_len(wnd); + } + + if (!zlen) { + ntfs_err(sbi->sb, "no free space to extend mft"); + err = -ENOSPC; + goto up_write; + } + + lcn = wnd_zone_bit(wnd); + alen = min_t(CLST, len, zlen); + + wnd_zone_set(wnd, lcn + alen, zlen - alen); + + err = wnd_set_used(wnd, lcn, alen); + if (err) + goto up_write; + + alcn = lcn; + goto space_found; + } + /* + * 'Cause cluster 0 is always used this value means that we should use + * cached value of 'next_free_lcn' to improve performance. + */ + if (!lcn) + lcn = sbi->used.next_free_lcn; + + if (lcn >= wnd->nbits) + lcn = 0; + + alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn); + if (alen) + goto space_found; + + /* Try to use clusters from MftZone. */ + zlen = wnd_zone_len(wnd); + zeroes = wnd_zeroes(wnd); + + /* Check too big request */ + if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) { + err = -ENOSPC; + goto up_write; + } + + /* How many clusters to cat from zone. */ + zlcn = wnd_zone_bit(wnd); + zlen2 = zlen >> 1; + ztrim = clamp_val(len, zlen2, zlen); + new_zlen = max_t(size_t, zlen - ztrim, NTFS_MIN_MFT_ZONE); + + wnd_zone_set(wnd, zlcn, new_zlen); + + /* Allocate continues clusters. */ + alen = wnd_find(wnd, len, 0, + BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn); + if (!alen) { + err = -ENOSPC; + goto up_write; + } + +space_found: + err = 0; + *new_len = alen; + *new_lcn = alcn; + + ntfs_unmap_meta(sb, alcn, alen); + + /* Set hint for next requests. */ + if (!(opt & ALLOCATE_MFT)) + sbi->used.next_free_lcn = alcn + alen; +up_write: + up_write(&wnd->rw_lock); + return err; +} + +/* + * ntfs_extend_mft - Allocate additional MFT records. + * + * sbi->mft.bitmap is locked for write. + * + * NOTE: recursive: + * ntfs_look_free_mft -> + * ntfs_extend_mft -> + * attr_set_size -> + * ni_insert_nonresident -> + * ni_insert_attr -> + * ni_ins_attr_ext -> + * ntfs_look_free_mft -> + * ntfs_extend_mft + * + * To avoid recursive always allocate space for two new MFT records + * see attrib.c: "at least two MFT to avoid recursive loop". + */ +static int ntfs_extend_mft(struct ntfs_sb_info *sbi) +{ + int err; + struct ntfs_inode *ni = sbi->mft.ni; + size_t new_mft_total; + u64 new_mft_bytes, new_bitmap_bytes; + struct ATTRIB *attr; + struct wnd_bitmap *wnd = &sbi->mft.bitmap; + + new_mft_total = (wnd->nbits + MFT_INCREASE_CHUNK + 127) & (CLST)~127; + new_mft_bytes = (u64)new_mft_total << sbi->record_bits; + + /* Step 1: Resize $MFT::DATA. */ + down_write(&ni->file.run_lock); + err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, + new_mft_bytes, NULL, false, &attr); + + if (err) { + up_write(&ni->file.run_lock); + goto out; + } + + attr->nres.valid_size = attr->nres.data_size; + new_mft_total = le64_to_cpu(attr->nres.alloc_size) >> sbi->record_bits; + ni->mi.dirty = true; + + /* Step 2: Resize $MFT::BITMAP. */ + new_bitmap_bytes = bitmap_size(new_mft_total); + + err = attr_set_size(ni, ATTR_BITMAP, NULL, 0, &sbi->mft.bitmap.run, + new_bitmap_bytes, &new_bitmap_bytes, true, NULL); + + /* Refresh MFT Zone if necessary. */ + down_write_nested(&sbi->used.bitmap.rw_lock, BITMAP_MUTEX_CLUSTERS); + + ntfs_refresh_zone(sbi); + + up_write(&sbi->used.bitmap.rw_lock); + up_write(&ni->file.run_lock); + + if (err) + goto out; + + err = wnd_extend(wnd, new_mft_total); + + if (err) + goto out; + + ntfs_clear_mft_tail(sbi, sbi->mft.used, new_mft_total); + + err = _ni_write_inode(&ni->vfs_inode, 0); +out: + return err; +} + +/* + * ntfs_look_free_mft - Look for a free MFT record. + */ +int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, + struct ntfs_inode *ni, struct mft_inode **mi) +{ + int err = 0; + size_t zbit, zlen, from, to, fr; + size_t mft_total; + struct MFT_REF ref; + struct super_block *sb = sbi->sb; + struct wnd_bitmap *wnd = &sbi->mft.bitmap; + u32 ir; + + static_assert(sizeof(sbi->mft.reserved_bitmap) * 8 >= + MFT_REC_FREE - MFT_REC_RESERVED); + + if (!mft) + down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT); + + zlen = wnd_zone_len(wnd); + + /* Always reserve space for MFT. */ + if (zlen) { + if (mft) { + zbit = wnd_zone_bit(wnd); + *rno = zbit; + wnd_zone_set(wnd, zbit + 1, zlen - 1); + } + goto found; + } + + /* No MFT zone. Find the nearest to '0' free MFT. */ + if (!wnd_find(wnd, 1, MFT_REC_FREE, 0, &zbit)) { + /* Resize MFT */ + mft_total = wnd->nbits; + + err = ntfs_extend_mft(sbi); + if (!err) { + zbit = mft_total; + goto reserve_mft; + } + + if (!mft || MFT_REC_FREE == sbi->mft.next_reserved) + goto out; + + err = 0; + + /* + * Look for free record reserved area [11-16) == + * [MFT_REC_RESERVED, MFT_REC_FREE ) MFT bitmap always + * marks it as used. + */ + if (!sbi->mft.reserved_bitmap) { + /* Once per session create internal bitmap for 5 bits. */ + sbi->mft.reserved_bitmap = 0xFF; + + ref.high = 0; + for (ir = MFT_REC_RESERVED; ir < MFT_REC_FREE; ir++) { + struct inode *i; + struct ntfs_inode *ni; + struct MFT_REC *mrec; + + ref.low = cpu_to_le32(ir); + ref.seq = cpu_to_le16(ir); + + i = ntfs_iget5(sb, &ref, NULL); + if (IS_ERR(i)) { +next: + ntfs_notice( + sb, + "Invalid reserved record %x", + ref.low); + continue; + } + if (is_bad_inode(i)) { + iput(i); + goto next; + } + + ni = ntfs_i(i); + + mrec = ni->mi.mrec; + + if (!is_rec_base(mrec)) + goto next; + + if (mrec->hard_links) + goto next; + + if (!ni_std(ni)) + goto next; + + if (ni_find_attr(ni, NULL, NULL, ATTR_NAME, + NULL, 0, NULL, NULL)) + goto next; + + __clear_bit(ir - MFT_REC_RESERVED, + &sbi->mft.reserved_bitmap); + } + } + + /* Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED */ + zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap, + MFT_REC_FREE, MFT_REC_RESERVED); + if (zbit >= MFT_REC_FREE) { + sbi->mft.next_reserved = MFT_REC_FREE; + goto out; + } + + zlen = 1; + sbi->mft.next_reserved = zbit; + } else { +reserve_mft: + zlen = zbit == MFT_REC_FREE ? (MFT_REC_USER - MFT_REC_FREE) : 4; + if (zbit + zlen > wnd->nbits) + zlen = wnd->nbits - zbit; + + while (zlen > 1 && !wnd_is_free(wnd, zbit, zlen)) + zlen -= 1; + + /* [zbit, zbit + zlen) will be used for MFT itself. */ + from = sbi->mft.used; + if (from < zbit) + from = zbit; + to = zbit + zlen; + if (from < to) { + ntfs_clear_mft_tail(sbi, from, to); + sbi->mft.used = to; + } + } + + if (mft) { + *rno = zbit; + zbit += 1; + zlen -= 1; + } + + wnd_zone_set(wnd, zbit, zlen); + +found: + if (!mft) { + /* The request to get record for general purpose. */ + if (sbi->mft.next_free < MFT_REC_USER) + sbi->mft.next_free = MFT_REC_USER; + + for (;;) { + if (sbi->mft.next_free >= sbi->mft.bitmap.nbits) { + } else if (!wnd_find(wnd, 1, MFT_REC_USER, 0, &fr)) { + sbi->mft.next_free = sbi->mft.bitmap.nbits; + } else { + *rno = fr; + sbi->mft.next_free = *rno + 1; + break; + } + + err = ntfs_extend_mft(sbi); + if (err) + goto out; + } + } + + if (ni && !ni_add_subrecord(ni, *rno, mi)) { + err = -ENOMEM; + goto out; + } + + /* We have found a record that are not reserved for next MFT. */ + if (*rno >= MFT_REC_FREE) + wnd_set_used(wnd, *rno, 1); + else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) + __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); + +out: + if (!mft) + up_write(&wnd->rw_lock); + + return err; +} + +/* + * ntfs_mark_rec_free - Mark record as free. + * is_mft - true if we are changing MFT + */ +void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft) +{ + struct wnd_bitmap *wnd = &sbi->mft.bitmap; + + if (!is_mft) + down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT); + if (rno >= wnd->nbits) + goto out; + + if (rno >= MFT_REC_FREE) { + if (!wnd_is_used(wnd, rno, 1)) + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + else + wnd_set_free(wnd, rno, 1); + } else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) { + __clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); + } + + if (rno < wnd_zone_bit(wnd)) + wnd_zone_set(wnd, rno, 1); + else if (rno < sbi->mft.next_free && rno >= MFT_REC_USER) + sbi->mft.next_free = rno; + +out: + if (!is_mft) + up_write(&wnd->rw_lock); +} + +/* + * ntfs_clear_mft_tail - Format empty records [from, to). + * + * sbi->mft.bitmap is locked for write. + */ +int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to) +{ + int err; + u32 rs; + u64 vbo; + struct runs_tree *run; + struct ntfs_inode *ni; + + if (from >= to) + return 0; + + rs = sbi->record_size; + ni = sbi->mft.ni; + run = &ni->file.run; + + down_read(&ni->file.run_lock); + vbo = (u64)from * rs; + for (; from < to; from++, vbo += rs) { + struct ntfs_buffers nb; + + err = ntfs_get_bh(sbi, run, vbo, rs, &nb); + if (err) + goto out; + + err = ntfs_write_bh(sbi, &sbi->new_rec->rhdr, &nb, 0); + nb_put(&nb); + if (err) + goto out; + } + +out: + sbi->mft.used = from; + up_read(&ni->file.run_lock); + return err; +} + +/* + * ntfs_refresh_zone - Refresh MFT zone. + * + * sbi->used.bitmap is locked for rw. + * sbi->mft.bitmap is locked for write. + * sbi->mft.ni->file.run_lock for write. + */ +int ntfs_refresh_zone(struct ntfs_sb_info *sbi) +{ + CLST lcn, vcn, len; + size_t lcn_s, zlen; + struct wnd_bitmap *wnd = &sbi->used.bitmap; + struct ntfs_inode *ni = sbi->mft.ni; + + /* Do not change anything unless we have non empty MFT zone. */ + if (wnd_zone_len(wnd)) + return 0; + + vcn = bytes_to_cluster(sbi, + (u64)sbi->mft.bitmap.nbits << sbi->record_bits); + + if (!run_lookup_entry(&ni->file.run, vcn - 1, &lcn, &len, NULL)) + lcn = SPARSE_LCN; + + /* We should always find Last Lcn for MFT. */ + if (lcn == SPARSE_LCN) + return -EINVAL; + + lcn_s = lcn + 1; + + /* Try to allocate clusters after last MFT run. */ + zlen = wnd_find(wnd, sbi->zone_max, lcn_s, 0, &lcn_s); + wnd_zone_set(wnd, lcn_s, zlen); + + return 0; +} + +/* + * ntfs_update_mftmirr - Update $MFTMirr data. + */ +void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) +{ + int err; + struct super_block *sb = sbi->sb; + u32 blocksize; + sector_t block1, block2; + u32 bytes; + + if (!sb) + return; + + blocksize = sb->s_blocksize; + + if (!(sbi->flags & NTFS_FLAGS_MFTMIRR)) + return; + + err = 0; + bytes = sbi->mft.recs_mirr << sbi->record_bits; + block1 = sbi->mft.lbo >> sb->s_blocksize_bits; + block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits; + + for (; bytes >= blocksize; bytes -= blocksize) { + struct buffer_head *bh1, *bh2; + + bh1 = sb_bread(sb, block1++); + if (!bh1) + return; + + bh2 = sb_getblk(sb, block2++); + if (!bh2) { + put_bh(bh1); + return; + } + + if (buffer_locked(bh2)) + __wait_on_buffer(bh2); + + lock_buffer(bh2); + memcpy(bh2->b_data, bh1->b_data, blocksize); + set_buffer_uptodate(bh2); + mark_buffer_dirty(bh2); + unlock_buffer(bh2); + + put_bh(bh1); + bh1 = NULL; + + if (wait) + err = sync_dirty_buffer(bh2); + + put_bh(bh2); + if (err) + return; + } + + sbi->flags &= ~NTFS_FLAGS_MFTMIRR; +} + +/* + * ntfs_bad_inode + * + * Marks inode as bad and marks fs as 'dirty' + */ +void ntfs_bad_inode(struct inode *inode, const char *hint) +{ + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + + ntfs_inode_err(inode, "%s", hint); + make_bad_inode(inode); + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); +} + +/* + * ntfs_set_state + * + * Mount: ntfs_set_state(NTFS_DIRTY_DIRTY) + * Umount: ntfs_set_state(NTFS_DIRTY_CLEAR) + * NTFS error: ntfs_set_state(NTFS_DIRTY_ERROR) + */ +int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty) +{ + int err; + struct ATTRIB *attr; + struct VOLUME_INFO *info; + struct mft_inode *mi; + struct ntfs_inode *ni; + + /* + * Do not change state if fs was real_dirty. + * Do not change state if fs already dirty(clear). + * Do not change any thing if mounted read only. + */ + if (sbi->volume.real_dirty || sb_rdonly(sbi->sb)) + return 0; + + /* Check cached value. */ + if ((dirty == NTFS_DIRTY_CLEAR ? 0 : VOLUME_FLAG_DIRTY) == + (sbi->volume.flags & VOLUME_FLAG_DIRTY)) + return 0; + + ni = sbi->volume.ni; + if (!ni) + return -EINVAL; + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_DIRTY); + + attr = ni_find_attr(ni, NULL, NULL, ATTR_VOL_INFO, NULL, 0, NULL, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO); + if (!info) { + err = -EINVAL; + goto out; + } + + switch (dirty) { + case NTFS_DIRTY_ERROR: + ntfs_notice(sbi->sb, "Mark volume as dirty due to NTFS errors"); + sbi->volume.real_dirty = true; + fallthrough; + case NTFS_DIRTY_DIRTY: + info->flags |= VOLUME_FLAG_DIRTY; + break; + case NTFS_DIRTY_CLEAR: + info->flags &= ~VOLUME_FLAG_DIRTY; + break; + } + /* Cache current volume flags. */ + sbi->volume.flags = info->flags; + mi->dirty = true; + err = 0; + +out: + ni_unlock(ni); + if (err) + return err; + + mark_inode_dirty_sync(&ni->vfs_inode); + /* verify(!ntfs_update_mftmirr()); */ + + /* write mft record on disk. */ + err = _ni_write_inode(&ni->vfs_inode, 1); + + return err; +} + +/* + * security_hash - Calculates a hash of security descriptor. + */ +static inline __le32 security_hash(const void *sd, size_t bytes) +{ + u32 hash = 0; + const __le32 *ptr = sd; + + bytes >>= 2; + while (bytes--) + hash = ((hash >> 0x1D) | (hash << 3)) + le32_to_cpu(*ptr++); + return cpu_to_le32(hash); +} + +int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer) +{ + struct block_device *bdev = sb->s_bdev; + u32 blocksize = sb->s_blocksize; + u64 block = lbo >> sb->s_blocksize_bits; + u32 off = lbo & (blocksize - 1); + u32 op = blocksize - off; + + for (; bytes; block += 1, off = 0, op = blocksize) { + struct buffer_head *bh = __bread(bdev, block, blocksize); + + if (!bh) + return -EIO; + + if (op > bytes) + op = bytes; + + memcpy(buffer, bh->b_data + off, op); + + put_bh(bh); + + bytes -= op; + buffer = Add2Ptr(buffer, op); + } + + return 0; +} + +int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, + const void *buf, int wait) +{ + u32 blocksize = sb->s_blocksize; + struct block_device *bdev = sb->s_bdev; + sector_t block = lbo >> sb->s_blocksize_bits; + u32 off = lbo & (blocksize - 1); + u32 op = blocksize - off; + struct buffer_head *bh; + + if (!wait && (sb->s_flags & SB_SYNCHRONOUS)) + wait = 1; + + for (; bytes; block += 1, off = 0, op = blocksize) { + if (op > bytes) + op = bytes; + + if (op < blocksize) { + bh = __bread(bdev, block, blocksize); + if (!bh) { + ntfs_err(sb, "failed to read block %llx", + (u64)block); + return -EIO; + } + } else { + bh = __getblk(bdev, block, blocksize); + if (!bh) + return -ENOMEM; + } + + if (buffer_locked(bh)) + __wait_on_buffer(bh); + + lock_buffer(bh); + if (buf) { + memcpy(bh->b_data + off, buf, op); + buf = Add2Ptr(buf, op); + } else { + memset(bh->b_data + off, -1, op); + } + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + + if (wait) { + int err = sync_dirty_buffer(bh); + + if (err) { + ntfs_err( + sb, + "failed to sync buffer at block %llx, error %d", + (u64)block, err); + put_bh(bh); + return err; + } + } + + put_bh(bh); + + bytes -= op; + } + return 0; +} + +int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, const void *buf, size_t bytes, int sync) +{ + struct super_block *sb = sbi->sb; + u8 cluster_bits = sbi->cluster_bits; + u32 off = vbo & sbi->cluster_mask; + CLST lcn, clen, vcn = vbo >> cluster_bits, vcn_next; + u64 lbo, len; + size_t idx; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) + return -ENOENT; + + if (lcn == SPARSE_LCN) + return -EINVAL; + + lbo = ((u64)lcn << cluster_bits) + off; + len = ((u64)clen << cluster_bits) - off; + + for (;;) { + u32 op = min_t(u64, len, bytes); + int err = ntfs_sb_write(sb, lbo, op, buf, sync); + + if (err) + return err; + + bytes -= op; + if (!bytes) + break; + + vcn_next = vcn + clen; + if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || + vcn != vcn_next) + return -ENOENT; + + if (lcn == SPARSE_LCN) + return -EINVAL; + + if (buf) + buf = Add2Ptr(buf, op); + + lbo = ((u64)lcn << cluster_bits); + len = ((u64)clen << cluster_bits); + } + + return 0; +} + +struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, + const struct runs_tree *run, u64 vbo) +{ + struct super_block *sb = sbi->sb; + u8 cluster_bits = sbi->cluster_bits; + CLST lcn; + u64 lbo; + + if (!run_lookup_entry(run, vbo >> cluster_bits, &lcn, NULL, NULL)) + return ERR_PTR(-ENOENT); + + lbo = ((u64)lcn << cluster_bits) + (vbo & sbi->cluster_mask); + + return ntfs_bread(sb, lbo >> sb->s_blocksize_bits); +} + +int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb) +{ + int err; + struct super_block *sb = sbi->sb; + u32 blocksize = sb->s_blocksize; + u8 cluster_bits = sbi->cluster_bits; + u32 off = vbo & sbi->cluster_mask; + u32 nbh = 0; + CLST vcn_next, vcn = vbo >> cluster_bits; + CLST lcn, clen; + u64 lbo, len; + size_t idx; + struct buffer_head *bh; + + if (!run) { + /* First reading of $Volume + $MFTMirr + $LogFile goes here. */ + if (vbo > MFT_REC_VOL * sbi->record_size) { + err = -ENOENT; + goto out; + } + + /* Use absolute boot's 'MFTCluster' to read record. */ + lbo = vbo + sbi->mft.lbo; + len = sbi->record_size; + } else if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { + err = -ENOENT; + goto out; + } else { + if (lcn == SPARSE_LCN) { + err = -EINVAL; + goto out; + } + + lbo = ((u64)lcn << cluster_bits) + off; + len = ((u64)clen << cluster_bits) - off; + } + + off = lbo & (blocksize - 1); + if (nb) { + nb->off = off; + nb->bytes = bytes; + } + + for (;;) { + u32 len32 = len >= bytes ? bytes : len; + sector_t block = lbo >> sb->s_blocksize_bits; + + do { + u32 op = blocksize - off; + + if (op > len32) + op = len32; + + bh = ntfs_bread(sb, block); + if (!bh) { + err = -EIO; + goto out; + } + + if (buf) { + memcpy(buf, bh->b_data + off, op); + buf = Add2Ptr(buf, op); + } + + if (!nb) { + put_bh(bh); + } else if (nbh >= ARRAY_SIZE(nb->bh)) { + err = -EINVAL; + goto out; + } else { + nb->bh[nbh++] = bh; + nb->nbufs = nbh; + } + + bytes -= op; + if (!bytes) + return 0; + len32 -= op; + block += 1; + off = 0; + + } while (len32); + + vcn_next = vcn + clen; + if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || + vcn != vcn_next) { + err = -ENOENT; + goto out; + } + + if (lcn == SPARSE_LCN) { + err = -EINVAL; + goto out; + } + + lbo = ((u64)lcn << cluster_bits); + len = ((u64)clen << cluster_bits); + } + +out: + if (!nbh) + return err; + + while (nbh) { + put_bh(nb->bh[--nbh]); + nb->bh[nbh] = NULL; + } + + nb->nbufs = 0; + return err; +} + +/* + * ntfs_read_bh + * + * Return: < 0 if error, 0 if ok, -E_NTFS_FIXUP if need to update fixups. + */ +int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, + struct NTFS_RECORD_HEADER *rhdr, u32 bytes, + struct ntfs_buffers *nb) +{ + int err = ntfs_read_run_nb(sbi, run, vbo, rhdr, bytes, nb); + + if (err) + return err; + return ntfs_fix_post_read(rhdr, nb->bytes, true); +} + +int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, + u32 bytes, struct ntfs_buffers *nb) +{ + int err = 0; + struct super_block *sb = sbi->sb; + u32 blocksize = sb->s_blocksize; + u8 cluster_bits = sbi->cluster_bits; + CLST vcn_next, vcn = vbo >> cluster_bits; + u32 off; + u32 nbh = 0; + CLST lcn, clen; + u64 lbo, len; + size_t idx; + + nb->bytes = bytes; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { + err = -ENOENT; + goto out; + } + + off = vbo & sbi->cluster_mask; + lbo = ((u64)lcn << cluster_bits) + off; + len = ((u64)clen << cluster_bits) - off; + + nb->off = off = lbo & (blocksize - 1); + + for (;;) { + u32 len32 = min_t(u64, len, bytes); + sector_t block = lbo >> sb->s_blocksize_bits; + + do { + u32 op; + struct buffer_head *bh; + + if (nbh >= ARRAY_SIZE(nb->bh)) { + err = -EINVAL; + goto out; + } + + op = blocksize - off; + if (op > len32) + op = len32; + + if (op == blocksize) { + bh = sb_getblk(sb, block); + if (!bh) { + err = -ENOMEM; + goto out; + } + if (buffer_locked(bh)) + __wait_on_buffer(bh); + set_buffer_uptodate(bh); + } else { + bh = ntfs_bread(sb, block); + if (!bh) { + err = -EIO; + goto out; + } + } + + nb->bh[nbh++] = bh; + bytes -= op; + if (!bytes) { + nb->nbufs = nbh; + return 0; + } + + block += 1; + len32 -= op; + off = 0; + } while (len32); + + vcn_next = vcn + clen; + if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || + vcn != vcn_next) { + err = -ENOENT; + goto out; + } + + lbo = ((u64)lcn << cluster_bits); + len = ((u64)clen << cluster_bits); + } + +out: + while (nbh) { + put_bh(nb->bh[--nbh]); + nb->bh[nbh] = NULL; + } + + nb->nbufs = 0; + + return err; +} + +int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, + struct ntfs_buffers *nb, int sync) +{ + int err = 0; + struct super_block *sb = sbi->sb; + u32 block_size = sb->s_blocksize; + u32 bytes = nb->bytes; + u32 off = nb->off; + u16 fo = le16_to_cpu(rhdr->fix_off); + u16 fn = le16_to_cpu(rhdr->fix_num); + u32 idx; + __le16 *fixup; + __le16 sample; + + if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || + fn * SECTOR_SIZE > bytes) { + return -EINVAL; + } + + for (idx = 0; bytes && idx < nb->nbufs; idx += 1, off = 0) { + u32 op = block_size - off; + char *bh_data; + struct buffer_head *bh = nb->bh[idx]; + __le16 *ptr, *end_data; + + if (op > bytes) + op = bytes; + + if (buffer_locked(bh)) + __wait_on_buffer(bh); + + lock_buffer(bh); + + bh_data = bh->b_data + off; + end_data = Add2Ptr(bh_data, op); + memcpy(bh_data, rhdr, op); + + if (!idx) { + u16 t16; + + fixup = Add2Ptr(bh_data, fo); + sample = *fixup; + t16 = le16_to_cpu(sample); + if (t16 >= 0x7FFF) { + sample = *fixup = cpu_to_le16(1); + } else { + sample = cpu_to_le16(t16 + 1); + *fixup = sample; + } + + *(__le16 *)Add2Ptr(rhdr, fo) = sample; + } + + ptr = Add2Ptr(bh_data, SECTOR_SIZE - sizeof(short)); + + do { + *++fixup = *ptr; + *ptr = sample; + ptr += SECTOR_SIZE / sizeof(short); + } while (ptr < end_data); + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + + if (sync) { + int err2 = sync_dirty_buffer(bh); + + if (!err && err2) + err = err2; + } + + bytes -= op; + rhdr = Add2Ptr(rhdr, op); + } + + return err; +} + +/* + * ntfs_bio_pages - Read/write pages from/to disk. + */ +int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, + struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, + enum req_op op) +{ + int err = 0; + struct bio *new, *bio = NULL; + struct super_block *sb = sbi->sb; + struct block_device *bdev = sb->s_bdev; + struct page *page; + u8 cluster_bits = sbi->cluster_bits; + CLST lcn, clen, vcn, vcn_next; + u32 add, off, page_idx; + u64 lbo, len; + size_t run_idx; + struct blk_plug plug; + + if (!bytes) + return 0; + + blk_start_plug(&plug); + + /* Align vbo and bytes to be 512 bytes aligned. */ + lbo = (vbo + bytes + 511) & ~511ull; + vbo = vbo & ~511ull; + bytes = lbo - vbo; + + vcn = vbo >> cluster_bits; + if (!run_lookup_entry(run, vcn, &lcn, &clen, &run_idx)) { + err = -ENOENT; + goto out; + } + off = vbo & sbi->cluster_mask; + page_idx = 0; + page = pages[0]; + + for (;;) { + lbo = ((u64)lcn << cluster_bits) + off; + len = ((u64)clen << cluster_bits) - off; +new_bio: + new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS); + if (bio) { + bio_chain(bio, new); + submit_bio(bio); + } + bio = new; + bio->bi_iter.bi_sector = lbo >> 9; + + while (len) { + off = vbo & (PAGE_SIZE - 1); + add = off + len > PAGE_SIZE ? (PAGE_SIZE - off) : len; + + if (bio_add_page(bio, page, add, off) < add) + goto new_bio; + + if (bytes <= add) + goto out; + bytes -= add; + vbo += add; + + if (add + off == PAGE_SIZE) { + page_idx += 1; + if (WARN_ON(page_idx >= nr_pages)) { + err = -EINVAL; + goto out; + } + page = pages[page_idx]; + } + + if (len <= add) + break; + len -= add; + lbo += add; + } + + vcn_next = vcn + clen; + if (!run_get_entry(run, ++run_idx, &vcn, &lcn, &clen) || + vcn != vcn_next) { + err = -ENOENT; + goto out; + } + off = 0; + } +out: + if (bio) { + if (!err) + err = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + + return err; +} + +/* + * ntfs_bio_fill_1 - Helper for ntfs_loadlog_and_replay(). + * + * Fill on-disk logfile range by (-1) + * this means empty logfile. + */ +int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run) +{ + int err = 0; + struct super_block *sb = sbi->sb; + struct block_device *bdev = sb->s_bdev; + u8 cluster_bits = sbi->cluster_bits; + struct bio *new, *bio = NULL; + CLST lcn, clen; + u64 lbo, len; + size_t run_idx; + struct page *fill; + void *kaddr; + struct blk_plug plug; + + fill = alloc_page(GFP_KERNEL); + if (!fill) + return -ENOMEM; + + kaddr = kmap_atomic(fill); + memset(kaddr, -1, PAGE_SIZE); + kunmap_atomic(kaddr); + flush_dcache_page(fill); + lock_page(fill); + + if (!run_lookup_entry(run, 0, &lcn, &clen, &run_idx)) { + err = -ENOENT; + goto out; + } + + /* + * TODO: Try blkdev_issue_write_same. + */ + blk_start_plug(&plug); + do { + lbo = (u64)lcn << cluster_bits; + len = (u64)clen << cluster_bits; +new_bio: + new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS); + if (bio) { + bio_chain(bio, new); + submit_bio(bio); + } + bio = new; + bio->bi_iter.bi_sector = lbo >> 9; + + for (;;) { + u32 add = len > PAGE_SIZE ? PAGE_SIZE : len; + + if (bio_add_page(bio, fill, add, 0) < add) + goto new_bio; + + lbo += add; + if (len <= add) + break; + len -= add; + } + } while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen)); + + if (!err) + err = submit_bio_wait(bio); + bio_put(bio); + + blk_finish_plug(&plug); +out: + unlock_page(fill); + put_page(fill); + + return err; +} + +int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, u64 *lbo, u64 *bytes) +{ + u32 off; + CLST lcn, len; + u8 cluster_bits = sbi->cluster_bits; + + if (!run_lookup_entry(run, vbo >> cluster_bits, &lcn, &len, NULL)) + return -ENOENT; + + off = vbo & sbi->cluster_mask; + *lbo = lcn == SPARSE_LCN ? -1 : (((u64)lcn << cluster_bits) + off); + *bytes = ((u64)len << cluster_bits) - off; + + return 0; +} + +struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, bool dir) +{ + int err = 0; + struct super_block *sb = sbi->sb; + struct inode *inode = new_inode(sb); + struct ntfs_inode *ni; + + if (!inode) + return ERR_PTR(-ENOMEM); + + ni = ntfs_i(inode); + + err = mi_format_new(&ni->mi, sbi, rno, dir ? RECORD_FLAG_DIR : 0, + false); + if (err) + goto out; + + inode->i_ino = rno; + if (insert_inode_locked(inode) < 0) { + err = -EIO; + goto out; + } + +out: + if (err) { + iput(inode); + ni = ERR_PTR(err); + } + return ni; +} + +/* + * O:BAG:BAD:(A;OICI;FA;;;WD) + * Owner S-1-5-32-544 (Administrators) + * Group S-1-5-32-544 (Administrators) + * ACE: allow S-1-1-0 (Everyone) with FILE_ALL_ACCESS + */ +const u8 s_default_security[] __aligned(8) = { + 0x01, 0x00, 0x04, 0x80, 0x30, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x1C, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x03, 0x14, 0x00, 0xFF, 0x01, 0x1F, 0x00, + 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x20, 0x00, 0x00, 0x00, + 0x20, 0x02, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, + 0x20, 0x00, 0x00, 0x00, 0x20, 0x02, 0x00, 0x00, +}; + +static_assert(sizeof(s_default_security) == 0x50); + +static inline u32 sid_length(const struct SID *sid) +{ + return struct_size(sid, SubAuthority, sid->SubAuthorityCount); +} + +/* + * is_acl_valid + * + * Thanks Mark Harmstone for idea. + */ +static bool is_acl_valid(const struct ACL *acl, u32 len) +{ + const struct ACE_HEADER *ace; + u32 i; + u16 ace_count, ace_size; + + if (acl->AclRevision != ACL_REVISION && + acl->AclRevision != ACL_REVISION_DS) { + /* + * This value should be ACL_REVISION, unless the ACL contains an + * object-specific ACE, in which case this value must be ACL_REVISION_DS. + * All ACEs in an ACL must be at the same revision level. + */ + return false; + } + + if (acl->Sbz1) + return false; + + if (le16_to_cpu(acl->AclSize) > len) + return false; + + if (acl->Sbz2) + return false; + + len -= sizeof(struct ACL); + ace = (struct ACE_HEADER *)&acl[1]; + ace_count = le16_to_cpu(acl->AceCount); + + for (i = 0; i < ace_count; i++) { + if (len < sizeof(struct ACE_HEADER)) + return false; + + ace_size = le16_to_cpu(ace->AceSize); + if (len < ace_size) + return false; + + len -= ace_size; + ace = Add2Ptr(ace, ace_size); + } + + return true; +} + +bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len) +{ + u32 sd_owner, sd_group, sd_sacl, sd_dacl; + + if (len < sizeof(struct SECURITY_DESCRIPTOR_RELATIVE)) + return false; + + if (sd->Revision != 1) + return false; + + if (sd->Sbz1) + return false; + + if (!(sd->Control & SE_SELF_RELATIVE)) + return false; + + sd_owner = le32_to_cpu(sd->Owner); + if (sd_owner) { + const struct SID *owner = Add2Ptr(sd, sd_owner); + + if (sd_owner + offsetof(struct SID, SubAuthority) > len) + return false; + + if (owner->Revision != 1) + return false; + + if (sd_owner + sid_length(owner) > len) + return false; + } + + sd_group = le32_to_cpu(sd->Group); + if (sd_group) { + const struct SID *group = Add2Ptr(sd, sd_group); + + if (sd_group + offsetof(struct SID, SubAuthority) > len) + return false; + + if (group->Revision != 1) + return false; + + if (sd_group + sid_length(group) > len) + return false; + } + + sd_sacl = le32_to_cpu(sd->Sacl); + if (sd_sacl) { + const struct ACL *sacl = Add2Ptr(sd, sd_sacl); + + if (sd_sacl + sizeof(struct ACL) > len) + return false; + + if (!is_acl_valid(sacl, len - sd_sacl)) + return false; + } + + sd_dacl = le32_to_cpu(sd->Dacl); + if (sd_dacl) { + const struct ACL *dacl = Add2Ptr(sd, sd_dacl); + + if (sd_dacl + sizeof(struct ACL) > len) + return false; + + if (!is_acl_valid(dacl, len - sd_dacl)) + return false; + } + + return true; +} + +/* + * ntfs_security_init - Load and parse $Secure. + */ +int ntfs_security_init(struct ntfs_sb_info *sbi) +{ + int err; + struct super_block *sb = sbi->sb; + struct inode *inode; + struct ntfs_inode *ni; + struct MFT_REF ref; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + u64 sds_size; + size_t off; + struct NTFS_DE *ne; + struct NTFS_DE_SII *sii_e; + struct ntfs_fnd *fnd_sii = NULL; + const struct INDEX_ROOT *root_sii; + const struct INDEX_ROOT *root_sdh; + struct ntfs_index *indx_sdh = &sbi->security.index_sdh; + struct ntfs_index *indx_sii = &sbi->security.index_sii; + + ref.low = cpu_to_le32(MFT_REC_SECURE); + ref.high = 0; + ref.seq = cpu_to_le16(MFT_REC_SECURE); + + inode = ntfs_iget5(sb, &ref, &NAME_SECURE); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $Secure."); + inode = NULL; + goto out; + } + + ni = ntfs_i(inode); + + le = NULL; + + attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SDH_NAME, + ARRAY_SIZE(SDH_NAME), NULL, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT)); + if (root_sdh->type != ATTR_ZERO || + root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH || + offsetof(struct INDEX_ROOT, ihdr) + root_sdh->ihdr.used > attr->res.data_size) { + err = -EINVAL; + goto out; + } + + err = indx_init(indx_sdh, sbi, attr, INDEX_MUTEX_SDH); + if (err) + goto out; + + attr = ni_find_attr(ni, attr, &le, ATTR_ROOT, SII_NAME, + ARRAY_SIZE(SII_NAME), NULL, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT)); + if (root_sii->type != ATTR_ZERO || + root_sii->rule != NTFS_COLLATION_TYPE_UINT || + offsetof(struct INDEX_ROOT, ihdr) + root_sii->ihdr.used > attr->res.data_size) { + err = -EINVAL; + goto out; + } + + err = indx_init(indx_sii, sbi, attr, INDEX_MUTEX_SII); + if (err) + goto out; + + fnd_sii = fnd_get(); + if (!fnd_sii) { + err = -ENOMEM; + goto out; + } + + sds_size = inode->i_size; + + /* Find the last valid Id. */ + sbi->security.next_id = SECURITY_ID_FIRST; + /* Always write new security at the end of bucket. */ + sbi->security.next_off = + ALIGN(sds_size - SecurityDescriptorsBlockSize, 16); + + off = 0; + ne = NULL; + + for (;;) { + u32 next_id; + + err = indx_find_raw(indx_sii, ni, root_sii, &ne, &off, fnd_sii); + if (err || !ne) + break; + + sii_e = (struct NTFS_DE_SII *)ne; + if (le16_to_cpu(ne->view.data_size) < SIZEOF_SECURITY_HDR) + continue; + + next_id = le32_to_cpu(sii_e->sec_id) + 1; + if (next_id >= sbi->security.next_id) + sbi->security.next_id = next_id; + } + + sbi->security.ni = ni; + inode = NULL; +out: + iput(inode); + fnd_put(fnd_sii); + + return err; +} + +/* + * ntfs_get_security_by_id - Read security descriptor by id. + */ +int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id, + struct SECURITY_DESCRIPTOR_RELATIVE **sd, + size_t *size) +{ + int err; + int diff; + struct ntfs_inode *ni = sbi->security.ni; + struct ntfs_index *indx = &sbi->security.index_sii; + void *p = NULL; + struct NTFS_DE_SII *sii_e; + struct ntfs_fnd *fnd_sii; + struct SECURITY_HDR d_security; + const struct INDEX_ROOT *root_sii; + u32 t32; + + *sd = NULL; + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_SECURITY); + + fnd_sii = fnd_get(); + if (!fnd_sii) { + err = -ENOMEM; + goto out; + } + + root_sii = indx_get_root(indx, ni, NULL, NULL); + if (!root_sii) { + err = -EINVAL; + goto out; + } + + /* Try to find this SECURITY descriptor in SII indexes. */ + err = indx_find(indx, ni, root_sii, &security_id, sizeof(security_id), + NULL, &diff, (struct NTFS_DE **)&sii_e, fnd_sii); + if (err) + goto out; + + if (diff) + goto out; + + t32 = le32_to_cpu(sii_e->sec_hdr.size); + if (t32 < SIZEOF_SECURITY_HDR) { + err = -EINVAL; + goto out; + } + + if (t32 > SIZEOF_SECURITY_HDR + 0x10000) { + /* Looks like too big security. 0x10000 - is arbitrary big number. */ + err = -EFBIG; + goto out; + } + + *size = t32 - SIZEOF_SECURITY_HDR; + + p = kmalloc(*size, GFP_NOFS); + if (!p) { + err = -ENOMEM; + goto out; + } + + err = ntfs_read_run_nb(sbi, &ni->file.run, + le64_to_cpu(sii_e->sec_hdr.off), &d_security, + sizeof(d_security), NULL); + if (err) + goto out; + + if (memcmp(&d_security, &sii_e->sec_hdr, SIZEOF_SECURITY_HDR)) { + err = -EINVAL; + goto out; + } + + err = ntfs_read_run_nb(sbi, &ni->file.run, + le64_to_cpu(sii_e->sec_hdr.off) + + SIZEOF_SECURITY_HDR, + p, *size, NULL); + if (err) + goto out; + + *sd = p; + p = NULL; + +out: + kfree(p); + fnd_put(fnd_sii); + ni_unlock(ni); + + return err; +} + +/* + * ntfs_insert_security - Insert security descriptor into $Secure::SDS. + * + * SECURITY Descriptor Stream data is organized into chunks of 256K bytes + * and it contains a mirror copy of each security descriptor. When writing + * to a security descriptor at location X, another copy will be written at + * location (X+256K). + * When writing a security descriptor that will cross the 256K boundary, + * the pointer will be advanced by 256K to skip + * over the mirror portion. + */ +int ntfs_insert_security(struct ntfs_sb_info *sbi, + const struct SECURITY_DESCRIPTOR_RELATIVE *sd, + u32 size_sd, __le32 *security_id, bool *inserted) +{ + int err, diff; + struct ntfs_inode *ni = sbi->security.ni; + struct ntfs_index *indx_sdh = &sbi->security.index_sdh; + struct ntfs_index *indx_sii = &sbi->security.index_sii; + struct NTFS_DE_SDH *e; + struct NTFS_DE_SDH sdh_e; + struct NTFS_DE_SII sii_e; + struct SECURITY_HDR *d_security; + u32 new_sec_size = size_sd + SIZEOF_SECURITY_HDR; + u32 aligned_sec_size = ALIGN(new_sec_size, 16); + struct SECURITY_KEY hash_key; + struct ntfs_fnd *fnd_sdh = NULL; + const struct INDEX_ROOT *root_sdh; + const struct INDEX_ROOT *root_sii; + u64 mirr_off, new_sds_size; + u32 next, left; + + static_assert((1 << Log2OfSecurityDescriptorsBlockSize) == + SecurityDescriptorsBlockSize); + + hash_key.hash = security_hash(sd, size_sd); + hash_key.sec_id = SECURITY_ID_INVALID; + + if (inserted) + *inserted = false; + *security_id = SECURITY_ID_INVALID; + + /* Allocate a temporal buffer. */ + d_security = kzalloc(aligned_sec_size, GFP_NOFS); + if (!d_security) + return -ENOMEM; + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_SECURITY); + + fnd_sdh = fnd_get(); + if (!fnd_sdh) { + err = -ENOMEM; + goto out; + } + + root_sdh = indx_get_root(indx_sdh, ni, NULL, NULL); + if (!root_sdh) { + err = -EINVAL; + goto out; + } + + root_sii = indx_get_root(indx_sii, ni, NULL, NULL); + if (!root_sii) { + err = -EINVAL; + goto out; + } + + /* + * Check if such security already exists. + * Use "SDH" and hash -> to get the offset in "SDS". + */ + err = indx_find(indx_sdh, ni, root_sdh, &hash_key, sizeof(hash_key), + &d_security->key.sec_id, &diff, (struct NTFS_DE **)&e, + fnd_sdh); + if (err) + goto out; + + while (e) { + if (le32_to_cpu(e->sec_hdr.size) == new_sec_size) { + err = ntfs_read_run_nb(sbi, &ni->file.run, + le64_to_cpu(e->sec_hdr.off), + d_security, new_sec_size, NULL); + if (err) + goto out; + + if (le32_to_cpu(d_security->size) == new_sec_size && + d_security->key.hash == hash_key.hash && + !memcmp(d_security + 1, sd, size_sd)) { + *security_id = d_security->key.sec_id; + /* Such security already exists. */ + err = 0; + goto out; + } + } + + err = indx_find_sort(indx_sdh, ni, root_sdh, + (struct NTFS_DE **)&e, fnd_sdh); + if (err) + goto out; + + if (!e || e->key.hash != hash_key.hash) + break; + } + + /* Zero unused space. */ + next = sbi->security.next_off & (SecurityDescriptorsBlockSize - 1); + left = SecurityDescriptorsBlockSize - next; + + /* Zero gap until SecurityDescriptorsBlockSize. */ + if (left < new_sec_size) { + /* Zero "left" bytes from sbi->security.next_off. */ + sbi->security.next_off += SecurityDescriptorsBlockSize + left; + } + + /* Zero tail of previous security. */ + //used = ni->vfs_inode.i_size & (SecurityDescriptorsBlockSize - 1); + + /* + * Example: + * 0x40438 == ni->vfs_inode.i_size + * 0x00440 == sbi->security.next_off + * need to zero [0x438-0x440) + * if (next > used) { + * u32 tozero = next - used; + * zero "tozero" bytes from sbi->security.next_off - tozero + */ + + /* Format new security descriptor. */ + d_security->key.hash = hash_key.hash; + d_security->key.sec_id = cpu_to_le32(sbi->security.next_id); + d_security->off = cpu_to_le64(sbi->security.next_off); + d_security->size = cpu_to_le32(new_sec_size); + memcpy(d_security + 1, sd, size_sd); + + /* Write main SDS bucket. */ + err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off, + d_security, aligned_sec_size, 0); + + if (err) + goto out; + + mirr_off = sbi->security.next_off + SecurityDescriptorsBlockSize; + new_sds_size = mirr_off + aligned_sec_size; + + if (new_sds_size > ni->vfs_inode.i_size) { + err = attr_set_size(ni, ATTR_DATA, SDS_NAME, + ARRAY_SIZE(SDS_NAME), &ni->file.run, + new_sds_size, &new_sds_size, false, NULL); + if (err) + goto out; + } + + /* Write copy SDS bucket. */ + err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security, + aligned_sec_size, 0); + if (err) + goto out; + + /* Fill SII entry. */ + sii_e.de.view.data_off = + cpu_to_le16(offsetof(struct NTFS_DE_SII, sec_hdr)); + sii_e.de.view.data_size = cpu_to_le16(SIZEOF_SECURITY_HDR); + sii_e.de.view.res = 0; + sii_e.de.size = cpu_to_le16(SIZEOF_SII_DIRENTRY); + sii_e.de.key_size = cpu_to_le16(sizeof(d_security->key.sec_id)); + sii_e.de.flags = 0; + sii_e.de.res = 0; + sii_e.sec_id = d_security->key.sec_id; + memcpy(&sii_e.sec_hdr, d_security, SIZEOF_SECURITY_HDR); + + err = indx_insert_entry(indx_sii, ni, &sii_e.de, NULL, NULL, 0); + if (err) + goto out; + + /* Fill SDH entry. */ + sdh_e.de.view.data_off = + cpu_to_le16(offsetof(struct NTFS_DE_SDH, sec_hdr)); + sdh_e.de.view.data_size = cpu_to_le16(SIZEOF_SECURITY_HDR); + sdh_e.de.view.res = 0; + sdh_e.de.size = cpu_to_le16(SIZEOF_SDH_DIRENTRY); + sdh_e.de.key_size = cpu_to_le16(sizeof(sdh_e.key)); + sdh_e.de.flags = 0; + sdh_e.de.res = 0; + sdh_e.key.hash = d_security->key.hash; + sdh_e.key.sec_id = d_security->key.sec_id; + memcpy(&sdh_e.sec_hdr, d_security, SIZEOF_SECURITY_HDR); + sdh_e.magic[0] = cpu_to_le16('I'); + sdh_e.magic[1] = cpu_to_le16('I'); + + fnd_clear(fnd_sdh); + err = indx_insert_entry(indx_sdh, ni, &sdh_e.de, (void *)(size_t)1, + fnd_sdh, 0); + if (err) + goto out; + + *security_id = d_security->key.sec_id; + if (inserted) + *inserted = true; + + /* Update Id and offset for next descriptor. */ + sbi->security.next_id += 1; + sbi->security.next_off += aligned_sec_size; + +out: + fnd_put(fnd_sdh); + mark_inode_dirty(&ni->vfs_inode); + ni_unlock(ni); + kfree(d_security); + + return err; +} + +/* + * ntfs_reparse_init - Load and parse $Extend/$Reparse. + */ +int ntfs_reparse_init(struct ntfs_sb_info *sbi) +{ + int err; + struct ntfs_inode *ni = sbi->reparse.ni; + struct ntfs_index *indx = &sbi->reparse.index_r; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + const struct INDEX_ROOT *root_r; + + if (!ni) + return 0; + + le = NULL; + attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SR_NAME, + ARRAY_SIZE(SR_NAME), NULL, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + root_r = resident_data(attr); + if (root_r->type != ATTR_ZERO || + root_r->rule != NTFS_COLLATION_TYPE_UINTS) { + err = -EINVAL; + goto out; + } + + err = indx_init(indx, sbi, attr, INDEX_MUTEX_SR); + if (err) + goto out; + +out: + return err; +} + +/* + * ntfs_objid_init - Load and parse $Extend/$ObjId. + */ +int ntfs_objid_init(struct ntfs_sb_info *sbi) +{ + int err; + struct ntfs_inode *ni = sbi->objid.ni; + struct ntfs_index *indx = &sbi->objid.index_o; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + const struct INDEX_ROOT *root; + + if (!ni) + return 0; + + le = NULL; + attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SO_NAME, + ARRAY_SIZE(SO_NAME), NULL, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + root = resident_data(attr); + if (root->type != ATTR_ZERO || + root->rule != NTFS_COLLATION_TYPE_UINTS) { + err = -EINVAL; + goto out; + } + + err = indx_init(indx, sbi, attr, INDEX_MUTEX_SO); + if (err) + goto out; + +out: + return err; +} + +int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid) +{ + int err; + struct ntfs_inode *ni = sbi->objid.ni; + struct ntfs_index *indx = &sbi->objid.index_o; + + if (!ni) + return -EINVAL; + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_OBJID); + + err = indx_delete_entry(indx, ni, guid, sizeof(*guid), NULL); + + mark_inode_dirty(&ni->vfs_inode); + ni_unlock(ni); + + return err; +} + +int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag, + const struct MFT_REF *ref) +{ + int err; + struct ntfs_inode *ni = sbi->reparse.ni; + struct ntfs_index *indx = &sbi->reparse.index_r; + struct NTFS_DE_R re; + + if (!ni) + return -EINVAL; + + memset(&re, 0, sizeof(re)); + + re.de.view.data_off = cpu_to_le16(offsetof(struct NTFS_DE_R, zero)); + re.de.size = cpu_to_le16(sizeof(struct NTFS_DE_R)); + re.de.key_size = cpu_to_le16(sizeof(re.key)); + + re.key.ReparseTag = rtag; + memcpy(&re.key.ref, ref, sizeof(*ref)); + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_REPARSE); + + err = indx_insert_entry(indx, ni, &re.de, NULL, NULL, 0); + + mark_inode_dirty(&ni->vfs_inode); + ni_unlock(ni); + + return err; +} + +int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag, + const struct MFT_REF *ref) +{ + int err, diff; + struct ntfs_inode *ni = sbi->reparse.ni; + struct ntfs_index *indx = &sbi->reparse.index_r; + struct ntfs_fnd *fnd = NULL; + struct REPARSE_KEY rkey; + struct NTFS_DE_R *re; + struct INDEX_ROOT *root_r; + + if (!ni) + return -EINVAL; + + rkey.ReparseTag = rtag; + rkey.ref = *ref; + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_REPARSE); + + if (rtag) { + err = indx_delete_entry(indx, ni, &rkey, sizeof(rkey), NULL); + goto out1; + } + + fnd = fnd_get(); + if (!fnd) { + err = -ENOMEM; + goto out1; + } + + root_r = indx_get_root(indx, ni, NULL, NULL); + if (!root_r) { + err = -EINVAL; + goto out; + } + + /* 1 - forces to ignore rkey.ReparseTag when comparing keys. */ + err = indx_find(indx, ni, root_r, &rkey, sizeof(rkey), (void *)1, &diff, + (struct NTFS_DE **)&re, fnd); + if (err) + goto out; + + if (memcmp(&re->key.ref, ref, sizeof(*ref))) { + /* Impossible. Looks like volume corrupt? */ + goto out; + } + + memcpy(&rkey, &re->key, sizeof(rkey)); + + fnd_put(fnd); + fnd = NULL; + + err = indx_delete_entry(indx, ni, &rkey, sizeof(rkey), NULL); + if (err) + goto out; + +out: + fnd_put(fnd); + +out1: + mark_inode_dirty(&ni->vfs_inode); + ni_unlock(ni); + + return err; +} + +static inline void ntfs_unmap_and_discard(struct ntfs_sb_info *sbi, CLST lcn, + CLST len) +{ + ntfs_unmap_meta(sbi->sb, lcn, len); + ntfs_discard(sbi, lcn, len); +} + +void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) +{ + CLST end, i, zone_len, zlen; + struct wnd_bitmap *wnd = &sbi->used.bitmap; + bool dirty = false; + + down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); + if (!wnd_is_used(wnd, lcn, len)) { + /* mark volume as dirty out of wnd->rw_lock */ + dirty = true; + + end = lcn + len; + len = 0; + for (i = lcn; i < end; i++) { + if (wnd_is_used(wnd, i, 1)) { + if (!len) + lcn = i; + len += 1; + continue; + } + + if (!len) + continue; + + if (trim) + ntfs_unmap_and_discard(sbi, lcn, len); + + wnd_set_free(wnd, lcn, len); + len = 0; + } + + if (!len) + goto out; + } + + if (trim) + ntfs_unmap_and_discard(sbi, lcn, len); + wnd_set_free(wnd, lcn, len); + + /* append to MFT zone, if possible. */ + zone_len = wnd_zone_len(wnd); + zlen = min(zone_len + len, sbi->zone_max); + + if (zlen == zone_len) { + /* MFT zone already has maximum size. */ + } else if (!zone_len) { + /* Create MFT zone only if 'zlen' is large enough. */ + if (zlen == sbi->zone_max) + wnd_zone_set(wnd, lcn, zlen); + } else { + CLST zone_lcn = wnd_zone_bit(wnd); + + if (lcn + len == zone_lcn) { + /* Append into head MFT zone. */ + wnd_zone_set(wnd, lcn, zlen); + } else if (zone_lcn + zone_len == lcn) { + /* Append into tail MFT zone. */ + wnd_zone_set(wnd, zone_lcn, zlen); + } + } + +out: + up_write(&wnd->rw_lock); + if (dirty) + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); +} + +/* + * run_deallocate - Deallocate clusters. + */ +int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim) +{ + CLST lcn, len; + size_t idx = 0; + + while (run_get_entry(run, idx++, NULL, &lcn, &len)) { + if (lcn == SPARSE_LCN) + continue; + + mark_as_free_ex(sbi, lcn, len, trim); + } + + return 0; +} diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c new file mode 100644 index 000000000..b89a33f57 --- /dev/null +++ b/fs/ntfs3/index.c @@ -0,0 +1,2668 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/kernel.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +static const struct INDEX_NAMES { + const __le16 *name; + u8 name_len; +} s_index_names[INDEX_MUTEX_TOTAL] = { + { I30_NAME, ARRAY_SIZE(I30_NAME) }, { SII_NAME, ARRAY_SIZE(SII_NAME) }, + { SDH_NAME, ARRAY_SIZE(SDH_NAME) }, { SO_NAME, ARRAY_SIZE(SO_NAME) }, + { SQ_NAME, ARRAY_SIZE(SQ_NAME) }, { SR_NAME, ARRAY_SIZE(SR_NAME) }, +}; + +/* + * cmp_fnames - Compare two names in index. + * + * if l1 != 0 + * Both names are little endian on-disk ATTR_FILE_NAME structs. + * else + * key1 - cpu_str, key2 - ATTR_FILE_NAME + */ +static int cmp_fnames(const void *key1, size_t l1, const void *key2, size_t l2, + const void *data) +{ + const struct ATTR_FILE_NAME *f2 = key2; + const struct ntfs_sb_info *sbi = data; + const struct ATTR_FILE_NAME *f1; + u16 fsize2; + bool both_case; + + if (l2 <= offsetof(struct ATTR_FILE_NAME, name)) + return -1; + + fsize2 = fname_full_size(f2); + if (l2 < fsize2) + return -1; + + both_case = f2->type != FILE_NAME_DOS /*&& !sbi->options.nocase*/; + if (!l1) { + const struct le_str *s2 = (struct le_str *)&f2->name_len; + + /* + * If names are equal (case insensitive) + * try to compare it case sensitive. + */ + return ntfs_cmp_names_cpu(key1, s2, sbi->upcase, both_case); + } + + f1 = key1; + return ntfs_cmp_names(f1->name, f1->name_len, f2->name, f2->name_len, + sbi->upcase, both_case); +} + +/* + * cmp_uint - $SII of $Secure and $Q of Quota + */ +static int cmp_uint(const void *key1, size_t l1, const void *key2, size_t l2, + const void *data) +{ + const u32 *k1 = key1; + const u32 *k2 = key2; + + if (l2 < sizeof(u32)) + return -1; + + if (*k1 < *k2) + return -1; + if (*k1 > *k2) + return 1; + return 0; +} + +/* + * cmp_sdh - $SDH of $Secure + */ +static int cmp_sdh(const void *key1, size_t l1, const void *key2, size_t l2, + const void *data) +{ + const struct SECURITY_KEY *k1 = key1; + const struct SECURITY_KEY *k2 = key2; + u32 t1, t2; + + if (l2 < sizeof(struct SECURITY_KEY)) + return -1; + + t1 = le32_to_cpu(k1->hash); + t2 = le32_to_cpu(k2->hash); + + /* First value is a hash value itself. */ + if (t1 < t2) + return -1; + if (t1 > t2) + return 1; + + /* Second value is security Id. */ + if (data) { + t1 = le32_to_cpu(k1->sec_id); + t2 = le32_to_cpu(k2->sec_id); + if (t1 < t2) + return -1; + if (t1 > t2) + return 1; + } + + return 0; +} + +/* + * cmp_uints - $O of ObjId and "$R" for Reparse. + */ +static int cmp_uints(const void *key1, size_t l1, const void *key2, size_t l2, + const void *data) +{ + const __le32 *k1 = key1; + const __le32 *k2 = key2; + size_t count; + + if ((size_t)data == 1) { + /* + * ni_delete_all -> ntfs_remove_reparse -> + * delete all with this reference. + * k1, k2 - pointers to REPARSE_KEY + */ + + k1 += 1; // Skip REPARSE_KEY.ReparseTag + k2 += 1; // Skip REPARSE_KEY.ReparseTag + if (l2 <= sizeof(int)) + return -1; + l2 -= sizeof(int); + if (l1 <= sizeof(int)) + return 1; + l1 -= sizeof(int); + } + + if (l2 < sizeof(int)) + return -1; + + for (count = min(l1, l2) >> 2; count > 0; --count, ++k1, ++k2) { + u32 t1 = le32_to_cpu(*k1); + u32 t2 = le32_to_cpu(*k2); + + if (t1 > t2) + return 1; + if (t1 < t2) + return -1; + } + + if (l1 > l2) + return 1; + if (l1 < l2) + return -1; + + return 0; +} + +static inline NTFS_CMP_FUNC get_cmp_func(const struct INDEX_ROOT *root) +{ + switch (root->type) { + case ATTR_NAME: + if (root->rule == NTFS_COLLATION_TYPE_FILENAME) + return &cmp_fnames; + break; + case ATTR_ZERO: + switch (root->rule) { + case NTFS_COLLATION_TYPE_UINT: + return &cmp_uint; + case NTFS_COLLATION_TYPE_SECURITY_HASH: + return &cmp_sdh; + case NTFS_COLLATION_TYPE_UINTS: + return &cmp_uints; + default: + break; + } + break; + default: + break; + } + + return NULL; +} + +struct bmp_buf { + struct ATTRIB *b; + struct mft_inode *mi; + struct buffer_head *bh; + ulong *buf; + size_t bit; + u32 nbits; + u64 new_valid; +}; + +static int bmp_buf_get(struct ntfs_index *indx, struct ntfs_inode *ni, + size_t bit, struct bmp_buf *bbuf) +{ + struct ATTRIB *b; + size_t data_size, valid_size, vbo, off = bit >> 3; + struct ntfs_sb_info *sbi = ni->mi.sbi; + CLST vcn = off >> sbi->cluster_bits; + struct ATTR_LIST_ENTRY *le = NULL; + struct buffer_head *bh; + struct super_block *sb; + u32 blocksize; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + + bbuf->bh = NULL; + + b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, + &vcn, &bbuf->mi); + bbuf->b = b; + if (!b) + return -EINVAL; + + if (!b->non_res) { + data_size = le32_to_cpu(b->res.data_size); + + if (off >= data_size) + return -EINVAL; + + bbuf->buf = (ulong *)resident_data(b); + bbuf->bit = 0; + bbuf->nbits = data_size * 8; + + return 0; + } + + data_size = le64_to_cpu(b->nres.data_size); + if (WARN_ON(off >= data_size)) { + /* Looks like filesystem error. */ + return -EINVAL; + } + + valid_size = le64_to_cpu(b->nres.valid_size); + + bh = ntfs_bread_run(sbi, &indx->bitmap_run, off); + if (!bh) + return -EIO; + + if (IS_ERR(bh)) + return PTR_ERR(bh); + + bbuf->bh = bh; + + if (buffer_locked(bh)) + __wait_on_buffer(bh); + + lock_buffer(bh); + + sb = sbi->sb; + blocksize = sb->s_blocksize; + + vbo = off & ~(size_t)sbi->block_mask; + + bbuf->new_valid = vbo + blocksize; + if (bbuf->new_valid <= valid_size) + bbuf->new_valid = 0; + else if (bbuf->new_valid > data_size) + bbuf->new_valid = data_size; + + if (vbo >= valid_size) { + memset(bh->b_data, 0, blocksize); + } else if (vbo + blocksize > valid_size) { + u32 voff = valid_size & sbi->block_mask; + + memset(bh->b_data + voff, 0, blocksize - voff); + } + + bbuf->buf = (ulong *)bh->b_data; + bbuf->bit = 8 * (off & ~(size_t)sbi->block_mask); + bbuf->nbits = 8 * blocksize; + + return 0; +} + +static void bmp_buf_put(struct bmp_buf *bbuf, bool dirty) +{ + struct buffer_head *bh = bbuf->bh; + struct ATTRIB *b = bbuf->b; + + if (!bh) { + if (b && !b->non_res && dirty) + bbuf->mi->dirty = true; + return; + } + + if (!dirty) + goto out; + + if (bbuf->new_valid) { + b->nres.valid_size = cpu_to_le64(bbuf->new_valid); + bbuf->mi->dirty = true; + } + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + +out: + unlock_buffer(bh); + put_bh(bh); +} + +/* + * indx_mark_used - Mark the bit @bit as used. + */ +static int indx_mark_used(struct ntfs_index *indx, struct ntfs_inode *ni, + size_t bit) +{ + int err; + struct bmp_buf bbuf; + + err = bmp_buf_get(indx, ni, bit, &bbuf); + if (err) + return err; + + __set_bit(bit - bbuf.bit, bbuf.buf); + + bmp_buf_put(&bbuf, true); + + return 0; +} + +/* + * indx_mark_free - Mark the bit @bit as free. + */ +static int indx_mark_free(struct ntfs_index *indx, struct ntfs_inode *ni, + size_t bit) +{ + int err; + struct bmp_buf bbuf; + + err = bmp_buf_get(indx, ni, bit, &bbuf); + if (err) + return err; + + __clear_bit(bit - bbuf.bit, bbuf.buf); + + bmp_buf_put(&bbuf, true); + + return 0; +} + +/* + * scan_nres_bitmap + * + * If ntfs_readdir calls this function (indx_used_bit -> scan_nres_bitmap), + * inode is shared locked and no ni_lock. + * Use rw_semaphore for read/write access to bitmap_run. + */ +static int scan_nres_bitmap(struct ntfs_inode *ni, struct ATTRIB *bitmap, + struct ntfs_index *indx, size_t from, + bool (*fn)(const ulong *buf, u32 bit, u32 bits, + size_t *ret), + size_t *ret) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct super_block *sb = sbi->sb; + struct runs_tree *run = &indx->bitmap_run; + struct rw_semaphore *lock = &indx->run_lock; + u32 nbits = sb->s_blocksize * 8; + u32 blocksize = sb->s_blocksize; + u64 valid_size = le64_to_cpu(bitmap->nres.valid_size); + u64 data_size = le64_to_cpu(bitmap->nres.data_size); + sector_t eblock = bytes_to_block(sb, data_size); + size_t vbo = from >> 3; + sector_t blk = (vbo & sbi->cluster_mask) >> sb->s_blocksize_bits; + sector_t vblock = vbo >> sb->s_blocksize_bits; + sector_t blen, block; + CLST lcn, clen, vcn, vcn_next; + size_t idx; + struct buffer_head *bh; + bool ok; + + *ret = MINUS_ONE_T; + + if (vblock >= eblock) + return 0; + + from &= nbits - 1; + vcn = vbo >> sbi->cluster_bits; + + down_read(lock); + ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); + up_read(lock); + +next_run: + if (!ok) { + int err; + const struct INDEX_NAMES *name = &s_index_names[indx->type]; + + down_write(lock); + err = attr_load_runs_vcn(ni, ATTR_BITMAP, name->name, + name->name_len, run, vcn); + up_write(lock); + if (err) + return err; + down_read(lock); + ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); + up_read(lock); + if (!ok) + return -EINVAL; + } + + blen = (sector_t)clen * sbi->blocks_per_cluster; + block = (sector_t)lcn * sbi->blocks_per_cluster; + + for (; blk < blen; blk++, from = 0) { + bh = ntfs_bread(sb, block + blk); + if (!bh) + return -EIO; + + vbo = (u64)vblock << sb->s_blocksize_bits; + if (vbo >= valid_size) { + memset(bh->b_data, 0, blocksize); + } else if (vbo + blocksize > valid_size) { + u32 voff = valid_size & sbi->block_mask; + + memset(bh->b_data + voff, 0, blocksize - voff); + } + + if (vbo + blocksize > data_size) + nbits = 8 * (data_size - vbo); + + ok = nbits > from ? (*fn)((ulong *)bh->b_data, from, nbits, ret) + : false; + put_bh(bh); + + if (ok) { + *ret += 8 * vbo; + return 0; + } + + if (++vblock >= eblock) { + *ret = MINUS_ONE_T; + return 0; + } + } + blk = 0; + vcn_next = vcn + clen; + down_read(lock); + ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) && vcn == vcn_next; + if (!ok) + vcn = vcn_next; + up_read(lock); + goto next_run; +} + +static bool scan_for_free(const ulong *buf, u32 bit, u32 bits, size_t *ret) +{ + size_t pos = find_next_zero_bit(buf, bits, bit); + + if (pos >= bits) + return false; + *ret = pos; + return true; +} + +/* + * indx_find_free - Look for free bit. + * + * Return: -1 if no free bits. + */ +static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni, + size_t *bit, struct ATTRIB **bitmap) +{ + struct ATTRIB *b; + struct ATTR_LIST_ENTRY *le = NULL; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + int err; + + b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, + NULL, NULL); + + if (!b) + return -ENOENT; + + *bitmap = b; + *bit = MINUS_ONE_T; + + if (!b->non_res) { + u32 nbits = 8 * le32_to_cpu(b->res.data_size); + size_t pos = find_next_zero_bit(resident_data(b), nbits, 0); + + if (pos < nbits) + *bit = pos; + } else { + err = scan_nres_bitmap(ni, b, indx, 0, &scan_for_free, bit); + + if (err) + return err; + } + + return 0; +} + +static bool scan_for_used(const ulong *buf, u32 bit, u32 bits, size_t *ret) +{ + size_t pos = find_next_bit(buf, bits, bit); + + if (pos >= bits) + return false; + *ret = pos; + return true; +} + +/* + * indx_used_bit - Look for used bit. + * + * Return: MINUS_ONE_T if no used bits. + */ +int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit) +{ + struct ATTRIB *b; + struct ATTR_LIST_ENTRY *le = NULL; + size_t from = *bit; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + int err; + + b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, + NULL, NULL); + + if (!b) + return -ENOENT; + + *bit = MINUS_ONE_T; + + if (!b->non_res) { + u32 nbits = le32_to_cpu(b->res.data_size) * 8; + size_t pos = find_next_bit(resident_data(b), nbits, from); + + if (pos < nbits) + *bit = pos; + } else { + err = scan_nres_bitmap(ni, b, indx, from, &scan_for_used, bit); + if (err) + return err; + } + + return 0; +} + +/* + * hdr_find_split + * + * Find a point at which the index allocation buffer would like to be split. + * NOTE: This function should never return 'END' entry NULL returns on error. + */ +static const struct NTFS_DE *hdr_find_split(const struct INDEX_HDR *hdr) +{ + size_t o; + const struct NTFS_DE *e = hdr_first_de(hdr); + u32 used_2 = le32_to_cpu(hdr->used) >> 1; + u16 esize; + + if (!e || de_is_last(e)) + return NULL; + + esize = le16_to_cpu(e->size); + for (o = le32_to_cpu(hdr->de_off) + esize; o < used_2; o += esize) { + const struct NTFS_DE *p = e; + + e = Add2Ptr(hdr, o); + + /* We must not return END entry. */ + if (de_is_last(e)) + return p; + + esize = le16_to_cpu(e->size); + } + + return e; +} + +/* + * hdr_insert_head - Insert some entries at the beginning of the buffer. + * + * It is used to insert entries into a newly-created buffer. + */ +static const struct NTFS_DE *hdr_insert_head(struct INDEX_HDR *hdr, + const void *ins, u32 ins_bytes) +{ + u32 to_move; + struct NTFS_DE *e = hdr_first_de(hdr); + u32 used = le32_to_cpu(hdr->used); + + if (!e) + return NULL; + + /* Now we just make room for the inserted entries and jam it in. */ + to_move = used - le32_to_cpu(hdr->de_off); + memmove(Add2Ptr(e, ins_bytes), e, to_move); + memcpy(e, ins, ins_bytes); + hdr->used = cpu_to_le32(used + ins_bytes); + + return e; +} + +/* + * index_hdr_check + * + * return true if INDEX_HDR is valid + */ +static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes) +{ + u32 end = le32_to_cpu(hdr->used); + u32 tot = le32_to_cpu(hdr->total); + u32 off = le32_to_cpu(hdr->de_off); + + if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot || + off + sizeof(struct NTFS_DE) > end) { + /* incorrect index buffer. */ + return false; + } + + return true; +} + +/* + * index_buf_check + * + * return true if INDEX_BUFFER seems is valid + */ +static bool index_buf_check(const struct INDEX_BUFFER *ib, u32 bytes, + const CLST *vbn) +{ + const struct NTFS_RECORD_HEADER *rhdr = &ib->rhdr; + u16 fo = le16_to_cpu(rhdr->fix_off); + u16 fn = le16_to_cpu(rhdr->fix_num); + + if (bytes <= offsetof(struct INDEX_BUFFER, ihdr) || + rhdr->sign != NTFS_INDX_SIGNATURE || + fo < sizeof(struct INDEX_BUFFER) + /* Check index buffer vbn. */ + || (vbn && *vbn != le64_to_cpu(ib->vbn)) || (fo % sizeof(short)) || + fo + fn * sizeof(short) >= bytes || + fn != ((bytes >> SECTOR_SHIFT) + 1)) { + /* incorrect index buffer. */ + return false; + } + + return index_hdr_check(&ib->ihdr, + bytes - offsetof(struct INDEX_BUFFER, ihdr)); +} + +void fnd_clear(struct ntfs_fnd *fnd) +{ + int i; + + for (i = fnd->level - 1; i >= 0; i--) { + struct indx_node *n = fnd->nodes[i]; + + if (!n) + continue; + + put_indx_node(n); + fnd->nodes[i] = NULL; + } + fnd->level = 0; + fnd->root_de = NULL; +} + +static int fnd_push(struct ntfs_fnd *fnd, struct indx_node *n, + struct NTFS_DE *e) +{ + int i; + + i = fnd->level; + if (i < 0 || i >= ARRAY_SIZE(fnd->nodes)) + return -EINVAL; + fnd->nodes[i] = n; + fnd->de[i] = e; + fnd->level += 1; + return 0; +} + +static struct indx_node *fnd_pop(struct ntfs_fnd *fnd) +{ + struct indx_node *n; + int i = fnd->level; + + i -= 1; + n = fnd->nodes[i]; + fnd->nodes[i] = NULL; + fnd->level = i; + + return n; +} + +static bool fnd_is_empty(struct ntfs_fnd *fnd) +{ + if (!fnd->level) + return !fnd->root_de; + + return !fnd->de[fnd->level - 1]; +} + +/* + * hdr_find_e - Locate an entry the index buffer. + * + * If no matching entry is found, it returns the first entry which is greater + * than the desired entry If the search key is greater than all the entries the + * buffer, it returns the 'end' entry. This function does a binary search of the + * current index buffer, for the first entry that is <= to the search value. + * + * Return: NULL if error. + */ +static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, + const struct INDEX_HDR *hdr, const void *key, + size_t key_len, const void *ctx, int *diff) +{ + struct NTFS_DE *e, *found = NULL; + NTFS_CMP_FUNC cmp = indx->cmp; + int min_idx = 0, mid_idx, max_idx = 0; + int diff2; + int table_size = 8; + u32 e_size, e_key_len; + u32 end = le32_to_cpu(hdr->used); + u32 off = le32_to_cpu(hdr->de_off); + u32 total = le32_to_cpu(hdr->total); + u16 offs[128]; + + if (unlikely(!cmp)) + return NULL; + +fill_table: + if (end > total) + return NULL; + + if (off + sizeof(struct NTFS_DE) > end) + return NULL; + + e = Add2Ptr(hdr, off); + e_size = le16_to_cpu(e->size); + + if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) + return NULL; + + if (!de_is_last(e)) { + offs[max_idx] = off; + off += e_size; + + max_idx++; + if (max_idx < table_size) + goto fill_table; + + max_idx--; + } + +binary_search: + e_key_len = le16_to_cpu(e->key_size); + + diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); + if (diff2 > 0) { + if (found) { + min_idx = mid_idx + 1; + } else { + if (de_is_last(e)) + return NULL; + + max_idx = 0; + table_size = min(table_size * 2, + (int)ARRAY_SIZE(offs)); + goto fill_table; + } + } else if (diff2 < 0) { + if (found) + max_idx = mid_idx - 1; + else + max_idx--; + + found = e; + } else { + *diff = 0; + return e; + } + + if (min_idx > max_idx) { + *diff = -1; + return found; + } + + mid_idx = (min_idx + max_idx) >> 1; + e = Add2Ptr(hdr, offs[mid_idx]); + + goto binary_search; +} + +/* + * hdr_insert_de - Insert an index entry into the buffer. + * + * 'before' should be a pointer previously returned from hdr_find_e. + */ +static struct NTFS_DE *hdr_insert_de(const struct ntfs_index *indx, + struct INDEX_HDR *hdr, + const struct NTFS_DE *de, + struct NTFS_DE *before, const void *ctx) +{ + int diff; + size_t off = PtrOffset(hdr, before); + u32 used = le32_to_cpu(hdr->used); + u32 total = le32_to_cpu(hdr->total); + u16 de_size = le16_to_cpu(de->size); + + /* First, check to see if there's enough room. */ + if (used + de_size > total) + return NULL; + + /* We know there's enough space, so we know we'll succeed. */ + if (before) { + /* Check that before is inside Index. */ + if (off >= used || off < le32_to_cpu(hdr->de_off) || + off + le16_to_cpu(before->size) > total) { + return NULL; + } + goto ok; + } + /* No insert point is applied. Get it manually. */ + before = hdr_find_e(indx, hdr, de + 1, le16_to_cpu(de->key_size), ctx, + &diff); + if (!before) + return NULL; + off = PtrOffset(hdr, before); + +ok: + /* Now we just make room for the entry and jam it in. */ + memmove(Add2Ptr(before, de_size), before, used - off); + + hdr->used = cpu_to_le32(used + de_size); + memcpy(before, de, de_size); + + return before; +} + +/* + * hdr_delete_de - Remove an entry from the index buffer. + */ +static inline struct NTFS_DE *hdr_delete_de(struct INDEX_HDR *hdr, + struct NTFS_DE *re) +{ + u32 used = le32_to_cpu(hdr->used); + u16 esize = le16_to_cpu(re->size); + u32 off = PtrOffset(hdr, re); + int bytes = used - (off + esize); + + /* check INDEX_HDR valid before using INDEX_HDR */ + if (!check_index_header(hdr, le32_to_cpu(hdr->total))) + return NULL; + + if (off >= used || esize < sizeof(struct NTFS_DE) || + bytes < sizeof(struct NTFS_DE)) + return NULL; + + hdr->used = cpu_to_le32(used - esize); + memmove(re, Add2Ptr(re, esize), bytes); + + return re; +} + +void indx_clear(struct ntfs_index *indx) +{ + run_close(&indx->alloc_run); + run_close(&indx->bitmap_run); +} + +int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, + const struct ATTRIB *attr, enum index_mutex_classed type) +{ + u32 t32; + const struct INDEX_ROOT *root = resident_data(attr); + + t32 = le32_to_cpu(attr->res.data_size); + if (t32 <= offsetof(struct INDEX_ROOT, ihdr) || + !index_hdr_check(&root->ihdr, + t32 - offsetof(struct INDEX_ROOT, ihdr))) { + goto out; + } + + /* Check root fields. */ + if (!root->index_block_clst) + goto out; + + indx->type = type; + indx->idx2vbn_bits = __ffs(root->index_block_clst); + + t32 = le32_to_cpu(root->index_block_size); + indx->index_bits = blksize_bits(t32); + + /* Check index record size. */ + if (t32 < sbi->cluster_size) { + /* Index record is smaller than a cluster, use 512 blocks. */ + if (t32 != root->index_block_clst * SECTOR_SIZE) + goto out; + + /* Check alignment to a cluster. */ + if ((sbi->cluster_size >> SECTOR_SHIFT) & + (root->index_block_clst - 1)) { + goto out; + } + + indx->vbn2vbo_bits = SECTOR_SHIFT; + } else { + /* Index record must be a multiple of cluster size. */ + if (t32 != root->index_block_clst << sbi->cluster_bits) + goto out; + + indx->vbn2vbo_bits = sbi->cluster_bits; + } + + init_rwsem(&indx->run_lock); + + indx->cmp = get_cmp_func(root); + if (!indx->cmp) + goto out; + + return 0; + +out: + ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); + return -EINVAL; +} + +static struct indx_node *indx_new(struct ntfs_index *indx, + struct ntfs_inode *ni, CLST vbn, + const __le64 *sub_vbn) +{ + int err; + struct NTFS_DE *e; + struct indx_node *r; + struct INDEX_HDR *hdr; + struct INDEX_BUFFER *index; + u64 vbo = (u64)vbn << indx->vbn2vbo_bits; + u32 bytes = 1u << indx->index_bits; + u16 fn; + u32 eo; + + r = kzalloc(sizeof(struct indx_node), GFP_NOFS); + if (!r) + return ERR_PTR(-ENOMEM); + + index = kzalloc(bytes, GFP_NOFS); + if (!index) { + kfree(r); + return ERR_PTR(-ENOMEM); + } + + err = ntfs_get_bh(ni->mi.sbi, &indx->alloc_run, vbo, bytes, &r->nb); + + if (err) { + kfree(index); + kfree(r); + return ERR_PTR(err); + } + + /* Create header. */ + index->rhdr.sign = NTFS_INDX_SIGNATURE; + index->rhdr.fix_off = cpu_to_le16(sizeof(struct INDEX_BUFFER)); // 0x28 + fn = (bytes >> SECTOR_SHIFT) + 1; // 9 + index->rhdr.fix_num = cpu_to_le16(fn); + index->vbn = cpu_to_le64(vbn); + hdr = &index->ihdr; + eo = ALIGN(sizeof(struct INDEX_BUFFER) + fn * sizeof(short), 8); + hdr->de_off = cpu_to_le32(eo); + + e = Add2Ptr(hdr, eo); + + if (sub_vbn) { + e->flags = NTFS_IE_LAST | NTFS_IE_HAS_SUBNODES; + e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64)); + hdr->used = + cpu_to_le32(eo + sizeof(struct NTFS_DE) + sizeof(u64)); + de_set_vbn_le(e, *sub_vbn); + hdr->flags = 1; + } else { + e->size = cpu_to_le16(sizeof(struct NTFS_DE)); + hdr->used = cpu_to_le32(eo + sizeof(struct NTFS_DE)); + e->flags = NTFS_IE_LAST; + } + + hdr->total = cpu_to_le32(bytes - offsetof(struct INDEX_BUFFER, ihdr)); + + r->index = index; + return r; +} + +struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, + struct ATTRIB **attr, struct mft_inode **mi) +{ + struct ATTR_LIST_ENTRY *le = NULL; + struct ATTRIB *a; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + + a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL, + mi); + if (!a) + return NULL; + + if (attr) + *attr = a; + + return resident_data_ex(a, sizeof(struct INDEX_ROOT)); +} + +static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni, + struct indx_node *node, int sync) +{ + struct INDEX_BUFFER *ib = node->index; + + return ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &node->nb, sync); +} + +/* + * indx_read + * + * If ntfs_readdir calls this function + * inode is shared locked and no ni_lock. + * Use rw_semaphore for read/write access to alloc_run. + */ +int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn, + struct indx_node **node) +{ + int err; + struct INDEX_BUFFER *ib; + struct runs_tree *run = &indx->alloc_run; + struct rw_semaphore *lock = &indx->run_lock; + u64 vbo = (u64)vbn << indx->vbn2vbo_bits; + u32 bytes = 1u << indx->index_bits; + struct indx_node *in = *node; + const struct INDEX_NAMES *name; + + if (!in) { + in = kzalloc(sizeof(struct indx_node), GFP_NOFS); + if (!in) + return -ENOMEM; + } else { + nb_put(&in->nb); + } + + ib = in->index; + if (!ib) { + ib = kmalloc(bytes, GFP_NOFS); + if (!ib) { + err = -ENOMEM; + goto out; + } + } + + down_read(lock); + err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb); + up_read(lock); + if (!err) + goto ok; + + if (err == -E_NTFS_FIXUP) + goto ok; + + if (err != -ENOENT) + goto out; + + name = &s_index_names[indx->type]; + down_write(lock); + err = attr_load_runs_range(ni, ATTR_ALLOC, name->name, name->name_len, + run, vbo, vbo + bytes); + up_write(lock); + if (err) + goto out; + + down_read(lock); + err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb); + up_read(lock); + if (err == -E_NTFS_FIXUP) + goto ok; + + if (err) + goto out; + +ok: + if (!index_buf_check(ib, bytes, &vbn)) { + ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); + ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + err = -EINVAL; + goto out; + } + + if (err == -E_NTFS_FIXUP) { + ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &in->nb, 0); + err = 0; + } + + /* check for index header length */ + if (offsetof(struct INDEX_BUFFER, ihdr) + ib->ihdr.used > bytes) { + err = -EINVAL; + goto out; + } + + in->index = ib; + *node = in; + +out: + if (err == -E_NTFS_CORRUPT) { + ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); + ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); + err = -EINVAL; + } + + if (ib != in->index) + kfree(ib); + + if (*node != in) { + nb_put(&in->nb); + kfree(in); + } + + return err; +} + +/* + * indx_find - Scan NTFS directory for given entry. + */ +int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct INDEX_ROOT *root, const void *key, size_t key_len, + const void *ctx, int *diff, struct NTFS_DE **entry, + struct ntfs_fnd *fnd) +{ + int err; + struct NTFS_DE *e; + struct indx_node *node; + + if (!root) + root = indx_get_root(&ni->dir, ni, NULL, NULL); + + if (!root) { + /* Should not happen. */ + return -EINVAL; + } + + /* Check cache. */ + e = fnd->level ? fnd->de[fnd->level - 1] : fnd->root_de; + if (e && !de_is_last(e) && + !(*indx->cmp)(key, key_len, e + 1, le16_to_cpu(e->key_size), ctx)) { + *entry = e; + *diff = 0; + return 0; + } + + /* Soft finder reset. */ + fnd_clear(fnd); + + /* Lookup entry that is <= to the search value. */ + e = hdr_find_e(indx, &root->ihdr, key, key_len, ctx, diff); + if (!e) + return -EINVAL; + + fnd->root_de = e; + + for (;;) { + node = NULL; + if (*diff >= 0 || !de_has_vcn_ex(e)) + break; + + /* Read next level. */ + err = indx_read(indx, ni, de_get_vbn(e), &node); + if (err) + return err; + + /* Lookup entry that is <= to the search value. */ + e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx, + diff); + if (!e) { + put_indx_node(node); + return -EINVAL; + } + + fnd_push(fnd, node, e); + } + + *entry = e; + return 0; +} + +int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct INDEX_ROOT *root, struct NTFS_DE **entry, + struct ntfs_fnd *fnd) +{ + int err; + struct indx_node *n = NULL; + struct NTFS_DE *e; + size_t iter = 0; + int level = fnd->level; + + if (!*entry) { + /* Start find. */ + e = hdr_first_de(&root->ihdr); + if (!e) + return 0; + fnd_clear(fnd); + fnd->root_de = e; + } else if (!level) { + if (de_is_last(fnd->root_de)) { + *entry = NULL; + return 0; + } + + e = hdr_next_de(&root->ihdr, fnd->root_de); + if (!e) + return -EINVAL; + fnd->root_de = e; + } else { + n = fnd->nodes[level - 1]; + e = fnd->de[level - 1]; + + if (de_is_last(e)) + goto pop_level; + + e = hdr_next_de(&n->index->ihdr, e); + if (!e) + return -EINVAL; + + fnd->de[level - 1] = e; + } + + /* Just to avoid tree cycle. */ +next_iter: + if (iter++ >= 1000) + return -EINVAL; + + while (de_has_vcn_ex(e)) { + if (le16_to_cpu(e->size) < + sizeof(struct NTFS_DE) + sizeof(u64)) { + if (n) { + fnd_pop(fnd); + kfree(n); + } + return -EINVAL; + } + + /* Read next level. */ + err = indx_read(indx, ni, de_get_vbn(e), &n); + if (err) + return err; + + /* Try next level. */ + e = hdr_first_de(&n->index->ihdr); + if (!e) { + kfree(n); + return -EINVAL; + } + + fnd_push(fnd, n, e); + } + + if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) { + *entry = e; + return 0; + } + +pop_level: + for (;;) { + if (!de_is_last(e)) + goto next_iter; + + /* Pop one level. */ + if (n) { + fnd_pop(fnd); + kfree(n); + } + + level = fnd->level; + + if (level) { + n = fnd->nodes[level - 1]; + e = fnd->de[level - 1]; + } else if (fnd->root_de) { + n = NULL; + e = fnd->root_de; + fnd->root_de = NULL; + } else { + *entry = NULL; + return 0; + } + + if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) { + *entry = e; + if (!fnd->root_de) + fnd->root_de = e; + return 0; + } + } +} + +int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct INDEX_ROOT *root, struct NTFS_DE **entry, + size_t *off, struct ntfs_fnd *fnd) +{ + int err; + struct indx_node *n = NULL; + struct NTFS_DE *e = NULL; + struct NTFS_DE *e2; + size_t bit; + CLST next_used_vbn; + CLST next_vbn; + u32 record_size = ni->mi.sbi->record_size; + + /* Use non sorted algorithm. */ + if (!*entry) { + /* This is the first call. */ + e = hdr_first_de(&root->ihdr); + if (!e) + return 0; + fnd_clear(fnd); + fnd->root_de = e; + + /* The first call with setup of initial element. */ + if (*off >= record_size) { + next_vbn = (((*off - record_size) >> indx->index_bits)) + << indx->idx2vbn_bits; + /* Jump inside cycle 'for'. */ + goto next; + } + + /* Start enumeration from root. */ + *off = 0; + } else if (!fnd->root_de) + return -EINVAL; + + for (;;) { + /* Check if current entry can be used. */ + if (e && le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) + goto ok; + + if (!fnd->level) { + /* Continue to enumerate root. */ + if (!de_is_last(fnd->root_de)) { + e = hdr_next_de(&root->ihdr, fnd->root_de); + if (!e) + return -EINVAL; + fnd->root_de = e; + continue; + } + + /* Start to enumerate indexes from 0. */ + next_vbn = 0; + } else { + /* Continue to enumerate indexes. */ + e2 = fnd->de[fnd->level - 1]; + + n = fnd->nodes[fnd->level - 1]; + + if (!de_is_last(e2)) { + e = hdr_next_de(&n->index->ihdr, e2); + if (!e) + return -EINVAL; + fnd->de[fnd->level - 1] = e; + continue; + } + + /* Continue with next index. */ + next_vbn = le64_to_cpu(n->index->vbn) + + root->index_block_clst; + } + +next: + /* Release current index. */ + if (n) { + fnd_pop(fnd); + put_indx_node(n); + n = NULL; + } + + /* Skip all free indexes. */ + bit = next_vbn >> indx->idx2vbn_bits; + err = indx_used_bit(indx, ni, &bit); + if (err == -ENOENT || bit == MINUS_ONE_T) { + /* No used indexes. */ + *entry = NULL; + return 0; + } + + next_used_vbn = bit << indx->idx2vbn_bits; + + /* Read buffer into memory. */ + err = indx_read(indx, ni, next_used_vbn, &n); + if (err) + return err; + + e = hdr_first_de(&n->index->ihdr); + fnd_push(fnd, n, e); + if (!e) + return -EINVAL; + } + +ok: + /* Return offset to restore enumerator if necessary. */ + if (!n) { + /* 'e' points in root, */ + *off = PtrOffset(&root->ihdr, e); + } else { + /* 'e' points in index, */ + *off = (le64_to_cpu(n->index->vbn) << indx->vbn2vbo_bits) + + record_size + PtrOffset(&n->index->ihdr, e); + } + + *entry = e; + return 0; +} + +/* + * indx_create_allocate - Create "Allocation + Bitmap" attributes. + */ +static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, + CLST *vbn) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *bitmap; + struct ATTRIB *alloc; + u32 data_size = 1u << indx->index_bits; + u32 alloc_size = ntfs_up_cluster(sbi, data_size); + CLST len = alloc_size >> sbi->cluster_bits; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + CLST alen; + struct runs_tree run; + + run_init(&run); + + err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, 0, &alen, 0, + NULL); + if (err) + goto out; + + err = ni_insert_nonresident(ni, ATTR_ALLOC, in->name, in->name_len, + &run, 0, len, 0, &alloc, NULL, NULL); + if (err) + goto out1; + + alloc->nres.valid_size = alloc->nres.data_size = cpu_to_le64(data_size); + + err = ni_insert_resident(ni, bitmap_size(1), ATTR_BITMAP, in->name, + in->name_len, &bitmap, NULL, NULL); + if (err) + goto out2; + + if (in->name == I30_NAME) { + ni->vfs_inode.i_size = data_size; + inode_set_bytes(&ni->vfs_inode, alloc_size); + } + + memcpy(&indx->alloc_run, &run, sizeof(run)); + + *vbn = 0; + + return 0; + +out2: + mi_remove_attr(NULL, &ni->mi, alloc); + +out1: + run_deallocate(sbi, &run, false); + +out: + return err; +} + +/* + * indx_add_allocate - Add clusters to index. + */ +static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, + CLST *vbn) +{ + int err; + size_t bit; + u64 data_size; + u64 bmp_size, bmp_size_v; + struct ATTRIB *bmp, *alloc; + struct mft_inode *mi; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + + err = indx_find_free(indx, ni, &bit, &bmp); + if (err) + goto out1; + + if (bit != MINUS_ONE_T) { + bmp = NULL; + } else { + if (bmp->non_res) { + bmp_size = le64_to_cpu(bmp->nres.data_size); + bmp_size_v = le64_to_cpu(bmp->nres.valid_size); + } else { + bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size); + } + + bit = bmp_size << 3; + } + + data_size = (u64)(bit + 1) << indx->index_bits; + + if (bmp) { + /* Increase bitmap. */ + err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, + &indx->bitmap_run, bitmap_size(bit + 1), + NULL, true, NULL); + if (err) + goto out1; + } + + alloc = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, in->name, in->name_len, + NULL, &mi); + if (!alloc) { + err = -EINVAL; + if (bmp) + goto out2; + goto out1; + } + + /* Increase allocation. */ + err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, + &indx->alloc_run, data_size, &data_size, true, + NULL); + if (err) { + if (bmp) + goto out2; + goto out1; + } + + *vbn = bit << indx->idx2vbn_bits; + + return 0; + +out2: + /* Ops. No space? */ + attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, + &indx->bitmap_run, bmp_size, &bmp_size_v, false, NULL); + +out1: + return err; +} + +/* + * indx_insert_into_root - Attempt to insert an entry into the index root. + * + * @undo - True if we undoing previous remove. + * If necessary, it will twiddle the index b-tree. + */ +static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct NTFS_DE *new_de, + struct NTFS_DE *root_de, const void *ctx, + struct ntfs_fnd *fnd, bool undo) +{ + int err = 0; + struct NTFS_DE *e, *e0, *re; + struct mft_inode *mi; + struct ATTRIB *attr; + struct INDEX_HDR *hdr; + struct indx_node *n; + CLST new_vbn; + __le64 *sub_vbn, t_vbn; + u16 new_de_size; + u32 hdr_used, hdr_total, asize, to_move; + u32 root_size, new_root_size; + struct ntfs_sb_info *sbi; + int ds_root; + struct INDEX_ROOT *root, *a_root; + + /* Get the record this root placed in. */ + root = indx_get_root(indx, ni, &attr, &mi); + if (!root) + return -EINVAL; + + /* + * Try easy case: + * hdr_insert_de will succeed if there's + * room the root for the new entry. + */ + hdr = &root->ihdr; + sbi = ni->mi.sbi; + new_de_size = le16_to_cpu(new_de->size); + hdr_used = le32_to_cpu(hdr->used); + hdr_total = le32_to_cpu(hdr->total); + asize = le32_to_cpu(attr->size); + root_size = le32_to_cpu(attr->res.data_size); + + ds_root = new_de_size + hdr_used - hdr_total; + + /* If 'undo' is set then reduce requirements. */ + if ((undo || asize + ds_root < sbi->max_bytes_per_attr) && + mi_resize_attr(mi, attr, ds_root)) { + hdr->total = cpu_to_le32(hdr_total + ds_root); + e = hdr_insert_de(indx, hdr, new_de, root_de, ctx); + WARN_ON(!e); + fnd_clear(fnd); + fnd->root_de = e; + + return 0; + } + + /* Make a copy of root attribute to restore if error. */ + a_root = kmemdup(attr, asize, GFP_NOFS); + if (!a_root) + return -ENOMEM; + + /* + * Copy all the non-end entries from + * the index root to the new buffer. + */ + to_move = 0; + e0 = hdr_first_de(hdr); + + /* Calculate the size to copy. */ + for (e = e0;; e = hdr_next_de(hdr, e)) { + if (!e) { + err = -EINVAL; + goto out_free_root; + } + + if (de_is_last(e)) + break; + to_move += le16_to_cpu(e->size); + } + + if (!to_move) { + re = NULL; + } else { + re = kmemdup(e0, to_move, GFP_NOFS); + if (!re) { + err = -ENOMEM; + goto out_free_root; + } + } + + sub_vbn = NULL; + if (de_has_vcn(e)) { + t_vbn = de_get_vbn_le(e); + sub_vbn = &t_vbn; + } + + new_root_size = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE) + + sizeof(u64); + ds_root = new_root_size - root_size; + + if (ds_root > 0 && asize + ds_root > sbi->max_bytes_per_attr) { + /* Make root external. */ + err = -EOPNOTSUPP; + goto out_free_re; + } + + if (ds_root) + mi_resize_attr(mi, attr, ds_root); + + /* Fill first entry (vcn will be set later). */ + e = (struct NTFS_DE *)(root + 1); + memset(e, 0, sizeof(struct NTFS_DE)); + e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64)); + e->flags = NTFS_IE_HAS_SUBNODES | NTFS_IE_LAST; + + hdr->flags = 1; + hdr->used = hdr->total = + cpu_to_le32(new_root_size - offsetof(struct INDEX_ROOT, ihdr)); + + fnd->root_de = hdr_first_de(hdr); + mi->dirty = true; + + /* Create alloc and bitmap attributes (if not). */ + err = run_is_empty(&indx->alloc_run) + ? indx_create_allocate(indx, ni, &new_vbn) + : indx_add_allocate(indx, ni, &new_vbn); + + /* Layout of record may be changed, so rescan root. */ + root = indx_get_root(indx, ni, &attr, &mi); + if (!root) { + /* Bug? */ + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + err = -EINVAL; + goto out_free_re; + } + + if (err) { + /* Restore root. */ + if (mi_resize_attr(mi, attr, -ds_root)) { + memcpy(attr, a_root, asize); + } else { + /* Bug? */ + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + } + goto out_free_re; + } + + e = (struct NTFS_DE *)(root + 1); + *(__le64 *)(e + 1) = cpu_to_le64(new_vbn); + mi->dirty = true; + + /* Now we can create/format the new buffer and copy the entries into. */ + n = indx_new(indx, ni, new_vbn, sub_vbn); + if (IS_ERR(n)) { + err = PTR_ERR(n); + goto out_free_re; + } + + hdr = &n->index->ihdr; + hdr_used = le32_to_cpu(hdr->used); + hdr_total = le32_to_cpu(hdr->total); + + /* Copy root entries into new buffer. */ + hdr_insert_head(hdr, re, to_move); + + /* Update bitmap attribute. */ + indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits); + + /* Check if we can insert new entry new index buffer. */ + if (hdr_used + new_de_size > hdr_total) { + /* + * This occurs if MFT record is the same or bigger than index + * buffer. Move all root new index and have no space to add + * new entry classic case when MFT record is 1K and index + * buffer 4K the problem should not occurs. + */ + kfree(re); + indx_write(indx, ni, n, 0); + + put_indx_node(n); + fnd_clear(fnd); + err = indx_insert_entry(indx, ni, new_de, ctx, fnd, undo); + goto out_free_root; + } + + /* + * Now root is a parent for new index buffer. + * Insert NewEntry a new buffer. + */ + e = hdr_insert_de(indx, hdr, new_de, NULL, ctx); + if (!e) { + err = -EINVAL; + goto out_put_n; + } + fnd_push(fnd, n, e); + + /* Just write updates index into disk. */ + indx_write(indx, ni, n, 0); + + n = NULL; + +out_put_n: + put_indx_node(n); +out_free_re: + kfree(re); +out_free_root: + kfree(a_root); + return err; +} + +/* + * indx_insert_into_buffer + * + * Attempt to insert an entry into an Index Allocation Buffer. + * If necessary, it will split the buffer. + */ +static int +indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, + struct INDEX_ROOT *root, const struct NTFS_DE *new_de, + const void *ctx, int level, struct ntfs_fnd *fnd) +{ + int err; + const struct NTFS_DE *sp; + struct NTFS_DE *e, *de_t, *up_e; + struct indx_node *n2; + struct indx_node *n1 = fnd->nodes[level]; + struct INDEX_HDR *hdr1 = &n1->index->ihdr; + struct INDEX_HDR *hdr2; + u32 to_copy, used; + CLST new_vbn; + __le64 t_vbn, *sub_vbn; + u16 sp_size; + + /* Try the most easy case. */ + e = fnd->level - 1 == level ? fnd->de[level] : NULL; + e = hdr_insert_de(indx, hdr1, new_de, e, ctx); + fnd->de[level] = e; + if (e) { + /* Just write updated index into disk. */ + indx_write(indx, ni, n1, 0); + return 0; + } + + /* + * No space to insert into buffer. Split it. + * To split we: + * - Save split point ('cause index buffers will be changed) + * - Allocate NewBuffer and copy all entries <= sp into new buffer + * - Remove all entries (sp including) from TargetBuffer + * - Insert NewEntry into left or right buffer (depending on sp <=> + * NewEntry) + * - Insert sp into parent buffer (or root) + * - Make sp a parent for new buffer + */ + sp = hdr_find_split(hdr1); + if (!sp) + return -EINVAL; + + sp_size = le16_to_cpu(sp->size); + up_e = kmalloc(sp_size + sizeof(u64), GFP_NOFS); + if (!up_e) + return -ENOMEM; + memcpy(up_e, sp, sp_size); + + if (!hdr1->flags) { + up_e->flags |= NTFS_IE_HAS_SUBNODES; + up_e->size = cpu_to_le16(sp_size + sizeof(u64)); + sub_vbn = NULL; + } else { + t_vbn = de_get_vbn_le(up_e); + sub_vbn = &t_vbn; + } + + /* Allocate on disk a new index allocation buffer. */ + err = indx_add_allocate(indx, ni, &new_vbn); + if (err) + goto out; + + /* Allocate and format memory a new index buffer. */ + n2 = indx_new(indx, ni, new_vbn, sub_vbn); + if (IS_ERR(n2)) { + err = PTR_ERR(n2); + goto out; + } + + hdr2 = &n2->index->ihdr; + + /* Make sp a parent for new buffer. */ + de_set_vbn(up_e, new_vbn); + + /* Copy all the entries <= sp into the new buffer. */ + de_t = hdr_first_de(hdr1); + to_copy = PtrOffset(de_t, sp); + hdr_insert_head(hdr2, de_t, to_copy); + + /* Remove all entries (sp including) from hdr1. */ + used = le32_to_cpu(hdr1->used) - to_copy - sp_size; + memmove(de_t, Add2Ptr(sp, sp_size), used - le32_to_cpu(hdr1->de_off)); + hdr1->used = cpu_to_le32(used); + + /* + * Insert new entry into left or right buffer + * (depending on sp <=> new_de). + */ + hdr_insert_de(indx, + (*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size), + up_e + 1, le16_to_cpu(up_e->key_size), + ctx) < 0 + ? hdr2 + : hdr1, + new_de, NULL, ctx); + + indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits); + + indx_write(indx, ni, n1, 0); + indx_write(indx, ni, n2, 0); + + put_indx_node(n2); + + /* + * We've finished splitting everybody, so we are ready to + * insert the promoted entry into the parent. + */ + if (!level) { + /* Insert in root. */ + err = indx_insert_into_root(indx, ni, up_e, NULL, ctx, fnd, 0); + if (err) + goto out; + } else { + /* + * The target buffer's parent is another index buffer. + * TODO: Remove recursion. + */ + err = indx_insert_into_buffer(indx, ni, root, up_e, ctx, + level - 1, fnd); + if (err) + goto out; + } + +out: + kfree(up_e); + + return err; +} + +/* + * indx_insert_entry - Insert new entry into index. + * + * @undo - True if we undoing previous remove. + */ +int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct NTFS_DE *new_de, const void *ctx, + struct ntfs_fnd *fnd, bool undo) +{ + int err; + int diff; + struct NTFS_DE *e; + struct ntfs_fnd *fnd_a = NULL; + struct INDEX_ROOT *root; + + if (!fnd) { + fnd_a = fnd_get(); + if (!fnd_a) { + err = -ENOMEM; + goto out1; + } + fnd = fnd_a; + } + + root = indx_get_root(indx, ni, NULL, NULL); + if (!root) { + err = -EINVAL; + goto out; + } + + if (fnd_is_empty(fnd)) { + /* + * Find the spot the tree where we want to + * insert the new entry. + */ + err = indx_find(indx, ni, root, new_de + 1, + le16_to_cpu(new_de->key_size), ctx, &diff, &e, + fnd); + if (err) + goto out; + + if (!diff) { + err = -EEXIST; + goto out; + } + } + + if (!fnd->level) { + /* + * The root is also a leaf, so we'll insert the + * new entry into it. + */ + err = indx_insert_into_root(indx, ni, new_de, fnd->root_de, ctx, + fnd, undo); + if (err) + goto out; + } else { + /* + * Found a leaf buffer, so we'll insert the new entry into it. + */ + err = indx_insert_into_buffer(indx, ni, root, new_de, ctx, + fnd->level - 1, fnd); + if (err) + goto out; + } + +out: + fnd_put(fnd_a); +out1: + return err; +} + +/* + * indx_find_buffer - Locate a buffer from the tree. + */ +static struct indx_node *indx_find_buffer(struct ntfs_index *indx, + struct ntfs_inode *ni, + const struct INDEX_ROOT *root, + __le64 vbn, struct indx_node *n) +{ + int err; + const struct NTFS_DE *e; + struct indx_node *r; + const struct INDEX_HDR *hdr = n ? &n->index->ihdr : &root->ihdr; + + /* Step 1: Scan one level. */ + for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) { + if (!e) + return ERR_PTR(-EINVAL); + + if (de_has_vcn(e) && vbn == de_get_vbn_le(e)) + return n; + + if (de_is_last(e)) + break; + } + + /* Step2: Do recursion. */ + e = Add2Ptr(hdr, le32_to_cpu(hdr->de_off)); + for (;;) { + if (de_has_vcn_ex(e)) { + err = indx_read(indx, ni, de_get_vbn(e), &n); + if (err) + return ERR_PTR(err); + + r = indx_find_buffer(indx, ni, root, vbn, n); + if (r) + return r; + } + + if (de_is_last(e)) + break; + + e = Add2Ptr(e, le16_to_cpu(e->size)); + } + + return NULL; +} + +/* + * indx_shrink - Deallocate unused tail indexes. + */ +static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni, + size_t bit) +{ + int err = 0; + u64 bpb, new_data; + size_t nbits; + struct ATTRIB *b; + struct ATTR_LIST_ENTRY *le = NULL; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + + b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, + NULL, NULL); + + if (!b) + return -ENOENT; + + if (!b->non_res) { + unsigned long pos; + const unsigned long *bm = resident_data(b); + + nbits = (size_t)le32_to_cpu(b->res.data_size) * 8; + + if (bit >= nbits) + return 0; + + pos = find_next_bit(bm, nbits, bit); + if (pos < nbits) + return 0; + } else { + size_t used = MINUS_ONE_T; + + nbits = le64_to_cpu(b->nres.data_size) * 8; + + if (bit >= nbits) + return 0; + + err = scan_nres_bitmap(ni, b, indx, bit, &scan_for_used, &used); + if (err) + return err; + + if (used != MINUS_ONE_T) + return 0; + } + + new_data = (u64)bit << indx->index_bits; + + err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, + &indx->alloc_run, new_data, &new_data, false, NULL); + if (err) + return err; + + bpb = bitmap_size(bit); + if (bpb * 8 == nbits) + return 0; + + err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, + &indx->bitmap_run, bpb, &bpb, false, NULL); + + return err; +} + +static int indx_free_children(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct NTFS_DE *e, bool trim) +{ + int err; + struct indx_node *n = NULL; + struct INDEX_HDR *hdr; + CLST vbn = de_get_vbn(e); + size_t i; + + err = indx_read(indx, ni, vbn, &n); + if (err) + return err; + + hdr = &n->index->ihdr; + /* First, recurse into the children, if any. */ + if (hdr_has_subnode(hdr)) { + for (e = hdr_first_de(hdr); e; e = hdr_next_de(hdr, e)) { + indx_free_children(indx, ni, e, false); + if (de_is_last(e)) + break; + } + } + + put_indx_node(n); + + i = vbn >> indx->idx2vbn_bits; + /* + * We've gotten rid of the children; add this buffer to the free list. + */ + indx_mark_free(indx, ni, i); + + if (!trim) + return 0; + + /* + * If there are no used indexes after current free index + * then we can truncate allocation and bitmap. + * Use bitmap to estimate the case. + */ + indx_shrink(indx, ni, i + 1); + return 0; +} + +/* + * indx_get_entry_to_replace + * + * Find a replacement entry for a deleted entry. + * Always returns a node entry: + * NTFS_IE_HAS_SUBNODES is set the flags and the size includes the sub_vcn. + */ +static int indx_get_entry_to_replace(struct ntfs_index *indx, + struct ntfs_inode *ni, + const struct NTFS_DE *de_next, + struct NTFS_DE **de_to_replace, + struct ntfs_fnd *fnd) +{ + int err; + int level = -1; + CLST vbn; + struct NTFS_DE *e, *te, *re; + struct indx_node *n; + struct INDEX_BUFFER *ib; + + *de_to_replace = NULL; + + /* Find first leaf entry down from de_next. */ + vbn = de_get_vbn(de_next); + for (;;) { + n = NULL; + err = indx_read(indx, ni, vbn, &n); + if (err) + goto out; + + e = hdr_first_de(&n->index->ihdr); + fnd_push(fnd, n, e); + + if (!de_is_last(e)) { + /* + * This buffer is non-empty, so its first entry + * could be used as the replacement entry. + */ + level = fnd->level - 1; + } + + if (!de_has_vcn(e)) + break; + + /* This buffer is a node. Continue to go down. */ + vbn = de_get_vbn(e); + } + + if (level == -1) + goto out; + + n = fnd->nodes[level]; + te = hdr_first_de(&n->index->ihdr); + /* Copy the candidate entry into the replacement entry buffer. */ + re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS); + if (!re) { + err = -ENOMEM; + goto out; + } + + *de_to_replace = re; + memcpy(re, te, le16_to_cpu(te->size)); + + if (!de_has_vcn(re)) { + /* + * The replacement entry we found doesn't have a sub_vcn. + * increase its size to hold one. + */ + le16_add_cpu(&re->size, sizeof(u64)); + re->flags |= NTFS_IE_HAS_SUBNODES; + } else { + /* + * The replacement entry we found was a node entry, which + * means that all its child buffers are empty. Return them + * to the free pool. + */ + indx_free_children(indx, ni, te, true); + } + + /* + * Expunge the replacement entry from its former location, + * and then write that buffer. + */ + ib = n->index; + e = hdr_delete_de(&ib->ihdr, te); + + fnd->de[level] = e; + indx_write(indx, ni, n, 0); + + /* Check to see if this action created an empty leaf. */ + if (ib_is_leaf(ib) && ib_is_empty(ib)) + return 0; + +out: + fnd_clear(fnd); + return err; +} + +/* + * indx_delete_entry - Delete an entry from the index. + */ +int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, + const void *key, u32 key_len, const void *ctx) +{ + int err, diff; + struct INDEX_ROOT *root; + struct INDEX_HDR *hdr; + struct ntfs_fnd *fnd, *fnd2; + struct INDEX_BUFFER *ib; + struct NTFS_DE *e, *re, *next, *prev, *me; + struct indx_node *n, *n2d = NULL; + __le64 sub_vbn; + int level, level2; + struct ATTRIB *attr; + struct mft_inode *mi; + u32 e_size, root_size, new_root_size; + size_t trim_bit; + const struct INDEX_NAMES *in; + + fnd = fnd_get(); + if (!fnd) { + err = -ENOMEM; + goto out2; + } + + fnd2 = fnd_get(); + if (!fnd2) { + err = -ENOMEM; + goto out1; + } + + root = indx_get_root(indx, ni, &attr, &mi); + if (!root) { + err = -EINVAL; + goto out; + } + + /* Locate the entry to remove. */ + err = indx_find(indx, ni, root, key, key_len, ctx, &diff, &e, fnd); + if (err) + goto out; + + if (!e || diff) { + err = -ENOENT; + goto out; + } + + level = fnd->level; + + if (level) { + n = fnd->nodes[level - 1]; + e = fnd->de[level - 1]; + ib = n->index; + hdr = &ib->ihdr; + } else { + hdr = &root->ihdr; + e = fnd->root_de; + n = NULL; + } + + e_size = le16_to_cpu(e->size); + + if (!de_has_vcn_ex(e)) { + /* The entry to delete is a leaf, so we can just rip it out. */ + hdr_delete_de(hdr, e); + + if (!level) { + hdr->total = hdr->used; + + /* Shrink resident root attribute. */ + mi_resize_attr(mi, attr, 0 - e_size); + goto out; + } + + indx_write(indx, ni, n, 0); + + /* + * Check to see if removing that entry made + * the leaf empty. + */ + if (ib_is_leaf(ib) && ib_is_empty(ib)) { + fnd_pop(fnd); + fnd_push(fnd2, n, e); + } + } else { + /* + * The entry we wish to delete is a node buffer, so we + * have to find a replacement for it. + */ + next = de_get_next(e); + + err = indx_get_entry_to_replace(indx, ni, next, &re, fnd2); + if (err) + goto out; + + if (re) { + de_set_vbn_le(re, de_get_vbn_le(e)); + hdr_delete_de(hdr, e); + + err = level ? indx_insert_into_buffer(indx, ni, root, + re, ctx, + fnd->level - 1, + fnd) + : indx_insert_into_root(indx, ni, re, e, + ctx, fnd, 0); + kfree(re); + + if (err) + goto out; + } else { + /* + * There is no replacement for the current entry. + * This means that the subtree rooted at its node + * is empty, and can be deleted, which turn means + * that the node can just inherit the deleted + * entry sub_vcn. + */ + indx_free_children(indx, ni, next, true); + + de_set_vbn_le(next, de_get_vbn_le(e)); + hdr_delete_de(hdr, e); + if (level) { + indx_write(indx, ni, n, 0); + } else { + hdr->total = hdr->used; + + /* Shrink resident root attribute. */ + mi_resize_attr(mi, attr, 0 - e_size); + } + } + } + + /* Delete a branch of tree. */ + if (!fnd2 || !fnd2->level) + goto out; + + /* Reinit root 'cause it can be changed. */ + root = indx_get_root(indx, ni, &attr, &mi); + if (!root) { + err = -EINVAL; + goto out; + } + + n2d = NULL; + sub_vbn = fnd2->nodes[0]->index->vbn; + level2 = 0; + level = fnd->level; + + hdr = level ? &fnd->nodes[level - 1]->index->ihdr : &root->ihdr; + + /* Scan current level. */ + for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) { + if (!e) { + err = -EINVAL; + goto out; + } + + if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e)) + break; + + if (de_is_last(e)) { + e = NULL; + break; + } + } + + if (!e) { + /* Do slow search from root. */ + struct indx_node *in; + + fnd_clear(fnd); + + in = indx_find_buffer(indx, ni, root, sub_vbn, NULL); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto out; + } + + if (in) + fnd_push(fnd, in, NULL); + } + + /* Merge fnd2 -> fnd. */ + for (level = 0; level < fnd2->level; level++) { + fnd_push(fnd, fnd2->nodes[level], fnd2->de[level]); + fnd2->nodes[level] = NULL; + } + fnd2->level = 0; + + hdr = NULL; + for (level = fnd->level; level; level--) { + struct indx_node *in = fnd->nodes[level - 1]; + + ib = in->index; + if (ib_is_empty(ib)) { + sub_vbn = ib->vbn; + } else { + hdr = &ib->ihdr; + n2d = in; + level2 = level; + break; + } + } + + if (!hdr) + hdr = &root->ihdr; + + e = hdr_first_de(hdr); + if (!e) { + err = -EINVAL; + goto out; + } + + if (hdr != &root->ihdr || !de_is_last(e)) { + prev = NULL; + while (!de_is_last(e)) { + if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e)) + break; + prev = e; + e = hdr_next_de(hdr, e); + if (!e) { + err = -EINVAL; + goto out; + } + } + + if (sub_vbn != de_get_vbn_le(e)) { + /* + * Didn't find the parent entry, although this buffer + * is the parent trail. Something is corrupt. + */ + err = -EINVAL; + goto out; + } + + if (de_is_last(e)) { + /* + * Since we can't remove the end entry, we'll remove + * its predecessor instead. This means we have to + * transfer the predecessor's sub_vcn to the end entry. + * Note: This index block is not empty, so the + * predecessor must exist. + */ + if (!prev) { + err = -EINVAL; + goto out; + } + + if (de_has_vcn(prev)) { + de_set_vbn_le(e, de_get_vbn_le(prev)); + } else if (de_has_vcn(e)) { + le16_sub_cpu(&e->size, sizeof(u64)); + e->flags &= ~NTFS_IE_HAS_SUBNODES; + le32_sub_cpu(&hdr->used, sizeof(u64)); + } + e = prev; + } + + /* + * Copy the current entry into a temporary buffer (stripping + * off its down-pointer, if any) and delete it from the current + * buffer or root, as appropriate. + */ + e_size = le16_to_cpu(e->size); + me = kmemdup(e, e_size, GFP_NOFS); + if (!me) { + err = -ENOMEM; + goto out; + } + + if (de_has_vcn(me)) { + me->flags &= ~NTFS_IE_HAS_SUBNODES; + le16_sub_cpu(&me->size, sizeof(u64)); + } + + hdr_delete_de(hdr, e); + + if (hdr == &root->ihdr) { + level = 0; + hdr->total = hdr->used; + + /* Shrink resident root attribute. */ + mi_resize_attr(mi, attr, 0 - e_size); + } else { + indx_write(indx, ni, n2d, 0); + level = level2; + } + + /* Mark unused buffers as free. */ + trim_bit = -1; + for (; level < fnd->level; level++) { + ib = fnd->nodes[level]->index; + if (ib_is_empty(ib)) { + size_t k = le64_to_cpu(ib->vbn) >> + indx->idx2vbn_bits; + + indx_mark_free(indx, ni, k); + if (k < trim_bit) + trim_bit = k; + } + } + + fnd_clear(fnd); + /*fnd->root_de = NULL;*/ + + /* + * Re-insert the entry into the tree. + * Find the spot the tree where we want to insert the new entry. + */ + err = indx_insert_entry(indx, ni, me, ctx, fnd, 0); + kfree(me); + if (err) + goto out; + + if (trim_bit != -1) + indx_shrink(indx, ni, trim_bit); + } else { + /* + * This tree needs to be collapsed down to an empty root. + * Recreate the index root as an empty leaf and free all + * the bits the index allocation bitmap. + */ + fnd_clear(fnd); + fnd_clear(fnd2); + + in = &s_index_names[indx->type]; + + err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, + &indx->alloc_run, 0, NULL, false, NULL); + err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len, + false, NULL); + run_close(&indx->alloc_run); + + err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, + &indx->bitmap_run, 0, NULL, false, NULL); + err = ni_remove_attr(ni, ATTR_BITMAP, in->name, in->name_len, + false, NULL); + run_close(&indx->bitmap_run); + + root = indx_get_root(indx, ni, &attr, &mi); + if (!root) { + err = -EINVAL; + goto out; + } + + root_size = le32_to_cpu(attr->res.data_size); + new_root_size = + sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE); + + if (new_root_size != root_size && + !mi_resize_attr(mi, attr, new_root_size - root_size)) { + err = -EINVAL; + goto out; + } + + /* Fill first entry. */ + e = (struct NTFS_DE *)(root + 1); + e->ref.low = 0; + e->ref.high = 0; + e->ref.seq = 0; + e->size = cpu_to_le16(sizeof(struct NTFS_DE)); + e->flags = NTFS_IE_LAST; // 0x02 + e->key_size = 0; + e->res = 0; + + hdr = &root->ihdr; + hdr->flags = 0; + hdr->used = hdr->total = cpu_to_le32( + new_root_size - offsetof(struct INDEX_ROOT, ihdr)); + mi->dirty = true; + } + +out: + fnd_put(fnd2); +out1: + fnd_put(fnd); +out2: + return err; +} + +/* + * Update duplicated information in directory entry + * 'dup' - info from MFT record + */ +int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, + const struct ATTR_FILE_NAME *fname, + const struct NTFS_DUP_INFO *dup, int sync) +{ + int err, diff; + struct NTFS_DE *e = NULL; + struct ATTR_FILE_NAME *e_fname; + struct ntfs_fnd *fnd; + struct INDEX_ROOT *root; + struct mft_inode *mi; + struct ntfs_index *indx = &ni->dir; + + fnd = fnd_get(); + if (!fnd) + return -ENOMEM; + + root = indx_get_root(indx, ni, NULL, &mi); + if (!root) { + err = -EINVAL; + goto out; + } + + /* Find entry in directory. */ + err = indx_find(indx, ni, root, fname, fname_full_size(fname), sbi, + &diff, &e, fnd); + if (err) + goto out; + + if (!e) { + err = -EINVAL; + goto out; + } + + if (diff) { + err = -EINVAL; + goto out; + } + + e_fname = (struct ATTR_FILE_NAME *)(e + 1); + + if (!memcmp(&e_fname->dup, dup, sizeof(*dup))) { + /* + * Nothing to update in index! Try to avoid this call. + */ + goto out; + } + + memcpy(&e_fname->dup, dup, sizeof(*dup)); + + if (fnd->level) { + /* Directory entry in index. */ + err = indx_write(indx, ni, fnd->nodes[fnd->level - 1], sync); + } else { + /* Directory entry in directory MFT record. */ + mi->dirty = true; + if (sync) + err = mi_write(mi, 1); + else + mark_inode_dirty(&ni->vfs_inode); + } + +out: + fnd_put(fnd); + return err; +} diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c new file mode 100644 index 000000000..dc937089a --- /dev/null +++ b/fs/ntfs3/inode.c @@ -0,0 +1,1970 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/nls.h> +#include <linux/uio.h> +#include <linux/writeback.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * ntfs_read_mft - Read record and parses MFT. + */ +static struct inode *ntfs_read_mft(struct inode *inode, + const struct cpu_str *name, + const struct MFT_REF *ref) +{ + int err = 0; + struct ntfs_inode *ni = ntfs_i(inode); + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + mode_t mode = 0; + struct ATTR_STD_INFO5 *std5 = NULL; + struct ATTR_LIST_ENTRY *le; + struct ATTRIB *attr; + bool is_match = false; + bool is_root = false; + bool is_dir; + unsigned long ino = inode->i_ino; + u32 rp_fa = 0, asize, t32; + u16 roff, rsize, names = 0; + const struct ATTR_FILE_NAME *fname = NULL; + const struct INDEX_ROOT *root; + struct REPARSE_DATA_BUFFER rp; // 0x18 bytes + u64 t64; + struct MFT_REC *rec; + struct runs_tree *run; + + inode->i_op = NULL; + /* Setup 'uid' and 'gid' */ + inode->i_uid = sbi->options->fs_uid; + inode->i_gid = sbi->options->fs_gid; + + err = mi_init(&ni->mi, sbi, ino); + if (err) + goto out; + + if (!sbi->mft.ni && ino == MFT_REC_MFT && !sb->s_root) { + t64 = sbi->mft.lbo >> sbi->cluster_bits; + t32 = bytes_to_cluster(sbi, MFT_REC_VOL * sbi->record_size); + sbi->mft.ni = ni; + init_rwsem(&ni->file.run_lock); + + if (!run_add_entry(&ni->file.run, 0, t64, t32, true)) { + err = -ENOMEM; + goto out; + } + } + + err = mi_read(&ni->mi, ino == MFT_REC_MFT); + + if (err) + goto out; + + rec = ni->mi.mrec; + + if (sbi->flags & NTFS_FLAGS_LOG_REPLAYING) { + ; + } else if (ref->seq != rec->seq) { + err = -EINVAL; + ntfs_err(sb, "MFT: r=%lx, expect seq=%x instead of %x!", ino, + le16_to_cpu(ref->seq), le16_to_cpu(rec->seq)); + goto out; + } else if (!is_rec_inuse(rec)) { + err = -ESTALE; + ntfs_err(sb, "Inode r=%x is not in use!", (u32)ino); + goto out; + } + + if (le32_to_cpu(rec->total) != sbi->record_size) { + /* Bad inode? */ + err = -EINVAL; + goto out; + } + + if (!is_rec_base(rec)) { + err = -EINVAL; + goto out; + } + + /* Record should contain $I30 root. */ + is_dir = rec->flags & RECORD_FLAG_DIR; + + /* MFT_REC_MFT is not a dir */ + if (is_dir && ino == MFT_REC_MFT) { + err = -EINVAL; + goto out; + } + + inode->i_generation = le16_to_cpu(rec->seq); + + /* Enumerate all struct Attributes MFT. */ + le = NULL; + attr = NULL; + + /* + * To reduce tab pressure use goto instead of + * while( (attr = ni_enum_attr_ex(ni, attr, &le, NULL) )) + */ +next_attr: + run = NULL; + err = -EINVAL; + attr = ni_enum_attr_ex(ni, attr, &le, NULL); + if (!attr) + goto end_enum; + + if (le && le->vcn) { + /* This is non primary attribute segment. Ignore if not MFT. */ + if (ino != MFT_REC_MFT || attr->type != ATTR_DATA) + goto next_attr; + + run = &ni->file.run; + asize = le32_to_cpu(attr->size); + goto attr_unpack_run; + } + + roff = attr->non_res ? 0 : le16_to_cpu(attr->res.data_off); + rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size); + asize = le32_to_cpu(attr->size); + + if (le16_to_cpu(attr->name_off) + attr->name_len > asize) + goto out; + + if (attr->non_res) { + t64 = le64_to_cpu(attr->nres.alloc_size); + if (le64_to_cpu(attr->nres.data_size) > t64 || + le64_to_cpu(attr->nres.valid_size) > t64) + goto out; + } + + switch (attr->type) { + case ATTR_STD: + if (attr->non_res || + asize < sizeof(struct ATTR_STD_INFO) + roff || + rsize < sizeof(struct ATTR_STD_INFO)) + goto out; + + if (std5) + goto next_attr; + + std5 = Add2Ptr(attr, roff); + +#ifdef STATX_BTIME + nt2kernel(std5->cr_time, &ni->i_crtime); +#endif + nt2kernel(std5->a_time, &inode->i_atime); + nt2kernel(std5->c_time, &inode->i_ctime); + nt2kernel(std5->m_time, &inode->i_mtime); + + ni->std_fa = std5->fa; + + if (asize >= sizeof(struct ATTR_STD_INFO5) + roff && + rsize >= sizeof(struct ATTR_STD_INFO5)) + ni->std_security_id = std5->security_id; + goto next_attr; + + case ATTR_LIST: + if (attr->name_len || le || ino == MFT_REC_LOG) + goto out; + + err = ntfs_load_attr_list(ni, attr); + if (err) + goto out; + + le = NULL; + attr = NULL; + goto next_attr; + + case ATTR_NAME: + if (attr->non_res || asize < SIZEOF_ATTRIBUTE_FILENAME + roff || + rsize < SIZEOF_ATTRIBUTE_FILENAME) + goto out; + + fname = Add2Ptr(attr, roff); + if (fname->type == FILE_NAME_DOS) + goto next_attr; + + names += 1; + if (name && name->len == fname->name_len && + !ntfs_cmp_names_cpu(name, (struct le_str *)&fname->name_len, + NULL, false)) + is_match = true; + + goto next_attr; + + case ATTR_DATA: + if (is_dir) { + /* Ignore data attribute in dir record. */ + goto next_attr; + } + + if (ino == MFT_REC_BADCLUST && !attr->non_res) + goto next_attr; + + if (attr->name_len && + ((ino != MFT_REC_BADCLUST || !attr->non_res || + attr->name_len != ARRAY_SIZE(BAD_NAME) || + memcmp(attr_name(attr), BAD_NAME, sizeof(BAD_NAME))) && + (ino != MFT_REC_SECURE || !attr->non_res || + attr->name_len != ARRAY_SIZE(SDS_NAME) || + memcmp(attr_name(attr), SDS_NAME, sizeof(SDS_NAME))))) { + /* File contains stream attribute. Ignore it. */ + goto next_attr; + } + + if (is_attr_sparsed(attr)) + ni->std_fa |= FILE_ATTRIBUTE_SPARSE_FILE; + else + ni->std_fa &= ~FILE_ATTRIBUTE_SPARSE_FILE; + + if (is_attr_compressed(attr)) + ni->std_fa |= FILE_ATTRIBUTE_COMPRESSED; + else + ni->std_fa &= ~FILE_ATTRIBUTE_COMPRESSED; + + if (is_attr_encrypted(attr)) + ni->std_fa |= FILE_ATTRIBUTE_ENCRYPTED; + else + ni->std_fa &= ~FILE_ATTRIBUTE_ENCRYPTED; + + if (!attr->non_res) { + ni->i_valid = inode->i_size = rsize; + inode_set_bytes(inode, rsize); + } + + mode = S_IFREG | (0777 & sbi->options->fs_fmask_inv); + + if (!attr->non_res) { + ni->ni_flags |= NI_FLAG_RESIDENT; + goto next_attr; + } + + inode_set_bytes(inode, attr_ondisk_size(attr)); + + ni->i_valid = le64_to_cpu(attr->nres.valid_size); + inode->i_size = le64_to_cpu(attr->nres.data_size); + if (!attr->nres.alloc_size) + goto next_attr; + + run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run + : &ni->file.run; + break; + + case ATTR_ROOT: + if (attr->non_res) + goto out; + + root = Add2Ptr(attr, roff); + + if (attr->name_len != ARRAY_SIZE(I30_NAME) || + memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) + goto next_attr; + + if (root->type != ATTR_NAME || + root->rule != NTFS_COLLATION_TYPE_FILENAME) + goto out; + + if (!is_dir) + goto next_attr; + + is_root = true; + ni->ni_flags |= NI_FLAG_DIR; + + err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30); + if (err) + goto out; + + mode = sb->s_root + ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) + : (S_IFDIR | 0777); + goto next_attr; + + case ATTR_ALLOC: + if (!is_root || attr->name_len != ARRAY_SIZE(I30_NAME) || + memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) + goto next_attr; + + inode->i_size = le64_to_cpu(attr->nres.data_size); + ni->i_valid = le64_to_cpu(attr->nres.valid_size); + inode_set_bytes(inode, le64_to_cpu(attr->nres.alloc_size)); + + run = &ni->dir.alloc_run; + break; + + case ATTR_BITMAP: + if (ino == MFT_REC_MFT) { + if (!attr->non_res) + goto out; +#ifndef CONFIG_NTFS3_64BIT_CLUSTER + /* 0x20000000 = 2^32 / 8 */ + if (le64_to_cpu(attr->nres.alloc_size) >= 0x20000000) + goto out; +#endif + run = &sbi->mft.bitmap.run; + break; + } else if (is_dir && attr->name_len == ARRAY_SIZE(I30_NAME) && + !memcmp(attr_name(attr), I30_NAME, + sizeof(I30_NAME)) && + attr->non_res) { + run = &ni->dir.bitmap_run; + break; + } + goto next_attr; + + case ATTR_REPARSE: + if (attr->name_len) + goto next_attr; + + rp_fa = ni_parse_reparse(ni, attr, &rp); + switch (rp_fa) { + case REPARSE_LINK: + /* + * Normal symlink. + * Assume one unicode symbol == one utf8. + */ + inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer + .PrintNameLength) / + sizeof(u16); + + ni->i_valid = inode->i_size; + + /* Clear directory bit. */ + if (ni->ni_flags & NI_FLAG_DIR) { + indx_clear(&ni->dir); + memset(&ni->dir, 0, sizeof(ni->dir)); + ni->ni_flags &= ~NI_FLAG_DIR; + } else { + run_close(&ni->file.run); + } + mode = S_IFLNK | 0777; + is_dir = false; + if (attr->non_res) { + run = &ni->file.run; + goto attr_unpack_run; // Double break. + } + break; + + case REPARSE_COMPRESSED: + break; + + case REPARSE_DEDUPLICATED: + break; + } + goto next_attr; + + case ATTR_EA_INFO: + if (!attr->name_len && + resident_data_ex(attr, sizeof(struct EA_INFO))) { + ni->ni_flags |= NI_FLAG_EA; + /* + * ntfs_get_wsl_perm updates inode->i_uid, inode->i_gid, inode->i_mode + */ + inode->i_mode = mode; + ntfs_get_wsl_perm(inode); + mode = inode->i_mode; + } + goto next_attr; + + default: + goto next_attr; + } + +attr_unpack_run: + roff = le16_to_cpu(attr->nres.run_off); + + if (roff > asize) { + err = -EINVAL; + goto out; + } + + t64 = le64_to_cpu(attr->nres.svcn); + + err = run_unpack_ex(run, sbi, ino, t64, le64_to_cpu(attr->nres.evcn), + t64, Add2Ptr(attr, roff), asize - roff); + if (err < 0) + goto out; + err = 0; + goto next_attr; + +end_enum: + + if (!std5) + goto out; + + if (!is_match && name) { + /* Reuse rec as buffer for ascii name. */ + err = -ENOENT; + goto out; + } + + if (std5->fa & FILE_ATTRIBUTE_READONLY) + mode &= ~0222; + + if (!names) { + err = -EINVAL; + goto out; + } + + if (names != le16_to_cpu(rec->hard_links)) { + /* Correct minor error on the fly. Do not mark inode as dirty. */ + rec->hard_links = cpu_to_le16(names); + ni->mi.dirty = true; + } + + set_nlink(inode, names); + + if (S_ISDIR(mode)) { + ni->std_fa |= FILE_ATTRIBUTE_DIRECTORY; + + /* + * Dot and dot-dot should be included in count but was not + * included in enumeration. + * Usually a hard links to directories are disabled. + */ + inode->i_op = &ntfs_dir_inode_operations; + inode->i_fop = &ntfs_dir_operations; + ni->i_valid = 0; + } else if (S_ISLNK(mode)) { + ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; + inode->i_op = &ntfs_link_inode_operations; + inode->i_fop = NULL; + inode_nohighmem(inode); + } else if (S_ISREG(mode)) { + ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; + inode->i_op = &ntfs_file_inode_operations; + inode->i_fop = &ntfs_file_operations; + inode->i_mapping->a_ops = + is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops; + if (ino != MFT_REC_MFT) + init_rwsem(&ni->file.run_lock); + } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || + S_ISSOCK(mode)) { + inode->i_op = &ntfs_special_inode_operations; + init_special_inode(inode, mode, inode->i_rdev); + } else if (fname && fname->home.low == cpu_to_le32(MFT_REC_EXTEND) && + fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) { + /* Records in $Extend are not a files or general directories. */ + inode->i_op = &ntfs_file_inode_operations; + } else { + err = -EINVAL; + goto out; + } + + if ((sbi->options->sys_immutable && + (std5->fa & FILE_ATTRIBUTE_SYSTEM)) && + !S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) { + inode->i_flags |= S_IMMUTABLE; + } else { + inode->i_flags &= ~S_IMMUTABLE; + } + + inode->i_mode = mode; + if (!(ni->ni_flags & NI_FLAG_EA)) { + /* If no xattr then no security (stored in xattr). */ + inode->i_flags |= S_NOSEC; + } + + if (ino == MFT_REC_MFT && !sb->s_root) + sbi->mft.ni = NULL; + + unlock_new_inode(inode); + + return inode; + +out: + if (ino == MFT_REC_MFT && !sb->s_root) + sbi->mft.ni = NULL; + + iget_failed(inode); + return ERR_PTR(err); +} + +/* + * ntfs_test_inode + * + * Return: 1 if match. + */ +static int ntfs_test_inode(struct inode *inode, void *data) +{ + struct MFT_REF *ref = data; + + return ino_get(ref) == inode->i_ino; +} + +static int ntfs_set_inode(struct inode *inode, void *data) +{ + const struct MFT_REF *ref = data; + + inode->i_ino = ino_get(ref); + return 0; +} + +struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, + const struct cpu_str *name) +{ + struct inode *inode; + + inode = iget5_locked(sb, ino_get(ref), ntfs_test_inode, ntfs_set_inode, + (void *)ref); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + + /* If this is a freshly allocated inode, need to read it now. */ + if (inode->i_state & I_NEW) + inode = ntfs_read_mft(inode, name, ref); + else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) { + /* Inode overlaps? */ + _ntfs_bad_inode(inode); + } + + if (IS_ERR(inode) && name) + ntfs_set_state(sb->s_fs_info, NTFS_DIRTY_ERROR); + + return inode; +} + +enum get_block_ctx { + GET_BLOCK_GENERAL = 0, + GET_BLOCK_WRITE_BEGIN = 1, + GET_BLOCK_DIRECT_IO_R = 2, + GET_BLOCK_DIRECT_IO_W = 3, + GET_BLOCK_BMAP = 4, +}; + +static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, + struct buffer_head *bh, int create, + enum get_block_ctx ctx) +{ + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_inode *ni = ntfs_i(inode); + struct page *page = bh->b_page; + u8 cluster_bits = sbi->cluster_bits; + u32 block_size = sb->s_blocksize; + u64 bytes, lbo, valid; + u32 off; + int err; + CLST vcn, lcn, len; + bool new; + + /* Clear previous state. */ + clear_buffer_new(bh); + clear_buffer_uptodate(bh); + + /* Direct write uses 'create=0'. */ + if (!create && vbo >= ni->i_valid) { + /* Out of valid. */ + return 0; + } + + if (vbo >= inode->i_size) { + /* Out of size. */ + return 0; + } + + if (is_resident(ni)) { + ni_lock(ni); + err = attr_data_read_resident(ni, page); + ni_unlock(ni); + + if (!err) + set_buffer_uptodate(bh); + bh->b_size = block_size; + return err; + } + + vcn = vbo >> cluster_bits; + off = vbo & sbi->cluster_mask; + new = false; + + err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL); + if (err) + goto out; + + if (!len) + return 0; + + bytes = ((u64)len << cluster_bits) - off; + + if (lcn == SPARSE_LCN) { + if (!create) { + if (bh->b_size > bytes) + bh->b_size = bytes; + return 0; + } + WARN_ON(1); + } + + if (new) { + set_buffer_new(bh); + if ((len << cluster_bits) > block_size) + ntfs_sparse_cluster(inode, page, vcn, len); + } + + lbo = ((u64)lcn << cluster_bits) + off; + + set_buffer_mapped(bh); + bh->b_bdev = sb->s_bdev; + bh->b_blocknr = lbo >> sb->s_blocksize_bits; + + valid = ni->i_valid; + + if (ctx == GET_BLOCK_DIRECT_IO_W) { + /* ntfs_direct_IO will update ni->i_valid. */ + if (vbo >= valid) + set_buffer_new(bh); + } else if (create) { + /* Normal write. */ + if (bytes > bh->b_size) + bytes = bh->b_size; + + if (vbo >= valid) + set_buffer_new(bh); + + if (vbo + bytes > valid) { + ni->i_valid = vbo + bytes; + mark_inode_dirty(inode); + } + } else if (vbo >= valid) { + /* Read out of valid data. */ + /* Should never be here 'cause already checked. */ + clear_buffer_mapped(bh); + } else if (vbo + bytes <= valid) { + /* Normal read. */ + } else if (vbo + block_size <= valid) { + /* Normal short read. */ + bytes = block_size; + } else { + /* + * Read across valid size: vbo < valid && valid < vbo + block_size + */ + bytes = block_size; + + if (page) { + u32 voff = valid - vbo; + + bh->b_size = block_size; + off = vbo & (PAGE_SIZE - 1); + set_bh_page(bh, page, off); + err = bh_read(bh, 0); + if (err < 0) + goto out; + zero_user_segment(page, off + voff, off + block_size); + } + } + + if (bh->b_size > bytes) + bh->b_size = bytes; + +#ifndef __LP64__ + if (ctx == GET_BLOCK_DIRECT_IO_W || ctx == GET_BLOCK_DIRECT_IO_R) { + static_assert(sizeof(size_t) < sizeof(loff_t)); + if (bytes > 0x40000000u) + bh->b_size = 0x40000000u; + } +#endif + + return 0; + +out: + return err; +} + +int ntfs_get_block(struct inode *inode, sector_t vbn, + struct buffer_head *bh_result, int create) +{ + return ntfs_get_block_vbo(inode, (u64)vbn << inode->i_blkbits, + bh_result, create, GET_BLOCK_GENERAL); +} + +static int ntfs_get_block_bmap(struct inode *inode, sector_t vsn, + struct buffer_head *bh_result, int create) +{ + return ntfs_get_block_vbo(inode, + (u64)vsn << inode->i_sb->s_blocksize_bits, + bh_result, create, GET_BLOCK_BMAP); +} + +static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, ntfs_get_block_bmap); +} + +static int ntfs_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + int err; + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + + if (is_resident(ni)) { + ni_lock(ni); + err = attr_data_read_resident(ni, page); + ni_unlock(ni); + if (err != E_NTFS_NONRESIDENT) { + unlock_page(page); + return err; + } + } + + if (is_compressed(ni)) { + ni_lock(ni); + err = ni_readpage_cmpr(ni, page); + ni_unlock(ni); + return err; + } + + /* Normal + sparse files. */ + return mpage_read_folio(folio, ntfs_get_block); +} + +static void ntfs_readahead(struct readahead_control *rac) +{ + struct address_space *mapping = rac->mapping; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + u64 valid; + loff_t pos; + + if (is_resident(ni)) { + /* No readahead for resident. */ + return; + } + + if (is_compressed(ni)) { + /* No readahead for compressed. */ + return; + } + + valid = ni->i_valid; + pos = readahead_pos(rac); + + if (valid < i_size_read(inode) && pos <= valid && + valid < pos + readahead_length(rac)) { + /* Range cross 'valid'. Read it page by page. */ + return; + } + + mpage_readahead(rac, ntfs_get_block); +} + +static int ntfs_get_block_direct_IO_R(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return ntfs_get_block_vbo(inode, (u64)iblock << inode->i_blkbits, + bh_result, create, GET_BLOCK_DIRECT_IO_R); +} + +static int ntfs_get_block_direct_IO_W(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return ntfs_get_block_vbo(inode, (u64)iblock << inode->i_blkbits, + bh_result, create, GET_BLOCK_DIRECT_IO_W); +} + +static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + loff_t vbo = iocb->ki_pos; + loff_t end; + int wr = iov_iter_rw(iter) & WRITE; + size_t iter_count = iov_iter_count(iter); + loff_t valid; + ssize_t ret; + + if (is_resident(ni)) { + /* Switch to buffered write. */ + ret = 0; + goto out; + } + + ret = blockdev_direct_IO(iocb, inode, iter, + wr ? ntfs_get_block_direct_IO_W + : ntfs_get_block_direct_IO_R); + + if (ret > 0) + end = vbo + ret; + else if (wr && ret == -EIOCBQUEUED) + end = vbo + iter_count; + else + goto out; + + valid = ni->i_valid; + if (wr) { + if (end > valid && !S_ISBLK(inode->i_mode)) { + ni->i_valid = end; + mark_inode_dirty(inode); + } + } else if (vbo < valid && valid < end) { + /* Fix page. */ + iov_iter_revert(iter, end - valid); + iov_iter_zero(end - valid, iter); + } + +out: + return ret; +} + +int ntfs_set_size(struct inode *inode, u64 new_size) +{ + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_inode *ni = ntfs_i(inode); + int err; + + /* Check for maximum file size. */ + if (is_sparsed(ni) || is_compressed(ni)) { + if (new_size > sbi->maxbytes_sparse) { + err = -EFBIG; + goto out; + } + } else if (new_size > sbi->maxbytes) { + err = -EFBIG; + goto out; + } + + ni_lock(ni); + down_write(&ni->file.run_lock); + + err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size, + &ni->i_valid, true, NULL); + + up_write(&ni->file.run_lock); + ni_unlock(ni); + + mark_inode_dirty(inode); + +out: + return err; +} + +static int ntfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + int err; + + if (is_resident(ni)) { + ni_lock(ni); + err = attr_data_write_resident(ni, page); + ni_unlock(ni); + if (err != E_NTFS_NONRESIDENT) { + unlock_page(page); + return err; + } + } + + return block_write_full_page(page, ntfs_get_block, wbc); +} + +static int ntfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + /* Redirect call to 'ntfs_writepage' for resident files. */ + if (is_resident(ntfs_i(mapping->host))) + return generic_writepages(mapping, wbc); + return mpage_writepages(mapping, wbc, ntfs_get_block); +} + +static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, + struct buffer_head *bh_result, int create) +{ + return ntfs_get_block_vbo(inode, (u64)vbn << inode->i_blkbits, + bh_result, create, GET_BLOCK_WRITE_BEGIN); +} + +int ntfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, u32 len, struct page **pagep, void **fsdata) +{ + int err; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + + *pagep = NULL; + if (is_resident(ni)) { + struct page *page = grab_cache_page_write_begin( + mapping, pos >> PAGE_SHIFT); + + if (!page) { + err = -ENOMEM; + goto out; + } + + ni_lock(ni); + err = attr_data_read_resident(ni, page); + ni_unlock(ni); + + if (!err) { + *pagep = page; + goto out; + } + unlock_page(page); + put_page(page); + + if (err != E_NTFS_NONRESIDENT) + goto out; + } + + err = block_write_begin(mapping, pos, len, pagep, + ntfs_get_block_write_begin); + +out: + return err; +} + +/* + * ntfs_write_end - Address_space_operations::write_end. + */ +int ntfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, u32 len, u32 copied, struct page *page, + void *fsdata) +{ + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + u64 valid = ni->i_valid; + bool dirty = false; + int err; + + if (is_resident(ni)) { + ni_lock(ni); + err = attr_data_write_resident(ni, page); + ni_unlock(ni); + if (!err) { + dirty = true; + /* Clear any buffers in page. */ + if (page_has_buffers(page)) { + struct buffer_head *head, *bh; + + bh = head = page_buffers(page); + do { + clear_buffer_dirty(bh); + clear_buffer_mapped(bh); + set_buffer_uptodate(bh); + } while (head != (bh = bh->b_this_page)); + } + SetPageUptodate(page); + err = copied; + } + unlock_page(page); + put_page(page); + } else { + err = generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + } + + if (err >= 0) { + if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) { + inode->i_ctime = inode->i_mtime = current_time(inode); + ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; + dirty = true; + } + + if (valid != ni->i_valid) { + /* ni->i_valid is changed in ntfs_get_block_vbo. */ + dirty = true; + } + + if (dirty) + mark_inode_dirty(inode); + } + + return err; +} + +int reset_log_file(struct inode *inode) +{ + int err; + loff_t pos = 0; + u32 log_size = inode->i_size; + struct address_space *mapping = inode->i_mapping; + + for (;;) { + u32 len; + void *kaddr; + struct page *page; + + len = pos + PAGE_SIZE > log_size ? (log_size - pos) : PAGE_SIZE; + + err = block_write_begin(mapping, pos, len, &page, + ntfs_get_block_write_begin); + if (err) + goto out; + + kaddr = kmap_atomic(page); + memset(kaddr, -1, len); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + err = block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (err < 0) + goto out; + pos += len; + + if (pos >= log_size) + break; + balance_dirty_pages_ratelimited(mapping); + } +out: + mark_inode_dirty_sync(inode); + + return err; +} + +int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return _ni_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int ntfs_sync_inode(struct inode *inode) +{ + return _ni_write_inode(inode, 1); +} + +/* + * writeback_inode - Helper function for ntfs_flush_inodes(). + * + * This writes both the inode and the file data blocks, waiting + * for in flight data blocks before the start of the call. It + * does not wait for any io started during the call. + */ +static int writeback_inode(struct inode *inode) +{ + int ret = sync_inode_metadata(inode, 0); + + if (!ret) + ret = filemap_fdatawrite(inode->i_mapping); + return ret; +} + +/* + * ntfs_flush_inodes + * + * Write data and metadata corresponding to i1 and i2. The io is + * started but we do not wait for any of it to finish. + * + * filemap_flush() is used for the block device, so if there is a dirty + * page for a block already in flight, we will not wait and start the + * io over again. + */ +int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, + struct inode *i2) +{ + int ret = 0; + + if (i1) + ret = writeback_inode(i1); + if (!ret && i2) + ret = writeback_inode(i2); + if (!ret) + ret = sync_blockdev_nowait(sb->s_bdev); + return ret; +} + +int inode_write_data(struct inode *inode, const void *data, size_t bytes) +{ + pgoff_t idx; + + /* Write non resident data. */ + for (idx = 0; bytes; idx++) { + size_t op = bytes > PAGE_SIZE ? PAGE_SIZE : bytes; + struct page *page = ntfs_map_page(inode->i_mapping, idx); + + if (IS_ERR(page)) + return PTR_ERR(page); + + lock_page(page); + WARN_ON(!PageUptodate(page)); + ClearPageUptodate(page); + + memcpy(page_address(page), data, op); + + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + + ntfs_unmap_page(page); + + bytes -= op; + data = Add2Ptr(data, PAGE_SIZE); + } + return 0; +} + +/* + * ntfs_reparse_bytes + * + * Number of bytes for REPARSE_DATA_BUFFER(IO_REPARSE_TAG_SYMLINK) + * for unicode string of @uni_len length. + */ +static inline u32 ntfs_reparse_bytes(u32 uni_len) +{ + /* Header + unicode string + decorated unicode string. */ + return sizeof(short) * (2 * uni_len + 4) + + offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer); +} + +static struct REPARSE_DATA_BUFFER * +ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname, + u32 size, u16 *nsize) +{ + int i, err; + struct REPARSE_DATA_BUFFER *rp; + __le16 *rp_name; + typeof(rp->SymbolicLinkReparseBuffer) *rs; + + rp = kzalloc(ntfs_reparse_bytes(2 * size + 2), GFP_NOFS); + if (!rp) + return ERR_PTR(-ENOMEM); + + rs = &rp->SymbolicLinkReparseBuffer; + rp_name = rs->PathBuffer; + + /* Convert link name to UTF-16. */ + err = ntfs_nls_to_utf16(sbi, symname, size, + (struct cpu_str *)(rp_name - 1), 2 * size, + UTF16_LITTLE_ENDIAN); + if (err < 0) + goto out; + + /* err = the length of unicode name of symlink. */ + *nsize = ntfs_reparse_bytes(err); + + if (*nsize > sbi->reparse.max_size) { + err = -EFBIG; + goto out; + } + + /* Translate Linux '/' into Windows '\'. */ + for (i = 0; i < err; i++) { + if (rp_name[i] == cpu_to_le16('/')) + rp_name[i] = cpu_to_le16('\\'); + } + + rp->ReparseTag = IO_REPARSE_TAG_SYMLINK; + rp->ReparseDataLength = + cpu_to_le16(*nsize - offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer)); + + /* PrintName + SubstituteName. */ + rs->SubstituteNameOffset = cpu_to_le16(sizeof(short) * err); + rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + 8); + rs->PrintNameLength = rs->SubstituteNameOffset; + + /* + * TODO: Use relative path if possible to allow Windows to + * parse this path. + * 0-absolute path 1- relative path (SYMLINK_FLAG_RELATIVE). + */ + rs->Flags = 0; + + memmove(rp_name + err + 4, rp_name, sizeof(short) * err); + + /* Decorate SubstituteName. */ + rp_name += err; + rp_name[0] = cpu_to_le16('\\'); + rp_name[1] = cpu_to_le16('?'); + rp_name[2] = cpu_to_le16('?'); + rp_name[3] = cpu_to_le16('\\'); + + return rp; +out: + kfree(rp); + return ERR_PTR(err); +} + +struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const struct cpu_str *uni, umode_t mode, + dev_t dev, const char *symname, u32 size, + struct ntfs_fnd *fnd) +{ + int err; + struct super_block *sb = dir->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + const struct qstr *name = &dentry->d_name; + CLST ino = 0; + struct ntfs_inode *dir_ni = ntfs_i(dir); + struct ntfs_inode *ni = NULL; + struct inode *inode = NULL; + struct ATTRIB *attr; + struct ATTR_STD_INFO5 *std5; + struct ATTR_FILE_NAME *fname; + struct MFT_REC *rec; + u32 asize, dsize, sd_size; + enum FILE_ATTRIBUTE fa; + __le32 security_id = SECURITY_ID_INVALID; + CLST vcn; + const void *sd; + u16 t16, nsize = 0, aid = 0; + struct INDEX_ROOT *root, *dir_root; + struct NTFS_DE *e, *new_de = NULL; + struct REPARSE_DATA_BUFFER *rp = NULL; + bool rp_inserted = false; + + ni_lock_dir(dir_ni); + + dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL); + if (!dir_root) { + err = -EINVAL; + goto out1; + } + + if (S_ISDIR(mode)) { + /* Use parent's directory attributes. */ + fa = dir_ni->std_fa | FILE_ATTRIBUTE_DIRECTORY | + FILE_ATTRIBUTE_ARCHIVE; + /* + * By default child directory inherits parent attributes. + * Root directory is hidden + system. + * Make an exception for children in root. + */ + if (dir->i_ino == MFT_REC_ROOT) + fa &= ~(FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_SYSTEM); + } else if (S_ISLNK(mode)) { + /* It is good idea that link should be the same type (file/dir) as target */ + fa = FILE_ATTRIBUTE_REPARSE_POINT; + + /* + * Linux: there are dir/file/symlink and so on. + * NTFS: symlinks are "dir + reparse" or "file + reparse" + * It is good idea to create: + * dir + reparse if 'symname' points to directory + * or + * file + reparse if 'symname' points to file + * Unfortunately kern_path hangs if symname contains 'dir'. + */ + + /* + * struct path path; + * + * if (!kern_path(symname, LOOKUP_FOLLOW, &path)){ + * struct inode *target = d_inode(path.dentry); + * + * if (S_ISDIR(target->i_mode)) + * fa |= FILE_ATTRIBUTE_DIRECTORY; + * // if ( target->i_sb == sb ){ + * // use relative path? + * // } + * path_put(&path); + * } + */ + } else if (S_ISREG(mode)) { + if (sbi->options->sparse) { + /* Sparsed regular file, cause option 'sparse'. */ + fa = FILE_ATTRIBUTE_SPARSE_FILE | + FILE_ATTRIBUTE_ARCHIVE; + } else if (dir_ni->std_fa & FILE_ATTRIBUTE_COMPRESSED) { + /* Compressed regular file, if parent is compressed. */ + fa = FILE_ATTRIBUTE_COMPRESSED | FILE_ATTRIBUTE_ARCHIVE; + } else { + /* Regular file, default attributes. */ + fa = FILE_ATTRIBUTE_ARCHIVE; + } + } else { + fa = FILE_ATTRIBUTE_ARCHIVE; + } + + if (!(mode & 0222)) + fa |= FILE_ATTRIBUTE_READONLY; + + /* Allocate PATH_MAX bytes. */ + new_de = __getname(); + if (!new_de) { + err = -ENOMEM; + goto out1; + } + + /* Mark rw ntfs as dirty. it will be cleared at umount. */ + ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); + + /* Step 1: allocate and fill new mft record. */ + err = ntfs_look_free_mft(sbi, &ino, false, NULL, NULL); + if (err) + goto out2; + + ni = ntfs_new_inode(sbi, ino, fa & FILE_ATTRIBUTE_DIRECTORY); + if (IS_ERR(ni)) { + err = PTR_ERR(ni); + ni = NULL; + goto out3; + } + inode = &ni->vfs_inode; + inode_init_owner(mnt_userns, inode, dir, mode); + mode = inode->i_mode; + + inode->i_atime = inode->i_mtime = inode->i_ctime = ni->i_crtime = + current_time(inode); + + rec = ni->mi.mrec; + rec->hard_links = cpu_to_le16(1); + attr = Add2Ptr(rec, le16_to_cpu(rec->attr_off)); + + /* Get default security id. */ + sd = s_default_security; + sd_size = sizeof(s_default_security); + + if (is_ntfs3(sbi)) { + security_id = dir_ni->std_security_id; + if (le32_to_cpu(security_id) < SECURITY_ID_FIRST) { + security_id = sbi->security.def_security_id; + + if (security_id == SECURITY_ID_INVALID && + !ntfs_insert_security(sbi, sd, sd_size, + &security_id, NULL)) + sbi->security.def_security_id = security_id; + } + } + + /* Insert standard info. */ + std5 = Add2Ptr(attr, SIZEOF_RESIDENT); + + if (security_id == SECURITY_ID_INVALID) { + dsize = sizeof(struct ATTR_STD_INFO); + } else { + dsize = sizeof(struct ATTR_STD_INFO5); + std5->security_id = security_id; + ni->std_security_id = security_id; + } + asize = SIZEOF_RESIDENT + dsize; + + attr->type = ATTR_STD; + attr->size = cpu_to_le32(asize); + attr->id = cpu_to_le16(aid++); + attr->res.data_off = SIZEOF_RESIDENT_LE; + attr->res.data_size = cpu_to_le32(dsize); + + std5->cr_time = std5->m_time = std5->c_time = std5->a_time = + kernel2nt(&inode->i_atime); + + ni->std_fa = fa; + std5->fa = fa; + + attr = Add2Ptr(attr, asize); + + /* Insert file name. */ + err = fill_name_de(sbi, new_de, name, uni); + if (err) + goto out4; + + mi_get_ref(&ni->mi, &new_de->ref); + + fname = (struct ATTR_FILE_NAME *)(new_de + 1); + mi_get_ref(&dir_ni->mi, &fname->home); + fname->dup.cr_time = fname->dup.m_time = fname->dup.c_time = + fname->dup.a_time = std5->cr_time; + fname->dup.alloc_size = fname->dup.data_size = 0; + fname->dup.fa = std5->fa; + fname->dup.ea_size = fname->dup.reparse = 0; + + dsize = le16_to_cpu(new_de->key_size); + asize = ALIGN(SIZEOF_RESIDENT + dsize, 8); + + attr->type = ATTR_NAME; + attr->size = cpu_to_le32(asize); + attr->res.data_off = SIZEOF_RESIDENT_LE; + attr->res.flags = RESIDENT_FLAG_INDEXED; + attr->id = cpu_to_le16(aid++); + attr->res.data_size = cpu_to_le32(dsize); + memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), fname, dsize); + + attr = Add2Ptr(attr, asize); + + if (security_id == SECURITY_ID_INVALID) { + /* Insert security attribute. */ + asize = SIZEOF_RESIDENT + ALIGN(sd_size, 8); + + attr->type = ATTR_SECURE; + attr->size = cpu_to_le32(asize); + attr->id = cpu_to_le16(aid++); + attr->res.data_off = SIZEOF_RESIDENT_LE; + attr->res.data_size = cpu_to_le32(sd_size); + memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), sd, sd_size); + + attr = Add2Ptr(attr, asize); + } + + attr->id = cpu_to_le16(aid++); + if (fa & FILE_ATTRIBUTE_DIRECTORY) { + /* + * Regular directory or symlink to directory. + * Create root attribute. + */ + dsize = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE); + asize = sizeof(I30_NAME) + SIZEOF_RESIDENT + dsize; + + attr->type = ATTR_ROOT; + attr->size = cpu_to_le32(asize); + + attr->name_len = ARRAY_SIZE(I30_NAME); + attr->name_off = SIZEOF_RESIDENT_LE; + attr->res.data_off = + cpu_to_le16(sizeof(I30_NAME) + SIZEOF_RESIDENT); + attr->res.data_size = cpu_to_le32(dsize); + memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), I30_NAME, + sizeof(I30_NAME)); + + root = Add2Ptr(attr, sizeof(I30_NAME) + SIZEOF_RESIDENT); + memcpy(root, dir_root, offsetof(struct INDEX_ROOT, ihdr)); + root->ihdr.de_off = + cpu_to_le32(sizeof(struct INDEX_HDR)); // 0x10 + root->ihdr.used = cpu_to_le32(sizeof(struct INDEX_HDR) + + sizeof(struct NTFS_DE)); + root->ihdr.total = root->ihdr.used; + + e = Add2Ptr(root, sizeof(struct INDEX_ROOT)); + e->size = cpu_to_le16(sizeof(struct NTFS_DE)); + e->flags = NTFS_IE_LAST; + } else if (S_ISLNK(mode)) { + /* + * Symlink to file. + * Create empty resident data attribute. + */ + asize = SIZEOF_RESIDENT; + + /* Insert empty ATTR_DATA */ + attr->type = ATTR_DATA; + attr->size = cpu_to_le32(SIZEOF_RESIDENT); + attr->name_off = SIZEOF_RESIDENT_LE; + attr->res.data_off = SIZEOF_RESIDENT_LE; + } else if (S_ISREG(mode)) { + /* + * Regular file. Create empty non resident data attribute. + */ + attr->type = ATTR_DATA; + attr->non_res = 1; + attr->nres.evcn = cpu_to_le64(-1ll); + if (fa & FILE_ATTRIBUTE_SPARSE_FILE) { + attr->size = cpu_to_le32(SIZEOF_NONRESIDENT_EX + 8); + attr->name_off = SIZEOF_NONRESIDENT_EX_LE; + attr->flags = ATTR_FLAG_SPARSED; + asize = SIZEOF_NONRESIDENT_EX + 8; + } else if (fa & FILE_ATTRIBUTE_COMPRESSED) { + attr->size = cpu_to_le32(SIZEOF_NONRESIDENT_EX + 8); + attr->name_off = SIZEOF_NONRESIDENT_EX_LE; + attr->flags = ATTR_FLAG_COMPRESSED; + attr->nres.c_unit = COMPRESSION_UNIT; + asize = SIZEOF_NONRESIDENT_EX + 8; + } else { + attr->size = cpu_to_le32(SIZEOF_NONRESIDENT + 8); + attr->name_off = SIZEOF_NONRESIDENT_LE; + asize = SIZEOF_NONRESIDENT + 8; + } + attr->nres.run_off = attr->name_off; + } else { + /* + * Node. Create empty resident data attribute. + */ + attr->type = ATTR_DATA; + attr->size = cpu_to_le32(SIZEOF_RESIDENT); + attr->name_off = SIZEOF_RESIDENT_LE; + if (fa & FILE_ATTRIBUTE_SPARSE_FILE) + attr->flags = ATTR_FLAG_SPARSED; + else if (fa & FILE_ATTRIBUTE_COMPRESSED) + attr->flags = ATTR_FLAG_COMPRESSED; + attr->res.data_off = SIZEOF_RESIDENT_LE; + asize = SIZEOF_RESIDENT; + ni->ni_flags |= NI_FLAG_RESIDENT; + } + + if (S_ISDIR(mode)) { + ni->ni_flags |= NI_FLAG_DIR; + err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30); + if (err) + goto out4; + } else if (S_ISLNK(mode)) { + rp = ntfs_create_reparse_buffer(sbi, symname, size, &nsize); + + if (IS_ERR(rp)) { + err = PTR_ERR(rp); + rp = NULL; + goto out4; + } + + /* + * Insert ATTR_REPARSE. + */ + attr = Add2Ptr(attr, asize); + attr->type = ATTR_REPARSE; + attr->id = cpu_to_le16(aid++); + + /* Resident or non resident? */ + asize = ALIGN(SIZEOF_RESIDENT + nsize, 8); + t16 = PtrOffset(rec, attr); + + /* + * Below function 'ntfs_save_wsl_perm' requires 0x78 bytes. + * It is good idea to keep extened attributes resident. + */ + if (asize + t16 + 0x78 + 8 > sbi->record_size) { + CLST alen; + CLST clst = bytes_to_cluster(sbi, nsize); + + /* Bytes per runs. */ + t16 = sbi->record_size - t16 - SIZEOF_NONRESIDENT; + + attr->non_res = 1; + attr->nres.evcn = cpu_to_le64(clst - 1); + attr->name_off = SIZEOF_NONRESIDENT_LE; + attr->nres.run_off = attr->name_off; + attr->nres.data_size = cpu_to_le64(nsize); + attr->nres.valid_size = attr->nres.data_size; + attr->nres.alloc_size = + cpu_to_le64(ntfs_up_cluster(sbi, nsize)); + + err = attr_allocate_clusters(sbi, &ni->file.run, 0, 0, + clst, NULL, 0, &alen, 0, + NULL); + if (err) + goto out5; + + err = run_pack(&ni->file.run, 0, clst, + Add2Ptr(attr, SIZEOF_NONRESIDENT), t16, + &vcn); + if (err < 0) + goto out5; + + if (vcn != clst) { + err = -EINVAL; + goto out5; + } + + asize = SIZEOF_NONRESIDENT + ALIGN(err, 8); + } else { + attr->res.data_off = SIZEOF_RESIDENT_LE; + attr->res.data_size = cpu_to_le32(nsize); + memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize); + nsize = 0; + } + /* Size of symlink equals the length of input string. */ + inode->i_size = size; + + attr->size = cpu_to_le32(asize); + + err = ntfs_insert_reparse(sbi, IO_REPARSE_TAG_SYMLINK, + &new_de->ref); + if (err) + goto out5; + + rp_inserted = true; + } + + attr = Add2Ptr(attr, asize); + attr->type = ATTR_END; + + rec->used = cpu_to_le32(PtrOffset(rec, attr) + 8); + rec->next_attr_id = cpu_to_le16(aid); + + /* Step 2: Add new name in index. */ + err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0); + if (err) + goto out6; + + /* Unlock parent directory before ntfs_init_acl. */ + ni_unlock(dir_ni); + + inode->i_generation = le16_to_cpu(rec->seq); + + dir->i_mtime = dir->i_ctime = inode->i_atime; + + if (S_ISDIR(mode)) { + inode->i_op = &ntfs_dir_inode_operations; + inode->i_fop = &ntfs_dir_operations; + } else if (S_ISLNK(mode)) { + inode->i_op = &ntfs_link_inode_operations; + inode->i_fop = NULL; + inode->i_mapping->a_ops = &ntfs_aops; + inode->i_size = size; + inode_nohighmem(inode); + } else if (S_ISREG(mode)) { + inode->i_op = &ntfs_file_inode_operations; + inode->i_fop = &ntfs_file_operations; + inode->i_mapping->a_ops = + is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops; + init_rwsem(&ni->file.run_lock); + } else { + inode->i_op = &ntfs_special_inode_operations; + init_special_inode(inode, mode, dev); + } + +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { + err = ntfs_init_acl(mnt_userns, inode, dir); + if (err) + goto out7; + } else +#endif + { + inode->i_flags |= S_NOSEC; + } + + /* Write non resident data. */ + if (nsize) { + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0); + if (err) + goto out7; + } + + /* + * Call 'd_instantiate' after inode->i_op is set + * but before finish_open. + */ + d_instantiate(dentry, inode); + + ntfs_save_wsl_perm(inode); + mark_inode_dirty(dir); + mark_inode_dirty(inode); + + /* Normal exit. */ + goto out2; + +out7: + + /* Undo 'indx_insert_entry'. */ + ni_lock_dir(dir_ni); + indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1, + le16_to_cpu(new_de->key_size), sbi); + /* ni_unlock(dir_ni); will be called later. */ +out6: + if (rp_inserted) + ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref); + +out5: + if (!S_ISDIR(mode)) + run_deallocate(sbi, &ni->file.run, false); + +out4: + clear_rec_inuse(rec); + clear_nlink(inode); + ni->mi.dirty = false; + discard_new_inode(inode); +out3: + ntfs_mark_rec_free(sbi, ino, false); + +out2: + __putname(new_de); + kfree(rp); + +out1: + if (err) { + ni_unlock(dir_ni); + return ERR_PTR(err); + } + + unlock_new_inode(inode); + + return inode; +} + +int ntfs_link_inode(struct inode *inode, struct dentry *dentry) +{ + int err; + struct ntfs_inode *ni = ntfs_i(inode); + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + struct NTFS_DE *de; + + /* Allocate PATH_MAX bytes. */ + de = __getname(); + if (!de) + return -ENOMEM; + + /* Mark rw ntfs as dirty. It will be cleared at umount. */ + ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); + + /* Construct 'de'. */ + err = fill_name_de(sbi, de, &dentry->d_name, NULL); + if (err) + goto out; + + err = ni_add_name(ntfs_i(d_inode(dentry->d_parent)), ni, de); +out: + __putname(de); + return err; +} + +/* + * ntfs_unlink_inode + * + * inode_operations::unlink + * inode_operations::rmdir + */ +int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry) +{ + int err; + struct ntfs_sb_info *sbi = dir->i_sb->s_fs_info; + struct inode *inode = d_inode(dentry); + struct ntfs_inode *ni = ntfs_i(inode); + struct ntfs_inode *dir_ni = ntfs_i(dir); + struct NTFS_DE *de, *de2 = NULL; + int undo_remove; + + if (ntfs_is_meta_file(sbi, ni->mi.rno)) + return -EINVAL; + + /* Allocate PATH_MAX bytes. */ + de = __getname(); + if (!de) + return -ENOMEM; + + ni_lock(ni); + + if (S_ISDIR(inode->i_mode) && !dir_is_empty(inode)) { + err = -ENOTEMPTY; + goto out; + } + + err = fill_name_de(sbi, de, &dentry->d_name, NULL); + if (err < 0) + goto out; + + undo_remove = 0; + err = ni_remove_name(dir_ni, ni, de, &de2, &undo_remove); + + if (!err) { + drop_nlink(inode); + dir->i_mtime = dir->i_ctime = current_time(dir); + mark_inode_dirty(dir); + inode->i_ctime = dir->i_ctime; + if (inode->i_nlink) + mark_inode_dirty(inode); + } else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) { + _ntfs_bad_inode(inode); + } else { + if (ni_is_dirty(dir)) + mark_inode_dirty(dir); + if (ni_is_dirty(inode)) + mark_inode_dirty(inode); + } + +out: + ni_unlock(ni); + __putname(de); + return err; +} + +void ntfs_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + + if (inode->i_nlink) + _ni_write_inode(inode, inode_needs_sync(inode)); + + invalidate_inode_buffers(inode); + clear_inode(inode); + + ni_clear(ntfs_i(inode)); +} + +static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, + int buflen) +{ + int i, err = -EINVAL; + struct ntfs_inode *ni = ntfs_i(inode); + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + u64 size; + u16 ulen = 0; + void *to_free = NULL; + struct REPARSE_DATA_BUFFER *rp; + const __le16 *uname; + struct ATTRIB *attr; + + /* Reparse data present. Try to parse it. */ + static_assert(!offsetof(struct REPARSE_DATA_BUFFER, ReparseTag)); + static_assert(sizeof(u32) == sizeof(rp->ReparseTag)); + + *buffer = 0; + + attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL); + if (!attr) + goto out; + + if (!attr->non_res) { + rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); + if (!rp) + goto out; + size = le32_to_cpu(attr->res.data_size); + } else { + size = le64_to_cpu(attr->nres.data_size); + rp = NULL; + } + + if (size > sbi->reparse.max_size || size <= sizeof(u32)) + goto out; + + if (!rp) { + rp = kmalloc(size, GFP_NOFS); + if (!rp) { + err = -ENOMEM; + goto out; + } + to_free = rp; + /* Read into temporal buffer. */ + err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, size, NULL); + if (err) + goto out; + } + + /* Microsoft Tag. */ + switch (rp->ReparseTag) { + case IO_REPARSE_TAG_MOUNT_POINT: + /* Mount points and junctions. */ + /* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? */ + if (size <= offsetof(struct REPARSE_DATA_BUFFER, + MountPointReparseBuffer.PathBuffer)) + goto out; + uname = Add2Ptr(rp, + offsetof(struct REPARSE_DATA_BUFFER, + MountPointReparseBuffer.PathBuffer) + + le16_to_cpu(rp->MountPointReparseBuffer + .PrintNameOffset)); + ulen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength); + break; + + case IO_REPARSE_TAG_SYMLINK: + /* FolderSymbolicLink */ + /* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? */ + if (size <= offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer)) + goto out; + uname = Add2Ptr( + rp, offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer) + + le16_to_cpu(rp->SymbolicLinkReparseBuffer + .PrintNameOffset)); + ulen = le16_to_cpu( + rp->SymbolicLinkReparseBuffer.PrintNameLength); + break; + + case IO_REPARSE_TAG_CLOUD: + case IO_REPARSE_TAG_CLOUD_1: + case IO_REPARSE_TAG_CLOUD_2: + case IO_REPARSE_TAG_CLOUD_3: + case IO_REPARSE_TAG_CLOUD_4: + case IO_REPARSE_TAG_CLOUD_5: + case IO_REPARSE_TAG_CLOUD_6: + case IO_REPARSE_TAG_CLOUD_7: + case IO_REPARSE_TAG_CLOUD_8: + case IO_REPARSE_TAG_CLOUD_9: + case IO_REPARSE_TAG_CLOUD_A: + case IO_REPARSE_TAG_CLOUD_B: + case IO_REPARSE_TAG_CLOUD_C: + case IO_REPARSE_TAG_CLOUD_D: + case IO_REPARSE_TAG_CLOUD_E: + case IO_REPARSE_TAG_CLOUD_F: + err = sizeof("OneDrive") - 1; + if (err > buflen) + err = buflen; + memcpy(buffer, "OneDrive", err); + goto out; + + default: + if (IsReparseTagMicrosoft(rp->ReparseTag)) { + /* Unknown Microsoft Tag. */ + goto out; + } + if (!IsReparseTagNameSurrogate(rp->ReparseTag) || + size <= sizeof(struct REPARSE_POINT)) { + goto out; + } + + /* Users tag. */ + uname = Add2Ptr(rp, sizeof(struct REPARSE_POINT)); + ulen = le16_to_cpu(rp->ReparseDataLength) - + sizeof(struct REPARSE_POINT); + } + + /* Convert nlen from bytes to UNICODE chars. */ + ulen >>= 1; + + /* Check that name is available. */ + if (!ulen || uname + ulen > (__le16 *)Add2Ptr(rp, size)) + goto out; + + /* If name is already zero terminated then truncate it now. */ + if (!uname[ulen - 1]) + ulen -= 1; + + err = ntfs_utf16_to_nls(sbi, uname, ulen, buffer, buflen); + + if (err < 0) + goto out; + + /* Translate Windows '\' into Linux '/'. */ + for (i = 0; i < err; i++) { + if (buffer[i] == '\\') + buffer[i] = '/'; + } + + /* Always set last zero. */ + buffer[err] = 0; +out: + kfree(to_free); + return err; +} + +static const char *ntfs_get_link(struct dentry *de, struct inode *inode, + struct delayed_call *done) +{ + int err; + char *ret; + + if (!de) + return ERR_PTR(-ECHILD); + + ret = kmalloc(PAGE_SIZE, GFP_NOFS); + if (!ret) + return ERR_PTR(-ENOMEM); + + err = ntfs_readlink_hlp(inode, ret, PAGE_SIZE); + if (err < 0) { + kfree(ret); + return ERR_PTR(err); + } + + set_delayed_call(done, kfree_link, ret); + + return ret; +} + +// clang-format off +const struct inode_operations ntfs_link_inode_operations = { + .get_link = ntfs_get_link, + .setattr = ntfs3_setattr, + .listxattr = ntfs_listxattr, + .permission = ntfs_permission, +}; + +const struct address_space_operations ntfs_aops = { + .read_folio = ntfs_read_folio, + .readahead = ntfs_readahead, + .writepage = ntfs_writepage, + .writepages = ntfs_writepages, + .write_begin = ntfs_write_begin, + .write_end = ntfs_write_end, + .direct_IO = ntfs_direct_IO, + .bmap = ntfs_bmap, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, +}; + +const struct address_space_operations ntfs_aops_cmpr = { + .read_folio = ntfs_read_folio, + .readahead = ntfs_readahead, +}; +// clang-format on diff --git a/fs/ntfs3/lib/decompress_common.c b/fs/ntfs3/lib/decompress_common.c new file mode 100644 index 000000000..e96652240 --- /dev/null +++ b/fs/ntfs3/lib/decompress_common.c @@ -0,0 +1,319 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * decompress_common.c - Code shared by the XPRESS and LZX decompressors + * + * Copyright (C) 2015 Eric Biggers + */ + +#include "decompress_common.h" + +/* + * make_huffman_decode_table() - + * + * Build a decoding table for a canonical prefix code, or "Huffman code". + * + * This is an internal function, not part of the library API! + * + * This takes as input the length of the codeword for each symbol in the + * alphabet and produces as output a table that can be used for fast + * decoding of prefix-encoded symbols using read_huffsym(). + * + * Strictly speaking, a canonical prefix code might not be a Huffman + * code. But this algorithm will work either way; and in fact, since + * Huffman codes are defined in terms of symbol frequencies, there is no + * way for the decompressor to know whether the code is a true Huffman + * code or not until all symbols have been decoded. + * + * Because the prefix code is assumed to be "canonical", it can be + * reconstructed directly from the codeword lengths. A prefix code is + * canonical if and only if a longer codeword never lexicographically + * precedes a shorter codeword, and the lexicographic ordering of + * codewords of the same length is the same as the lexicographic ordering + * of the corresponding symbols. Consequently, we can sort the symbols + * primarily by codeword length and secondarily by symbol value, then + * reconstruct the prefix code by generating codewords lexicographically + * in that order. + * + * This function does not, however, generate the prefix code explicitly. + * Instead, it directly builds a table for decoding symbols using the + * code. The basic idea is this: given the next 'max_codeword_len' bits + * in the input, we can look up the decoded symbol by indexing a table + * containing 2**max_codeword_len entries. A codeword with length + * 'max_codeword_len' will have exactly one entry in this table, whereas + * a codeword shorter than 'max_codeword_len' will have multiple entries + * in this table. Precisely, a codeword of length n will be represented + * by 2**(max_codeword_len - n) entries in this table. The 0-based index + * of each such entry will contain the corresponding codeword as a prefix + * when zero-padded on the left to 'max_codeword_len' binary digits. + * + * That's the basic idea, but we implement two optimizations regarding + * the format of the decode table itself: + * + * - For many compression formats, the maximum codeword length is too + * long for it to be efficient to build the full decoding table + * whenever a new prefix code is used. Instead, we can build the table + * using only 2**table_bits entries, where 'table_bits' is some number + * less than or equal to 'max_codeword_len'. Then, only codewords of + * length 'table_bits' and shorter can be directly looked up. For + * longer codewords, the direct lookup instead produces the root of a + * binary tree. Using this tree, the decoder can do traditional + * bit-by-bit decoding of the remainder of the codeword. Child nodes + * are allocated in extra entries at the end of the table; leaf nodes + * contain symbols. Note that the long-codeword case is, in general, + * not performance critical, since in Huffman codes the most frequently + * used symbols are assigned the shortest codeword lengths. + * + * - When we decode a symbol using a direct lookup of the table, we still + * need to know its length so that the bitstream can be advanced by the + * appropriate number of bits. The simple solution is to simply retain + * the 'lens' array and use the decoded symbol as an index into it. + * However, this requires two separate array accesses in the fast path. + * The optimization is to store the length directly in the decode + * table. We use the bottom 11 bits for the symbol and the top 5 bits + * for the length. In addition, to combine this optimization with the + * previous one, we introduce a special case where the top 2 bits of + * the length are both set if the entry is actually the root of a + * binary tree. + * + * @decode_table: + * The array in which to create the decoding table. This must have + * a length of at least ((2**table_bits) + 2 * num_syms) entries. + * + * @num_syms: + * The number of symbols in the alphabet; also, the length of the + * 'lens' array. Must be less than or equal to 2048. + * + * @table_bits: + * The order of the decode table size, as explained above. Must be + * less than or equal to 13. + * + * @lens: + * An array of length @num_syms, indexable by symbol, that gives the + * length of the codeword, in bits, for that symbol. The length can + * be 0, which means that the symbol does not have a codeword + * assigned. + * + * @max_codeword_len: + * The longest codeword length allowed in the compression format. + * All entries in 'lens' must be less than or equal to this value. + * This must be less than or equal to 23. + * + * @working_space + * A temporary array of length '2 * (max_codeword_len + 1) + + * num_syms'. + * + * Returns 0 on success, or -1 if the lengths do not form a valid prefix + * code. + */ +int make_huffman_decode_table(u16 decode_table[], const u32 num_syms, + const u32 table_bits, const u8 lens[], + const u32 max_codeword_len, + u16 working_space[]) +{ + const u32 table_num_entries = 1 << table_bits; + u16 * const len_counts = &working_space[0]; + u16 * const offsets = &working_space[1 * (max_codeword_len + 1)]; + u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)]; + int left; + void *decode_table_ptr; + u32 sym_idx; + u32 codeword_len; + u32 stores_per_loop; + u32 decode_table_pos; + u32 len; + u32 sym; + + /* Count how many symbols have each possible codeword length. + * Note that a length of 0 indicates the corresponding symbol is not + * used in the code and therefore does not have a codeword. + */ + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + for (sym = 0; sym < num_syms; sym++) + len_counts[lens[sym]]++; + + /* We can assume all lengths are <= max_codeword_len, but we + * cannot assume they form a valid prefix code. A codeword of + * length n should require a proportion of the codespace equaling + * (1/2)^n. The code is valid if and only if the codespace is + * exactly filled by the lengths, by this measure. + */ + left = 1; + for (len = 1; len <= max_codeword_len; len++) { + left <<= 1; + left -= len_counts[len]; + if (left < 0) { + /* The lengths overflow the codespace; that is, the code + * is over-subscribed. + */ + return -1; + } + } + + if (left) { + /* The lengths do not fill the codespace; that is, they form an + * incomplete set. + */ + if (left == (1 << max_codeword_len)) { + /* The code is completely empty. This is arguably + * invalid, but in fact it is valid in LZX and XPRESS, + * so we must allow it. By definition, no symbols can + * be decoded with an empty code. Consequently, we + * technically don't even need to fill in the decode + * table. However, to avoid accessing uninitialized + * memory if the algorithm nevertheless attempts to + * decode symbols using such a code, we zero out the + * decode table. + */ + memset(decode_table, 0, + table_num_entries * sizeof(decode_table[0])); + return 0; + } + return -1; + } + + /* Sort the symbols primarily by length and secondarily by symbol order. + */ + + /* Initialize 'offsets' so that offsets[len] for 1 <= len <= + * max_codeword_len is the number of codewords shorter than 'len' bits. + */ + offsets[1] = 0; + for (len = 1; len < max_codeword_len; len++) + offsets[len + 1] = offsets[len] + len_counts[len]; + + /* Use the 'offsets' array to sort the symbols. Note that we do not + * include symbols that are not used in the code. Consequently, fewer + * than 'num_syms' entries in 'sorted_syms' may be filled. + */ + for (sym = 0; sym < num_syms; sym++) + if (lens[sym]) + sorted_syms[offsets[lens[sym]]++] = sym; + + /* Fill entries for codewords with length <= table_bits + * --- that is, those short enough for a direct mapping. + * + * The table will start with entries for the shortest codeword(s), which + * have the most entries. From there, the number of entries per + * codeword will decrease. + */ + decode_table_ptr = decode_table; + sym_idx = 0; + codeword_len = 1; + stores_per_loop = (1 << (table_bits - codeword_len)); + for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) { + u32 end_sym_idx = sym_idx + len_counts[codeword_len]; + + for (; sym_idx < end_sym_idx; sym_idx++) { + u16 entry; + u16 *p; + u32 n; + + entry = ((u32)codeword_len << 11) | sorted_syms[sym_idx]; + p = (u16 *)decode_table_ptr; + n = stores_per_loop; + + do { + *p++ = entry; + } while (--n); + + decode_table_ptr = p; + } + } + + /* If we've filled in the entire table, we are done. Otherwise, + * there are codewords longer than table_bits for which we must + * generate binary trees. + */ + decode_table_pos = (u16 *)decode_table_ptr - decode_table; + if (decode_table_pos != table_num_entries) { + u32 j; + u32 next_free_tree_slot; + u32 cur_codeword; + + /* First, zero out the remaining entries. This is + * necessary so that these entries appear as + * "unallocated" in the next part. Each of these entries + * will eventually be filled with the representation of + * the root node of a binary tree. + */ + j = decode_table_pos; + do { + decode_table[j] = 0; + } while (++j != table_num_entries); + + /* We allocate child nodes starting at the end of the + * direct lookup table. Note that there should be + * 2*num_syms extra entries for this purpose, although + * fewer than this may actually be needed. + */ + next_free_tree_slot = table_num_entries; + + /* Iterate through each codeword with length greater than + * 'table_bits', primarily in order of codeword length + * and secondarily in order of symbol. + */ + for (cur_codeword = decode_table_pos << 1; + codeword_len <= max_codeword_len; + codeword_len++, cur_codeword <<= 1) { + u32 end_sym_idx = sym_idx + len_counts[codeword_len]; + + for (; sym_idx < end_sym_idx; sym_idx++, cur_codeword++) { + /* 'sorted_sym' is the symbol represented by the + * codeword. + */ + u32 sorted_sym = sorted_syms[sym_idx]; + u32 extra_bits = codeword_len - table_bits; + u32 node_idx = cur_codeword >> extra_bits; + + /* Go through each bit of the current codeword + * beyond the prefix of length @table_bits and + * walk the appropriate binary tree, allocating + * any slots that have not yet been allocated. + * + * Note that the 'pointer' entry to the binary + * tree, which is stored in the direct lookup + * portion of the table, is represented + * identically to other internal (non-leaf) + * nodes of the binary tree; it can be thought + * of as simply the root of the tree. The + * representation of these internal nodes is + * simply the index of the left child combined + * with the special bits 0xC000 to distinguish + * the entry from direct mapping and leaf node + * entries. + */ + do { + /* At least one bit remains in the + * codeword, but the current node is an + * unallocated leaf. Change it to an + * internal node. + */ + if (decode_table[node_idx] == 0) { + decode_table[node_idx] = + next_free_tree_slot | 0xC000; + decode_table[next_free_tree_slot++] = 0; + decode_table[next_free_tree_slot++] = 0; + } + + /* Go to the left child if the next bit + * in the codeword is 0; otherwise go to + * the right child. + */ + node_idx = decode_table[node_idx] & 0x3FFF; + --extra_bits; + node_idx += (cur_codeword >> extra_bits) & 1; + } while (extra_bits != 0); + + /* We've traversed the tree using the entire + * codeword, and we're now at the entry where + * the actual symbol will be stored. This is + * distinguished from internal nodes by not + * having its high two bits set. + */ + decode_table[node_idx] = sorted_sym; + } + } + } + return 0; +} diff --git a/fs/ntfs3/lib/decompress_common.h b/fs/ntfs3/lib/decompress_common.h new file mode 100644 index 000000000..dd7ced000 --- /dev/null +++ b/fs/ntfs3/lib/decompress_common.h @@ -0,0 +1,343 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * decompress_common.h - Code shared by the XPRESS and LZX decompressors + * + * Copyright (C) 2015 Eric Biggers + */ + +#ifndef _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H +#define _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H + +#include <linux/string.h> +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <asm/unaligned.h> + + +/* "Force inline" macro (not required, but helpful for performance) */ +#define forceinline __always_inline + +/* Enable whole-word match copying on selected architectures */ +#if defined(__i386__) || defined(__x86_64__) || defined(__ARM_FEATURE_UNALIGNED) +# define FAST_UNALIGNED_ACCESS +#endif + +/* Size of a machine word */ +#define WORDBYTES (sizeof(size_t)) + +static forceinline void +copy_unaligned_word(const void *src, void *dst) +{ + put_unaligned(get_unaligned((const size_t *)src), (size_t *)dst); +} + + +/* Generate a "word" with platform-dependent size whose bytes all contain the + * value 'b'. + */ +static forceinline size_t repeat_byte(u8 b) +{ + size_t v; + + v = b; + v |= v << 8; + v |= v << 16; + v |= v << ((WORDBYTES == 8) ? 32 : 0); + return v; +} + +/* Structure that encapsulates a block of in-memory data being interpreted as a + * stream of bits, optionally with interwoven literal bytes. Bits are assumed + * to be stored in little endian 16-bit coding units, with the bits ordered high + * to low. + */ +struct input_bitstream { + + /* Bits that have been read from the input buffer. The bits are + * left-justified; the next bit is always bit 31. + */ + u32 bitbuf; + + /* Number of bits currently held in @bitbuf. */ + u32 bitsleft; + + /* Pointer to the next byte to be retrieved from the input buffer. */ + const u8 *next; + + /* Pointer to just past the end of the input buffer. */ + const u8 *end; +}; + +/* Initialize a bitstream to read from the specified input buffer. */ +static forceinline void init_input_bitstream(struct input_bitstream *is, + const void *buffer, u32 size) +{ + is->bitbuf = 0; + is->bitsleft = 0; + is->next = buffer; + is->end = is->next + size; +} + +/* Ensure the bit buffer variable for the bitstream contains at least @num_bits + * bits. Following this, bitstream_peek_bits() and/or bitstream_remove_bits() + * may be called on the bitstream to peek or remove up to @num_bits bits. Note + * that @num_bits must be <= 16. + */ +static forceinline void bitstream_ensure_bits(struct input_bitstream *is, + u32 num_bits) +{ + if (is->bitsleft < num_bits) { + if (is->end - is->next >= 2) { + is->bitbuf |= (u32)get_unaligned_le16(is->next) + << (16 - is->bitsleft); + is->next += 2; + } + is->bitsleft += 16; + } +} + +/* Return the next @num_bits bits from the bitstream, without removing them. + * There must be at least @num_bits remaining in the buffer variable, from a + * previous call to bitstream_ensure_bits(). + */ +static forceinline u32 +bitstream_peek_bits(const struct input_bitstream *is, const u32 num_bits) +{ + return (is->bitbuf >> 1) >> (sizeof(is->bitbuf) * 8 - num_bits - 1); +} + +/* Remove @num_bits from the bitstream. There must be at least @num_bits + * remaining in the buffer variable, from a previous call to + * bitstream_ensure_bits(). + */ +static forceinline void +bitstream_remove_bits(struct input_bitstream *is, u32 num_bits) +{ + is->bitbuf <<= num_bits; + is->bitsleft -= num_bits; +} + +/* Remove and return @num_bits bits from the bitstream. There must be at least + * @num_bits remaining in the buffer variable, from a previous call to + * bitstream_ensure_bits(). + */ +static forceinline u32 +bitstream_pop_bits(struct input_bitstream *is, u32 num_bits) +{ + u32 bits = bitstream_peek_bits(is, num_bits); + + bitstream_remove_bits(is, num_bits); + return bits; +} + +/* Read and return the next @num_bits bits from the bitstream. */ +static forceinline u32 +bitstream_read_bits(struct input_bitstream *is, u32 num_bits) +{ + bitstream_ensure_bits(is, num_bits); + return bitstream_pop_bits(is, num_bits); +} + +/* Read and return the next literal byte embedded in the bitstream. */ +static forceinline u8 +bitstream_read_byte(struct input_bitstream *is) +{ + if (unlikely(is->end == is->next)) + return 0; + return *is->next++; +} + +/* Read and return the next 16-bit integer embedded in the bitstream. */ +static forceinline u16 +bitstream_read_u16(struct input_bitstream *is) +{ + u16 v; + + if (unlikely(is->end - is->next < 2)) + return 0; + v = get_unaligned_le16(is->next); + is->next += 2; + return v; +} + +/* Read and return the next 32-bit integer embedded in the bitstream. */ +static forceinline u32 +bitstream_read_u32(struct input_bitstream *is) +{ + u32 v; + + if (unlikely(is->end - is->next < 4)) + return 0; + v = get_unaligned_le32(is->next); + is->next += 4; + return v; +} + +/* Read into @dst_buffer an array of literal bytes embedded in the bitstream. + * Return either a pointer to the byte past the last written, or NULL if the + * read overflows the input buffer. + */ +static forceinline void *bitstream_read_bytes(struct input_bitstream *is, + void *dst_buffer, size_t count) +{ + if ((size_t)(is->end - is->next) < count) + return NULL; + memcpy(dst_buffer, is->next, count); + is->next += count; + return (u8 *)dst_buffer + count; +} + +/* Align the input bitstream on a coding-unit boundary. */ +static forceinline void bitstream_align(struct input_bitstream *is) +{ + is->bitsleft = 0; + is->bitbuf = 0; +} + +extern int make_huffman_decode_table(u16 decode_table[], const u32 num_syms, + const u32 num_bits, const u8 lens[], + const u32 max_codeword_len, + u16 working_space[]); + + +/* Reads and returns the next Huffman-encoded symbol from a bitstream. If the + * input data is exhausted, the Huffman symbol is decoded as if the missing bits + * are all zeroes. + */ +static forceinline u32 read_huffsym(struct input_bitstream *istream, + const u16 decode_table[], + u32 table_bits, + u32 max_codeword_len) +{ + u32 entry; + u32 key_bits; + + bitstream_ensure_bits(istream, max_codeword_len); + + /* Index the decode table by the next table_bits bits of the input. */ + key_bits = bitstream_peek_bits(istream, table_bits); + entry = decode_table[key_bits]; + if (entry < 0xC000) { + /* Fast case: The decode table directly provided the + * symbol and codeword length. The low 11 bits are the + * symbol, and the high 5 bits are the codeword length. + */ + bitstream_remove_bits(istream, entry >> 11); + return entry & 0x7FF; + } + /* Slow case: The codeword for the symbol is longer than + * table_bits, so the symbol does not have an entry + * directly in the first (1 << table_bits) entries of the + * decode table. Traverse the appropriate binary tree + * bit-by-bit to decode the symbol. + */ + bitstream_remove_bits(istream, table_bits); + do { + key_bits = (entry & 0x3FFF) + bitstream_pop_bits(istream, 1); + } while ((entry = decode_table[key_bits]) >= 0xC000); + return entry; +} + +/* + * Copy an LZ77 match at (dst - offset) to dst. + * + * The length and offset must be already validated --- that is, (dst - offset) + * can't underrun the output buffer, and (dst + length) can't overrun the output + * buffer. Also, the length cannot be 0. + * + * @bufend points to the byte past the end of the output buffer. This function + * won't write any data beyond this position. + * + * Returns dst + length. + */ +static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend, + u32 min_length) +{ + const u8 *src = dst - offset; + + /* + * Try to copy one machine word at a time. On i386 and x86_64 this is + * faster than copying one byte at a time, unless the data is + * near-random and all the matches have very short lengths. Note that + * since this requires unaligned memory accesses, it won't necessarily + * be faster on every architecture. + * + * Also note that we might copy more than the length of the match. For + * example, if a word is 8 bytes and the match is of length 5, then + * we'll simply copy 8 bytes. This is okay as long as we don't write + * beyond the end of the output buffer, hence the check for (bufend - + * end >= WORDBYTES - 1). + */ +#ifdef FAST_UNALIGNED_ACCESS + u8 * const end = dst + length; + + if (bufend - end >= (ptrdiff_t)(WORDBYTES - 1)) { + + if (offset >= WORDBYTES) { + /* The source and destination words don't overlap. */ + + /* To improve branch prediction, one iteration of this + * loop is unrolled. Most matches are short and will + * fail the first check. But if that check passes, then + * it becomes increasing likely that the match is long + * and we'll need to continue copying. + */ + + copy_unaligned_word(src, dst); + src += WORDBYTES; + dst += WORDBYTES; + + if (dst < end) { + do { + copy_unaligned_word(src, dst); + src += WORDBYTES; + dst += WORDBYTES; + } while (dst < end); + } + return end; + } else if (offset == 1) { + + /* Offset 1 matches are equivalent to run-length + * encoding of the previous byte. This case is common + * if the data contains many repeated bytes. + */ + size_t v = repeat_byte(*(dst - 1)); + + do { + put_unaligned(v, (size_t *)dst); + src += WORDBYTES; + dst += WORDBYTES; + } while (dst < end); + return end; + } + /* + * We don't bother with special cases for other 'offset < + * WORDBYTES', which are usually rarer than 'offset == 1'. Extra + * checks will just slow things down. Actually, it's possible + * to handle all the 'offset < WORDBYTES' cases using the same + * code, but it still becomes more complicated doesn't seem any + * faster overall; it definitely slows down the more common + * 'offset == 1' case. + */ + } +#endif /* FAST_UNALIGNED_ACCESS */ + + /* Fall back to a bytewise copy. */ + + if (min_length >= 2) { + *dst++ = *src++; + length--; + } + if (min_length >= 3) { + *dst++ = *src++; + length--; + } + do { + *dst++ = *src++; + } while (--length); + + return dst; +} + +#endif /* _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H */ diff --git a/fs/ntfs3/lib/lib.h b/fs/ntfs3/lib/lib.h new file mode 100644 index 000000000..90309a5ae --- /dev/null +++ b/fs/ntfs3/lib/lib.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Adapted for linux kernel by Alexander Mamaev: + * - remove implementations of get_unaligned_ + * - assume GCC is always defined + * - ISO C90 + * - linux kernel code style + */ + +#ifndef _LINUX_NTFS3_LIB_LIB_H +#define _LINUX_NTFS3_LIB_LIB_H + +#include <linux/types.h> + +/* globals from xpress_decompress.c */ +struct xpress_decompressor *xpress_allocate_decompressor(void); +void xpress_free_decompressor(struct xpress_decompressor *d); +int xpress_decompress(struct xpress_decompressor *__restrict d, + const void *__restrict compressed_data, + size_t compressed_size, + void *__restrict uncompressed_data, + size_t uncompressed_size); + +/* globals from lzx_decompress.c */ +struct lzx_decompressor *lzx_allocate_decompressor(void); +void lzx_free_decompressor(struct lzx_decompressor *d); +int lzx_decompress(struct lzx_decompressor *__restrict d, + const void *__restrict compressed_data, + size_t compressed_size, void *__restrict uncompressed_data, + size_t uncompressed_size); + +#endif /* _LINUX_NTFS3_LIB_LIB_H */ diff --git a/fs/ntfs3/lib/lzx_decompress.c b/fs/ntfs3/lib/lzx_decompress.c new file mode 100644 index 000000000..6b16f0707 --- /dev/null +++ b/fs/ntfs3/lib/lzx_decompress.c @@ -0,0 +1,670 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * lzx_decompress.c - A decompressor for the LZX compression format, which can + * be used in "System Compressed" files. This is based on the code from wimlib. + * This code only supports a window size (dictionary size) of 32768 bytes, since + * this is the only size used in System Compression. + * + * Copyright (C) 2015 Eric Biggers + */ + +#include "decompress_common.h" +#include "lib.h" + +/* Number of literal byte values */ +#define LZX_NUM_CHARS 256 + +/* The smallest and largest allowed match lengths */ +#define LZX_MIN_MATCH_LEN 2 +#define LZX_MAX_MATCH_LEN 257 + +/* Number of distinct match lengths that can be represented */ +#define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) + +/* Number of match lengths for which no length symbol is required */ +#define LZX_NUM_PRIMARY_LENS 7 +#define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1) + +/* Valid values of the 3-bit block type field */ +#define LZX_BLOCKTYPE_VERBATIM 1 +#define LZX_BLOCKTYPE_ALIGNED 2 +#define LZX_BLOCKTYPE_UNCOMPRESSED 3 + +/* Number of offset slots for a window size of 32768 */ +#define LZX_NUM_OFFSET_SLOTS 30 + +/* Number of symbols in the main code for a window size of 32768 */ +#define LZX_MAINCODE_NUM_SYMBOLS \ + (LZX_NUM_CHARS + (LZX_NUM_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS)) + +/* Number of symbols in the length code */ +#define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS) + +/* Number of symbols in the precode */ +#define LZX_PRECODE_NUM_SYMBOLS 20 + +/* Number of bits in which each precode codeword length is represented */ +#define LZX_PRECODE_ELEMENT_SIZE 4 + +/* Number of low-order bits of each match offset that are entropy-encoded in + * aligned offset blocks + */ +#define LZX_NUM_ALIGNED_OFFSET_BITS 3 + +/* Number of symbols in the aligned offset code */ +#define LZX_ALIGNEDCODE_NUM_SYMBOLS (1 << LZX_NUM_ALIGNED_OFFSET_BITS) + +/* Mask for the match offset bits that are entropy-encoded in aligned offset + * blocks + */ +#define LZX_ALIGNED_OFFSET_BITMASK ((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1) + +/* Number of bits in which each aligned offset codeword length is represented */ +#define LZX_ALIGNEDCODE_ELEMENT_SIZE 3 + +/* Maximum lengths (in bits) of the codewords in each Huffman code */ +#define LZX_MAX_MAIN_CODEWORD_LEN 16 +#define LZX_MAX_LEN_CODEWORD_LEN 16 +#define LZX_MAX_PRE_CODEWORD_LEN ((1 << LZX_PRECODE_ELEMENT_SIZE) - 1) +#define LZX_MAX_ALIGNED_CODEWORD_LEN ((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1) + +/* The default "filesize" value used in pre/post-processing. In the LZX format + * used in cabinet files this value must be given to the decompressor, whereas + * in the LZX format used in WIM files and system-compressed files this value is + * fixed at 12000000. + */ +#define LZX_DEFAULT_FILESIZE 12000000 + +/* Assumed block size when the encoded block size begins with a 0 bit. */ +#define LZX_DEFAULT_BLOCK_SIZE 32768 + +/* Number of offsets in the recent (or "repeat") offsets queue. */ +#define LZX_NUM_RECENT_OFFSETS 3 + +/* These values are chosen for fast decompression. */ +#define LZX_MAINCODE_TABLEBITS 11 +#define LZX_LENCODE_TABLEBITS 10 +#define LZX_PRECODE_TABLEBITS 6 +#define LZX_ALIGNEDCODE_TABLEBITS 7 + +#define LZX_READ_LENS_MAX_OVERRUN 50 + +/* Mapping: offset slot => first match offset that uses that offset slot. + */ +static const u32 lzx_offset_slot_base[LZX_NUM_OFFSET_SLOTS + 1] = { + 0, 1, 2, 3, 4, /* 0 --- 4 */ + 6, 8, 12, 16, 24, /* 5 --- 9 */ + 32, 48, 64, 96, 128, /* 10 --- 14 */ + 192, 256, 384, 512, 768, /* 15 --- 19 */ + 1024, 1536, 2048, 3072, 4096, /* 20 --- 24 */ + 6144, 8192, 12288, 16384, 24576, /* 25 --- 29 */ + 32768, /* extra */ +}; + +/* Mapping: offset slot => how many extra bits must be read and added to the + * corresponding offset slot base to decode the match offset. + */ +static const u8 lzx_extra_offset_bits[LZX_NUM_OFFSET_SLOTS] = { + 0, 0, 0, 0, 1, + 1, 2, 2, 3, 3, + 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, + 9, 9, 10, 10, 11, + 11, 12, 12, 13, 13, +}; + +/* Reusable heap-allocated memory for LZX decompression */ +struct lzx_decompressor { + + /* Huffman decoding tables, and arrays that map symbols to codeword + * lengths + */ + + u16 maincode_decode_table[(1 << LZX_MAINCODE_TABLEBITS) + + (LZX_MAINCODE_NUM_SYMBOLS * 2)]; + u8 maincode_lens[LZX_MAINCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN]; + + + u16 lencode_decode_table[(1 << LZX_LENCODE_TABLEBITS) + + (LZX_LENCODE_NUM_SYMBOLS * 2)]; + u8 lencode_lens[LZX_LENCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN]; + + + u16 alignedcode_decode_table[(1 << LZX_ALIGNEDCODE_TABLEBITS) + + (LZX_ALIGNEDCODE_NUM_SYMBOLS * 2)]; + u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS]; + + u16 precode_decode_table[(1 << LZX_PRECODE_TABLEBITS) + + (LZX_PRECODE_NUM_SYMBOLS * 2)]; + u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS]; + + /* Temporary space for make_huffman_decode_table() */ + u16 working_space[2 * (1 + LZX_MAX_MAIN_CODEWORD_LEN) + + LZX_MAINCODE_NUM_SYMBOLS]; +}; + +static void undo_e8_translation(void *target, s32 input_pos) +{ + s32 abs_offset, rel_offset; + + abs_offset = get_unaligned_le32(target); + if (abs_offset >= 0) { + if (abs_offset < LZX_DEFAULT_FILESIZE) { + /* "good translation" */ + rel_offset = abs_offset - input_pos; + put_unaligned_le32(rel_offset, target); + } + } else { + if (abs_offset >= -input_pos) { + /* "compensating translation" */ + rel_offset = abs_offset + LZX_DEFAULT_FILESIZE; + put_unaligned_le32(rel_offset, target); + } + } +} + +/* + * Undo the 'E8' preprocessing used in LZX. Before compression, the + * uncompressed data was preprocessed by changing the targets of suspected x86 + * CALL instructions from relative offsets to absolute offsets. After + * match/literal decoding, the decompressor must undo the translation. + */ +static void lzx_postprocess(u8 *data, u32 size) +{ + /* + * A worthwhile optimization is to push the end-of-buffer check into the + * relatively rare E8 case. This is possible if we replace the last six + * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte + * before reaching end-of-buffer. In addition, this scheme guarantees + * that no translation can begin following an E8 byte in the last 10 + * bytes because a 4-byte offset containing E8 as its high byte is a + * large negative number that is not valid for translation. That is + * exactly what we need. + */ + u8 *tail; + u8 saved_bytes[6]; + u8 *p; + + if (size <= 10) + return; + + tail = &data[size - 6]; + memcpy(saved_bytes, tail, 6); + memset(tail, 0xE8, 6); + p = data; + for (;;) { + while (*p != 0xE8) + p++; + if (p >= tail) + break; + undo_e8_translation(p + 1, p - data); + p += 5; + } + memcpy(tail, saved_bytes, 6); +} + +/* Read a Huffman-encoded symbol using the precode. */ +static forceinline u32 read_presym(const struct lzx_decompressor *d, + struct input_bitstream *is) +{ + return read_huffsym(is, d->precode_decode_table, + LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN); +} + +/* Read a Huffman-encoded symbol using the main code. */ +static forceinline u32 read_mainsym(const struct lzx_decompressor *d, + struct input_bitstream *is) +{ + return read_huffsym(is, d->maincode_decode_table, + LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN); +} + +/* Read a Huffman-encoded symbol using the length code. */ +static forceinline u32 read_lensym(const struct lzx_decompressor *d, + struct input_bitstream *is) +{ + return read_huffsym(is, d->lencode_decode_table, + LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN); +} + +/* Read a Huffman-encoded symbol using the aligned offset code. */ +static forceinline u32 read_alignedsym(const struct lzx_decompressor *d, + struct input_bitstream *is) +{ + return read_huffsym(is, d->alignedcode_decode_table, + LZX_ALIGNEDCODE_TABLEBITS, + LZX_MAX_ALIGNED_CODEWORD_LEN); +} + +/* + * Read the precode from the compressed input bitstream, then use it to decode + * @num_lens codeword length values. + * + * @is: The input bitstream. + * + * @lens: An array that contains the length values from the previous time + * the codeword lengths for this Huffman code were read, or all 0's + * if this is the first time. This array must have at least + * (@num_lens + LZX_READ_LENS_MAX_OVERRUN) entries. + * + * @num_lens: Number of length values to decode. + * + * Returns 0 on success, or -1 if the data was invalid. + */ +static int lzx_read_codeword_lens(struct lzx_decompressor *d, + struct input_bitstream *is, + u8 *lens, u32 num_lens) +{ + u8 *len_ptr = lens; + u8 *lens_end = lens + num_lens; + int i; + + /* Read the lengths of the precode codewords. These are given + * explicitly. + */ + for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) { + d->precode_lens[i] = + bitstream_read_bits(is, LZX_PRECODE_ELEMENT_SIZE); + } + + /* Make the decoding table for the precode. */ + if (make_huffman_decode_table(d->precode_decode_table, + LZX_PRECODE_NUM_SYMBOLS, + LZX_PRECODE_TABLEBITS, + d->precode_lens, + LZX_MAX_PRE_CODEWORD_LEN, + d->working_space)) + return -1; + + /* Decode the codeword lengths. */ + do { + u32 presym; + u8 len; + + /* Read the next precode symbol. */ + presym = read_presym(d, is); + if (presym < 17) { + /* Difference from old length */ + len = *len_ptr - presym; + if ((s8)len < 0) + len += 17; + *len_ptr++ = len; + } else { + /* Special RLE values */ + + u32 run_len; + + if (presym == 17) { + /* Run of 0's */ + run_len = 4 + bitstream_read_bits(is, 4); + len = 0; + } else if (presym == 18) { + /* Longer run of 0's */ + run_len = 20 + bitstream_read_bits(is, 5); + len = 0; + } else { + /* Run of identical lengths */ + run_len = 4 + bitstream_read_bits(is, 1); + presym = read_presym(d, is); + if (presym > 17) + return -1; + len = *len_ptr - presym; + if ((s8)len < 0) + len += 17; + } + + do { + *len_ptr++ = len; + } while (--run_len); + /* Worst case overrun is when presym == 18, + * run_len == 20 + 31, and only 1 length was remaining. + * So LZX_READ_LENS_MAX_OVERRUN == 50. + * + * Overrun while reading the first half of maincode_lens + * can corrupt the previous values in the second half. + * This doesn't really matter because the resulting + * lengths will still be in range, and data that + * generates overruns is invalid anyway. + */ + } + } while (len_ptr < lens_end); + + return 0; +} + +/* + * Read the header of an LZX block and save the block type and (uncompressed) + * size in *block_type_ret and *block_size_ret, respectively. + * + * If the block is compressed, also update the Huffman decode @tables with the + * new Huffman codes. If the block is uncompressed, also update the match + * offset @queue with the new match offsets. + * + * Return 0 on success, or -1 if the data was invalid. + */ +static int lzx_read_block_header(struct lzx_decompressor *d, + struct input_bitstream *is, + int *block_type_ret, + u32 *block_size_ret, + u32 recent_offsets[]) +{ + int block_type; + u32 block_size; + int i; + + bitstream_ensure_bits(is, 4); + + /* The first three bits tell us what kind of block it is, and should be + * one of the LZX_BLOCKTYPE_* values. + */ + block_type = bitstream_pop_bits(is, 3); + + /* Read the block size. */ + if (bitstream_pop_bits(is, 1)) { + block_size = LZX_DEFAULT_BLOCK_SIZE; + } else { + block_size = 0; + block_size |= bitstream_read_bits(is, 8); + block_size <<= 8; + block_size |= bitstream_read_bits(is, 8); + } + + switch (block_type) { + + case LZX_BLOCKTYPE_ALIGNED: + + /* Read the aligned offset code and prepare its decode table. + */ + + for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + d->alignedcode_lens[i] = + bitstream_read_bits(is, + LZX_ALIGNEDCODE_ELEMENT_SIZE); + } + + if (make_huffman_decode_table(d->alignedcode_decode_table, + LZX_ALIGNEDCODE_NUM_SYMBOLS, + LZX_ALIGNEDCODE_TABLEBITS, + d->alignedcode_lens, + LZX_MAX_ALIGNED_CODEWORD_LEN, + d->working_space)) + return -1; + + /* Fall though, since the rest of the header for aligned offset + * blocks is the same as that for verbatim blocks. + */ + fallthrough; + + case LZX_BLOCKTYPE_VERBATIM: + + /* Read the main code and prepare its decode table. + * + * Note that the codeword lengths in the main code are encoded + * in two parts: one part for literal symbols, and one part for + * match symbols. + */ + + if (lzx_read_codeword_lens(d, is, d->maincode_lens, + LZX_NUM_CHARS)) + return -1; + + if (lzx_read_codeword_lens(d, is, + d->maincode_lens + LZX_NUM_CHARS, + LZX_MAINCODE_NUM_SYMBOLS - LZX_NUM_CHARS)) + return -1; + + if (make_huffman_decode_table(d->maincode_decode_table, + LZX_MAINCODE_NUM_SYMBOLS, + LZX_MAINCODE_TABLEBITS, + d->maincode_lens, + LZX_MAX_MAIN_CODEWORD_LEN, + d->working_space)) + return -1; + + /* Read the length code and prepare its decode table. */ + + if (lzx_read_codeword_lens(d, is, d->lencode_lens, + LZX_LENCODE_NUM_SYMBOLS)) + return -1; + + if (make_huffman_decode_table(d->lencode_decode_table, + LZX_LENCODE_NUM_SYMBOLS, + LZX_LENCODE_TABLEBITS, + d->lencode_lens, + LZX_MAX_LEN_CODEWORD_LEN, + d->working_space)) + return -1; + + break; + + case LZX_BLOCKTYPE_UNCOMPRESSED: + + /* Before reading the three recent offsets from the uncompressed + * block header, the stream must be aligned on a 16-bit + * boundary. But if the stream is *already* aligned, then the + * next 16 bits must be discarded. + */ + bitstream_ensure_bits(is, 1); + bitstream_align(is); + + recent_offsets[0] = bitstream_read_u32(is); + recent_offsets[1] = bitstream_read_u32(is); + recent_offsets[2] = bitstream_read_u32(is); + + /* Offsets of 0 are invalid. */ + if (recent_offsets[0] == 0 || recent_offsets[1] == 0 || + recent_offsets[2] == 0) + return -1; + break; + + default: + /* Unrecognized block type. */ + return -1; + } + + *block_type_ret = block_type; + *block_size_ret = block_size; + return 0; +} + +/* Decompress a block of LZX-compressed data. */ +static int lzx_decompress_block(const struct lzx_decompressor *d, + struct input_bitstream *is, + int block_type, u32 block_size, + u8 * const out_begin, u8 *out_next, + u32 recent_offsets[]) +{ + u8 * const block_end = out_next + block_size; + u32 ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED); + + do { + u32 mainsym; + u32 match_len; + u32 match_offset; + u32 offset_slot; + u32 num_extra_bits; + + mainsym = read_mainsym(d, is); + if (mainsym < LZX_NUM_CHARS) { + /* Literal */ + *out_next++ = mainsym; + continue; + } + + /* Match */ + + /* Decode the length header and offset slot. */ + mainsym -= LZX_NUM_CHARS; + match_len = mainsym % LZX_NUM_LEN_HEADERS; + offset_slot = mainsym / LZX_NUM_LEN_HEADERS; + + /* If needed, read a length symbol to decode the full length. */ + if (match_len == LZX_NUM_PRIMARY_LENS) + match_len += read_lensym(d, is); + match_len += LZX_MIN_MATCH_LEN; + + if (offset_slot < LZX_NUM_RECENT_OFFSETS) { + /* Repeat offset */ + + /* Note: This isn't a real LRU queue, since using the R2 + * offset doesn't bump the R1 offset down to R2. This + * quirk allows all 3 recent offsets to be handled by + * the same code. (For R0, the swap is a no-op.) + */ + match_offset = recent_offsets[offset_slot]; + recent_offsets[offset_slot] = recent_offsets[0]; + recent_offsets[0] = match_offset; + } else { + /* Explicit offset */ + + /* Look up the number of extra bits that need to be read + * to decode offsets with this offset slot. + */ + num_extra_bits = lzx_extra_offset_bits[offset_slot]; + + /* Start with the offset slot base value. */ + match_offset = lzx_offset_slot_base[offset_slot]; + + /* In aligned offset blocks, the low-order 3 bits of + * each offset are encoded using the aligned offset + * code. Otherwise, all the extra bits are literal. + */ + + if ((num_extra_bits & ones_if_aligned) >= LZX_NUM_ALIGNED_OFFSET_BITS) { + match_offset += + bitstream_read_bits(is, num_extra_bits - + LZX_NUM_ALIGNED_OFFSET_BITS) + << LZX_NUM_ALIGNED_OFFSET_BITS; + match_offset += read_alignedsym(d, is); + } else { + match_offset += bitstream_read_bits(is, num_extra_bits); + } + + /* Adjust the offset. */ + match_offset -= (LZX_NUM_RECENT_OFFSETS - 1); + + /* Update the recent offsets. */ + recent_offsets[2] = recent_offsets[1]; + recent_offsets[1] = recent_offsets[0]; + recent_offsets[0] = match_offset; + } + + /* Validate the match, then copy it to the current position. */ + + if (match_len > (size_t)(block_end - out_next)) + return -1; + + if (match_offset > (size_t)(out_next - out_begin)) + return -1; + + out_next = lz_copy(out_next, match_len, match_offset, + block_end, LZX_MIN_MATCH_LEN); + + } while (out_next != block_end); + + return 0; +} + +/* + * lzx_allocate_decompressor - Allocate an LZX decompressor + * + * Return the pointer to the decompressor on success, or return NULL and set + * errno on failure. + */ +struct lzx_decompressor *lzx_allocate_decompressor(void) +{ + return kmalloc(sizeof(struct lzx_decompressor), GFP_NOFS); +} + +/* + * lzx_decompress - Decompress a buffer of LZX-compressed data + * + * @decompressor: A decompressor allocated with lzx_allocate_decompressor() + * @compressed_data: The buffer of data to decompress + * @compressed_size: Number of bytes of compressed data + * @uncompressed_data: The buffer in which to store the decompressed data + * @uncompressed_size: The number of bytes the data decompresses into + * + * Return 0 on success, or return -1 and set errno on failure. + */ +int lzx_decompress(struct lzx_decompressor *decompressor, + const void *compressed_data, size_t compressed_size, + void *uncompressed_data, size_t uncompressed_size) +{ + struct lzx_decompressor *d = decompressor; + u8 * const out_begin = uncompressed_data; + u8 *out_next = out_begin; + u8 * const out_end = out_begin + uncompressed_size; + struct input_bitstream is; + u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1}; + int e8_status = 0; + + init_input_bitstream(&is, compressed_data, compressed_size); + + /* Codeword lengths begin as all 0's for delta encoding purposes. */ + memset(d->maincode_lens, 0, LZX_MAINCODE_NUM_SYMBOLS); + memset(d->lencode_lens, 0, LZX_LENCODE_NUM_SYMBOLS); + + /* Decompress blocks until we have all the uncompressed data. */ + + while (out_next != out_end) { + int block_type; + u32 block_size; + + if (lzx_read_block_header(d, &is, &block_type, &block_size, + recent_offsets)) + goto invalid; + + if (block_size < 1 || block_size > (size_t)(out_end - out_next)) + goto invalid; + + if (block_type != LZX_BLOCKTYPE_UNCOMPRESSED) { + + /* Compressed block */ + + if (lzx_decompress_block(d, + &is, + block_type, + block_size, + out_begin, + out_next, + recent_offsets)) + goto invalid; + + e8_status |= d->maincode_lens[0xe8]; + out_next += block_size; + } else { + /* Uncompressed block */ + + out_next = bitstream_read_bytes(&is, out_next, + block_size); + if (!out_next) + goto invalid; + + if (block_size & 1) + bitstream_read_byte(&is); + + e8_status = 1; + } + } + + /* Postprocess the data unless it cannot possibly contain 0xe8 bytes. */ + if (e8_status) + lzx_postprocess(uncompressed_data, uncompressed_size); + + return 0; + +invalid: + return -1; +} + +/* + * lzx_free_decompressor - Free an LZX decompressor + * + * @decompressor: A decompressor that was allocated with + * lzx_allocate_decompressor(), or NULL. + */ +void lzx_free_decompressor(struct lzx_decompressor *decompressor) +{ + kfree(decompressor); +} diff --git a/fs/ntfs3/lib/xpress_decompress.c b/fs/ntfs3/lib/xpress_decompress.c new file mode 100644 index 000000000..769c6d3dd --- /dev/null +++ b/fs/ntfs3/lib/xpress_decompress.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * xpress_decompress.c - A decompressor for the XPRESS compression format + * (Huffman variant), which can be used in "System Compressed" files. This is + * based on the code from wimlib. + * + * Copyright (C) 2015 Eric Biggers + */ + +#include "decompress_common.h" +#include "lib.h" + +#define XPRESS_NUM_SYMBOLS 512 +#define XPRESS_MAX_CODEWORD_LEN 15 +#define XPRESS_MIN_MATCH_LEN 3 + +/* This value is chosen for fast decompression. */ +#define XPRESS_TABLEBITS 12 + +/* Reusable heap-allocated memory for XPRESS decompression */ +struct xpress_decompressor { + + /* The Huffman decoding table */ + u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS]; + + /* An array that maps symbols to codeword lengths */ + u8 lens[XPRESS_NUM_SYMBOLS]; + + /* Temporary space for make_huffman_decode_table() */ + u16 working_space[2 * (1 + XPRESS_MAX_CODEWORD_LEN) + + XPRESS_NUM_SYMBOLS]; +}; + +/* + * xpress_allocate_decompressor - Allocate an XPRESS decompressor + * + * Return the pointer to the decompressor on success, or return NULL and set + * errno on failure. + */ +struct xpress_decompressor *xpress_allocate_decompressor(void) +{ + return kmalloc(sizeof(struct xpress_decompressor), GFP_NOFS); +} + +/* + * xpress_decompress - Decompress a buffer of XPRESS-compressed data + * + * @decompressor: A decompressor that was allocated with + * xpress_allocate_decompressor() + * @compressed_data: The buffer of data to decompress + * @compressed_size: Number of bytes of compressed data + * @uncompressed_data: The buffer in which to store the decompressed data + * @uncompressed_size: The number of bytes the data decompresses into + * + * Return 0 on success, or return -1 and set errno on failure. + */ +int xpress_decompress(struct xpress_decompressor *decompressor, + const void *compressed_data, size_t compressed_size, + void *uncompressed_data, size_t uncompressed_size) +{ + struct xpress_decompressor *d = decompressor; + const u8 * const in_begin = compressed_data; + u8 * const out_begin = uncompressed_data; + u8 *out_next = out_begin; + u8 * const out_end = out_begin + uncompressed_size; + struct input_bitstream is; + u32 i; + + /* Read the Huffman codeword lengths. */ + if (compressed_size < XPRESS_NUM_SYMBOLS / 2) + goto invalid; + for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) { + d->lens[i*2 + 0] = in_begin[i] & 0xF; + d->lens[i*2 + 1] = in_begin[i] >> 4; + } + + /* Build a decoding table for the Huffman code. */ + if (make_huffman_decode_table(d->decode_table, XPRESS_NUM_SYMBOLS, + XPRESS_TABLEBITS, d->lens, + XPRESS_MAX_CODEWORD_LEN, + d->working_space)) + goto invalid; + + /* Decode the matches and literals. */ + + init_input_bitstream(&is, in_begin + XPRESS_NUM_SYMBOLS / 2, + compressed_size - XPRESS_NUM_SYMBOLS / 2); + + while (out_next != out_end) { + u32 sym; + u32 log2_offset; + u32 length; + u32 offset; + + sym = read_huffsym(&is, d->decode_table, + XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN); + if (sym < 256) { + /* Literal */ + *out_next++ = sym; + } else { + /* Match */ + length = sym & 0xf; + log2_offset = (sym >> 4) & 0xf; + + bitstream_ensure_bits(&is, 16); + + offset = ((u32)1 << log2_offset) | + bitstream_pop_bits(&is, log2_offset); + + if (length == 0xf) { + length += bitstream_read_byte(&is); + if (length == 0xf + 0xff) + length = bitstream_read_u16(&is); + } + length += XPRESS_MIN_MATCH_LEN; + + if (offset > (size_t)(out_next - out_begin)) + goto invalid; + + if (length > (size_t)(out_end - out_next)) + goto invalid; + + out_next = lz_copy(out_next, length, offset, out_end, + XPRESS_MIN_MATCH_LEN); + } + } + return 0; + +invalid: + return -1; +} + +/* + * xpress_free_decompressor - Free an XPRESS decompressor + * + * @decompressor: A decompressor that was allocated with + * xpress_allocate_decompressor(), or NULL. + */ +void xpress_free_decompressor(struct xpress_decompressor *decompressor) +{ + kfree(decompressor); +} diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c new file mode 100644 index 000000000..28f654561 --- /dev/null +++ b/fs/ntfs3/lznt.c @@ -0,0 +1,453 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "debug.h" +#include "ntfs_fs.h" + +// clang-format off +/* Src buffer is zero. */ +#define LZNT_ERROR_ALL_ZEROS 1 +#define LZNT_CHUNK_SIZE 0x1000 +// clang-format on + +struct lznt_hash { + const u8 *p1; + const u8 *p2; +}; + +struct lznt { + const u8 *unc; + const u8 *unc_end; + const u8 *best_match; + size_t max_len; + bool std; + + struct lznt_hash hash[LZNT_CHUNK_SIZE]; +}; + +static inline size_t get_match_len(const u8 *ptr, const u8 *end, const u8 *prev, + size_t max_len) +{ + size_t len = 0; + + while (ptr + len < end && ptr[len] == prev[len] && ++len < max_len) + ; + return len; +} + +static size_t longest_match_std(const u8 *src, struct lznt *ctx) +{ + size_t hash_index; + size_t len1 = 0, len2 = 0; + const u8 **hash; + + hash_index = + ((40543U * ((((src[0] << 4) ^ src[1]) << 4) ^ src[2])) >> 4) & + (LZNT_CHUNK_SIZE - 1); + + hash = &(ctx->hash[hash_index].p1); + + if (hash[0] >= ctx->unc && hash[0] < src && hash[0][0] == src[0] && + hash[0][1] == src[1] && hash[0][2] == src[2]) { + len1 = 3; + if (ctx->max_len > 3) + len1 += get_match_len(src + 3, ctx->unc_end, + hash[0] + 3, ctx->max_len - 3); + } + + if (hash[1] >= ctx->unc && hash[1] < src && hash[1][0] == src[0] && + hash[1][1] == src[1] && hash[1][2] == src[2]) { + len2 = 3; + if (ctx->max_len > 3) + len2 += get_match_len(src + 3, ctx->unc_end, + hash[1] + 3, ctx->max_len - 3); + } + + /* Compare two matches and select the best one. */ + if (len1 < len2) { + ctx->best_match = hash[1]; + len1 = len2; + } else { + ctx->best_match = hash[0]; + } + + hash[1] = hash[0]; + hash[0] = src; + return len1; +} + +static size_t longest_match_best(const u8 *src, struct lznt *ctx) +{ + size_t max_len; + const u8 *ptr; + + if (ctx->unc >= src || !ctx->max_len) + return 0; + + max_len = 0; + for (ptr = ctx->unc; ptr < src; ++ptr) { + size_t len = + get_match_len(src, ctx->unc_end, ptr, ctx->max_len); + if (len >= max_len) { + max_len = len; + ctx->best_match = ptr; + } + } + + return max_len >= 3 ? max_len : 0; +} + +static const size_t s_max_len[] = { + 0x1002, 0x802, 0x402, 0x202, 0x102, 0x82, 0x42, 0x22, 0x12, +}; + +static const size_t s_max_off[] = { + 0x10, 0x20, 0x40, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, +}; + +static inline u16 make_pair(size_t offset, size_t len, size_t index) +{ + return ((offset - 1) << (12 - index)) | + ((len - 3) & (((1 << (12 - index)) - 1))); +} + +static inline size_t parse_pair(u16 pair, size_t *offset, size_t index) +{ + *offset = 1 + (pair >> (12 - index)); + return 3 + (pair & ((1 << (12 - index)) - 1)); +} + +/* + * compress_chunk + * + * Return: + * * 0 - Ok, @cmpr contains @cmpr_chunk_size bytes of compressed data. + * * 1 - Input buffer is full zero. + * * -2 - The compressed buffer is too small to hold the compressed data. + */ +static inline int compress_chunk(size_t (*match)(const u8 *, struct lznt *), + const u8 *unc, const u8 *unc_end, u8 *cmpr, + u8 *cmpr_end, size_t *cmpr_chunk_size, + struct lznt *ctx) +{ + size_t cnt = 0; + size_t idx = 0; + const u8 *up = unc; + u8 *cp = cmpr + 3; + u8 *cp2 = cmpr + 2; + u8 not_zero = 0; + /* Control byte of 8-bit values: ( 0 - means byte as is, 1 - short pair ). */ + u8 ohdr = 0; + u8 *last; + u16 t16; + + if (unc + LZNT_CHUNK_SIZE < unc_end) + unc_end = unc + LZNT_CHUNK_SIZE; + + last = min(cmpr + LZNT_CHUNK_SIZE + sizeof(short), cmpr_end); + + ctx->unc = unc; + ctx->unc_end = unc_end; + ctx->max_len = s_max_len[0]; + + while (up < unc_end) { + size_t max_len; + + while (unc + s_max_off[idx] < up) + ctx->max_len = s_max_len[++idx]; + + /* Find match. */ + max_len = up + 3 <= unc_end ? (*match)(up, ctx) : 0; + + if (!max_len) { + if (cp >= last) + goto NotCompressed; + not_zero |= *cp++ = *up++; + } else if (cp + 1 >= last) { + goto NotCompressed; + } else { + t16 = make_pair(up - ctx->best_match, max_len, idx); + *cp++ = t16; + *cp++ = t16 >> 8; + + ohdr |= 1 << cnt; + up += max_len; + } + + cnt = (cnt + 1) & 7; + if (!cnt) { + *cp2 = ohdr; + ohdr = 0; + cp2 = cp; + cp += 1; + } + } + + if (cp2 < last) + *cp2 = ohdr; + else + cp -= 1; + + *cmpr_chunk_size = cp - cmpr; + + t16 = (*cmpr_chunk_size - 3) | 0xB000; + cmpr[0] = t16; + cmpr[1] = t16 >> 8; + + return not_zero ? 0 : LZNT_ERROR_ALL_ZEROS; + +NotCompressed: + + if ((cmpr + LZNT_CHUNK_SIZE + sizeof(short)) > last) + return -2; + + /* + * Copy non cmpr data. + * 0x3FFF == ((LZNT_CHUNK_SIZE + 2 - 3) | 0x3000) + */ + cmpr[0] = 0xff; + cmpr[1] = 0x3f; + + memcpy(cmpr + sizeof(short), unc, LZNT_CHUNK_SIZE); + *cmpr_chunk_size = LZNT_CHUNK_SIZE + sizeof(short); + + return 0; +} + +static inline ssize_t decompress_chunk(u8 *unc, u8 *unc_end, const u8 *cmpr, + const u8 *cmpr_end) +{ + u8 *up = unc; + u8 ch = *cmpr++; + size_t bit = 0; + size_t index = 0; + u16 pair; + size_t offset, length; + + /* Do decompression until pointers are inside range. */ + while (up < unc_end && cmpr < cmpr_end) { + /* Correct index */ + while (unc + s_max_off[index] < up) + index += 1; + + /* Check the current flag for zero. */ + if (!(ch & (1 << bit))) { + /* Just copy byte. */ + *up++ = *cmpr++; + goto next; + } + + /* Check for boundary. */ + if (cmpr + 1 >= cmpr_end) + return -EINVAL; + + /* Read a short from little endian stream. */ + pair = cmpr[1]; + pair <<= 8; + pair |= cmpr[0]; + + cmpr += 2; + + /* Translate packed information into offset and length. */ + length = parse_pair(pair, &offset, index); + + /* Check offset for boundary. */ + if (unc + offset > up) + return -EINVAL; + + /* Truncate the length if necessary. */ + if (up + length >= unc_end) + length = unc_end - up; + + /* Now we copy bytes. This is the heart of LZ algorithm. */ + for (; length > 0; length--, up++) + *up = *(up - offset); + +next: + /* Advance flag bit value. */ + bit = (bit + 1) & 7; + + if (!bit) { + if (cmpr >= cmpr_end) + break; + + ch = *cmpr++; + } + } + + /* Return the size of uncompressed data. */ + return up - unc; +} + +/* + * get_lznt_ctx + * @level: 0 - Standard compression. + * !0 - Best compression, requires a lot of cpu. + */ +struct lznt *get_lznt_ctx(int level) +{ + struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) + : sizeof(struct lznt), + GFP_NOFS); + + if (r) + r->std = !level; + return r; +} + +/* + * compress_lznt - Compresses @unc into @cmpr + * + * Return: + * * +x - Ok, @cmpr contains 'final_compressed_size' bytes of compressed data. + * * 0 - Input buffer is full zero. + */ +size_t compress_lznt(const void *unc, size_t unc_size, void *cmpr, + size_t cmpr_size, struct lznt *ctx) +{ + int err; + size_t (*match)(const u8 *src, struct lznt *ctx); + u8 *p = cmpr; + u8 *end = p + cmpr_size; + const u8 *unc_chunk = unc; + const u8 *unc_end = unc_chunk + unc_size; + bool is_zero = true; + + if (ctx->std) { + match = &longest_match_std; + memset(ctx->hash, 0, sizeof(ctx->hash)); + } else { + match = &longest_match_best; + } + + /* Compression cycle. */ + for (; unc_chunk < unc_end; unc_chunk += LZNT_CHUNK_SIZE) { + cmpr_size = 0; + err = compress_chunk(match, unc_chunk, unc_end, p, end, + &cmpr_size, ctx); + if (err < 0) + return unc_size; + + if (is_zero && err != LZNT_ERROR_ALL_ZEROS) + is_zero = false; + + p += cmpr_size; + } + + if (p <= end - 2) + p[0] = p[1] = 0; + + return is_zero ? 0 : PtrOffset(cmpr, p); +} + +/* + * decompress_lznt - Decompress @cmpr into @unc. + */ +ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc, + size_t unc_size) +{ + const u8 *cmpr_chunk = cmpr; + const u8 *cmpr_end = cmpr_chunk + cmpr_size; + u8 *unc_chunk = unc; + u8 *unc_end = unc_chunk + unc_size; + u16 chunk_hdr; + + if (cmpr_size < sizeof(short)) + return -EINVAL; + + /* Read chunk header. */ + chunk_hdr = cmpr_chunk[1]; + chunk_hdr <<= 8; + chunk_hdr |= cmpr_chunk[0]; + + /* Loop through decompressing chunks. */ + for (;;) { + size_t chunk_size_saved; + size_t unc_use; + size_t cmpr_use = 3 + (chunk_hdr & (LZNT_CHUNK_SIZE - 1)); + + /* Check that the chunk actually fits the supplied buffer. */ + if (cmpr_chunk + cmpr_use > cmpr_end) + return -EINVAL; + + /* First make sure the chunk contains compressed data. */ + if (chunk_hdr & 0x8000) { + /* Decompress a chunk and return if we get an error. */ + ssize_t err = + decompress_chunk(unc_chunk, unc_end, + cmpr_chunk + sizeof(chunk_hdr), + cmpr_chunk + cmpr_use); + if (err < 0) + return err; + unc_use = err; + } else { + /* This chunk does not contain compressed data. */ + unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end + ? unc_end - unc_chunk + : LZNT_CHUNK_SIZE; + + if (cmpr_chunk + sizeof(chunk_hdr) + unc_use > + cmpr_end) { + return -EINVAL; + } + + memcpy(unc_chunk, cmpr_chunk + sizeof(chunk_hdr), + unc_use); + } + + /* Advance pointers. */ + cmpr_chunk += cmpr_use; + unc_chunk += unc_use; + + /* Check for the end of unc buffer. */ + if (unc_chunk >= unc_end) + break; + + /* Proceed the next chunk. */ + if (cmpr_chunk > cmpr_end - 2) + break; + + chunk_size_saved = LZNT_CHUNK_SIZE; + + /* Read chunk header. */ + chunk_hdr = cmpr_chunk[1]; + chunk_hdr <<= 8; + chunk_hdr |= cmpr_chunk[0]; + + if (!chunk_hdr) + break; + + /* Check the size of unc buffer. */ + if (unc_use < chunk_size_saved) { + size_t t1 = chunk_size_saved - unc_use; + u8 *t2 = unc_chunk + t1; + + /* 'Zero' memory. */ + if (t2 >= unc_end) + break; + + memset(unc_chunk, 0, t1); + unc_chunk = t2; + } + } + + /* Check compression boundary. */ + if (cmpr_chunk > cmpr_end) + return -EINVAL; + + /* + * The unc size is just a difference between current + * pointer and original one. + */ + return PtrOffset(unc, unc_chunk); +} diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c new file mode 100644 index 000000000..a9549e730 --- /dev/null +++ b/fs/ntfs3/namei.c @@ -0,0 +1,395 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/fs.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * fill_name_de - Format NTFS_DE in @buf. + */ +int fill_name_de(struct ntfs_sb_info *sbi, void *buf, const struct qstr *name, + const struct cpu_str *uni) +{ + int err; + struct NTFS_DE *e = buf; + u16 data_size; + struct ATTR_FILE_NAME *fname = (struct ATTR_FILE_NAME *)(e + 1); + +#ifndef CONFIG_NTFS3_64BIT_CLUSTER + e->ref.high = fname->home.high = 0; +#endif + if (uni) { +#ifdef __BIG_ENDIAN + int ulen = uni->len; + __le16 *uname = fname->name; + const u16 *name_cpu = uni->name; + + while (ulen--) + *uname++ = cpu_to_le16(*name_cpu++); +#else + memcpy(fname->name, uni->name, uni->len * sizeof(u16)); +#endif + fname->name_len = uni->len; + + } else { + /* Convert input string to unicode. */ + err = ntfs_nls_to_utf16(sbi, name->name, name->len, + (struct cpu_str *)&fname->name_len, + NTFS_NAME_LEN, UTF16_LITTLE_ENDIAN); + if (err < 0) + return err; + } + + fname->type = FILE_NAME_POSIX; + data_size = fname_full_size(fname); + + e->size = cpu_to_le16(ALIGN(data_size, 8) + sizeof(struct NTFS_DE)); + e->key_size = cpu_to_le16(data_size); + e->flags = 0; + e->res = 0; + + return 0; +} + +/* + * ntfs_lookup - inode_operations::lookup + */ +static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, + u32 flags) +{ + struct ntfs_inode *ni = ntfs_i(dir); + struct cpu_str *uni = __getname(); + struct inode *inode; + int err; + + if (!uni) + inode = ERR_PTR(-ENOMEM); + else { + err = ntfs_nls_to_utf16(ni->mi.sbi, dentry->d_name.name, + dentry->d_name.len, uni, NTFS_NAME_LEN, + UTF16_HOST_ENDIAN); + if (err < 0) + inode = ERR_PTR(err); + else { + ni_lock(ni); + inode = dir_search_u(dir, uni, NULL); + ni_unlock(ni); + } + __putname(uni); + } + + /* + * Check for a null pointer + * If the MFT record of ntfs inode is not a base record, inode->i_op can be NULL. + * This causes null pointer dereference in d_splice_alias(). + */ + if (!IS_ERR_OR_NULL(inode) && !inode->i_op) { + iput(inode); + inode = ERR_PTR(-EINVAL); + } + + return d_splice_alias(inode, dentry); +} + +/* + * ntfs_create - inode_operations::create + */ +static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + struct inode *inode; + + inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode, + 0, NULL, 0, NULL); + + return IS_ERR(inode) ? PTR_ERR(inode) : 0; +} + +/* + * ntfs_mknod + * + * inode_operations::mknod + */ +static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct inode *inode; + + inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev, + NULL, 0, NULL); + + return IS_ERR(inode) ? PTR_ERR(inode) : 0; +} + +/* + * ntfs_link - inode_operations::link + */ +static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de) +{ + int err; + struct inode *inode = d_inode(ode); + struct ntfs_inode *ni = ntfs_i(inode); + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + if (inode->i_nlink >= NTFS_LINK_MAX) + return -EMLINK; + + ni_lock_dir(ntfs_i(dir)); + if (inode != dir) + ni_lock(ni); + + inc_nlink(inode); + ihold(inode); + + err = ntfs_link_inode(inode, de); + + if (!err) { + dir->i_ctime = dir->i_mtime = inode->i_ctime = + current_time(dir); + mark_inode_dirty(inode); + mark_inode_dirty(dir); + d_instantiate(de, inode); + } else { + drop_nlink(inode); + iput(inode); + } + + if (inode != dir) + ni_unlock(ni); + ni_unlock(ntfs_i(dir)); + + return err; +} + +/* + * ntfs_unlink - inode_operations::unlink + */ +static int ntfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ntfs_inode *ni = ntfs_i(dir); + int err; + + ni_lock_dir(ni); + + err = ntfs_unlink_inode(dir, dentry); + + ni_unlock(ni); + + return err; +} + +/* + * ntfs_symlink - inode_operations::symlink + */ +static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) +{ + u32 size = strlen(symname); + struct inode *inode; + + inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777, + 0, symname, size, NULL); + + return IS_ERR(inode) ? PTR_ERR(inode) : 0; +} + +/* + * ntfs_mkdir- inode_operations::mkdir + */ +static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode, + 0, NULL, 0, NULL); + + return IS_ERR(inode) ? PTR_ERR(inode) : 0; +} + +/* + * ntfs_rmdir - inode_operations::rmdir + */ +static int ntfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct ntfs_inode *ni = ntfs_i(dir); + int err; + + ni_lock_dir(ni); + + err = ntfs_unlink_inode(dir, dentry); + + ni_unlock(ni); + + return err; +} + +/* + * ntfs_rename - inode_operations::rename + */ +static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, struct inode *new_dir, + struct dentry *new_dentry, u32 flags) +{ + int err; + struct super_block *sb = dir->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_inode *dir_ni = ntfs_i(dir); + struct ntfs_inode *new_dir_ni = ntfs_i(new_dir); + struct inode *inode = d_inode(dentry); + struct ntfs_inode *ni = ntfs_i(inode); + struct inode *new_inode = d_inode(new_dentry); + struct NTFS_DE *de, *new_de; + bool is_same, is_bad; + /* + * de - memory of PATH_MAX bytes: + * [0-1024) - original name (dentry->d_name) + * [1024-2048) - paired to original name, usually DOS variant of dentry->d_name + * [2048-3072) - new name (new_dentry->d_name) + */ + static_assert(SIZEOF_ATTRIBUTE_FILENAME_MAX + SIZEOF_RESIDENT < 1024); + static_assert(SIZEOF_ATTRIBUTE_FILENAME_MAX + sizeof(struct NTFS_DE) < + 1024); + static_assert(PATH_MAX >= 4 * 1024); + + if (flags & ~RENAME_NOREPLACE) + return -EINVAL; + + is_same = dentry->d_name.len == new_dentry->d_name.len && + !memcmp(dentry->d_name.name, new_dentry->d_name.name, + dentry->d_name.len); + + if (is_same && dir == new_dir) { + /* Nothing to do. */ + return 0; + } + + if (ntfs_is_meta_file(sbi, inode->i_ino)) { + /* Should we print an error? */ + return -EINVAL; + } + + if (new_inode) { + /* Target name exists. Unlink it. */ + dget(new_dentry); + ni_lock_dir(new_dir_ni); + err = ntfs_unlink_inode(new_dir, new_dentry); + ni_unlock(new_dir_ni); + dput(new_dentry); + if (err) + return err; + } + + /* Allocate PATH_MAX bytes. */ + de = __getname(); + if (!de) + return -ENOMEM; + + /* Translate dentry->d_name into unicode form. */ + err = fill_name_de(sbi, de, &dentry->d_name, NULL); + if (err < 0) + goto out; + + if (is_same) { + /* Reuse 'de'. */ + new_de = de; + } else { + /* Translate new_dentry->d_name into unicode form. */ + new_de = Add2Ptr(de, 2048); + err = fill_name_de(sbi, new_de, &new_dentry->d_name, NULL); + if (err < 0) + goto out; + } + + ni_lock_dir(dir_ni); + ni_lock(ni); + + is_bad = false; + err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad); + if (is_bad) { + /* Restore after failed rename failed too. */ + _ntfs_bad_inode(inode); + } else if (!err) { + inode->i_ctime = dir->i_ctime = dir->i_mtime = + current_time(dir); + mark_inode_dirty(inode); + mark_inode_dirty(dir); + if (dir != new_dir) { + new_dir->i_mtime = new_dir->i_ctime = dir->i_ctime; + mark_inode_dirty(new_dir); + } + + if (IS_DIRSYNC(dir)) + ntfs_sync_inode(dir); + + if (IS_DIRSYNC(new_dir)) + ntfs_sync_inode(inode); + } + + ni_unlock(ni); + ni_unlock(dir_ni); +out: + __putname(de); + return err; +} + +struct dentry *ntfs3_get_parent(struct dentry *child) +{ + struct inode *inode = d_inode(child); + struct ntfs_inode *ni = ntfs_i(inode); + + struct ATTR_LIST_ENTRY *le = NULL; + struct ATTRIB *attr = NULL; + struct ATTR_FILE_NAME *fname; + + while ((attr = ni_find_attr(ni, attr, &le, ATTR_NAME, NULL, 0, NULL, + NULL))) { + fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); + if (!fname) + continue; + + return d_obtain_alias( + ntfs_iget5(inode->i_sb, &fname->home, NULL)); + } + + return ERR_PTR(-ENOENT); +} + +// clang-format off +const struct inode_operations ntfs_dir_inode_operations = { + .lookup = ntfs_lookup, + .create = ntfs_create, + .link = ntfs_link, + .unlink = ntfs_unlink, + .symlink = ntfs_symlink, + .mkdir = ntfs_mkdir, + .rmdir = ntfs_rmdir, + .mknod = ntfs_mknod, + .rename = ntfs_rename, + .permission = ntfs_permission, + .get_acl = ntfs_get_acl, + .set_acl = ntfs_set_acl, + .setattr = ntfs3_setattr, + .getattr = ntfs_getattr, + .listxattr = ntfs_listxattr, + .fiemap = ntfs_fiemap, +}; + +const struct inode_operations ntfs_special_inode_operations = { + .setattr = ntfs3_setattr, + .getattr = ntfs_getattr, + .listxattr = ntfs_listxattr, + .get_acl = ntfs_get_acl, + .set_acl = ntfs_set_acl, +}; +// clang-format on diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h new file mode 100644 index 000000000..0f38d5581 --- /dev/null +++ b/fs/ntfs3/ntfs.h @@ -0,0 +1,1221 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * on-disk ntfs structs + */ + +// clang-format off +#ifndef _LINUX_NTFS3_NTFS_H +#define _LINUX_NTFS3_NTFS_H + +#include <linux/blkdev.h> +#include <linux/build_bug.h> +#include <linux/kernel.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "debug.h" + +/* TODO: Check 4K MFT record and 512 bytes cluster. */ + +/* Check each run for marked clusters. */ +#define NTFS3_CHECK_FREE_CLST + +#define NTFS_NAME_LEN 255 + +/* + * ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. + * xfstest generic/041 creates 3003 hardlinks. + */ +#define NTFS_LINK_MAX 4000 + +/* + * Activate to use 64 bit clusters instead of 32 bits in ntfs.sys. + * Logical and virtual cluster number if needed, may be + * redefined to use 64 bit value. + */ +//#define CONFIG_NTFS3_64BIT_CLUSTER + +#define NTFS_LZNT_MAX_CLUSTER 4096 +#define NTFS_LZNT_CUNIT 4 +#define NTFS_LZNT_CLUSTERS (1u<<NTFS_LZNT_CUNIT) + +struct GUID { + __le32 Data1; + __le16 Data2; + __le16 Data3; + u8 Data4[8]; +}; + +/* + * This struct repeats layout of ATTR_FILE_NAME + * at offset 0x40. + * It used to store global constants NAME_MFT/NAME_MIRROR... + * most constant names are shorter than 10. + */ +struct cpu_str { + u8 len; + u8 unused; + u16 name[10]; +}; + +struct le_str { + u8 len; + u8 unused; + __le16 name[]; +}; + +static_assert(SECTOR_SHIFT == 9); + +#ifdef CONFIG_NTFS3_64BIT_CLUSTER +typedef u64 CLST; +static_assert(sizeof(size_t) == 8); +#else +typedef u32 CLST; +#endif + +#define SPARSE_LCN64 ((u64)-1) +#define SPARSE_LCN ((CLST)-1) +#define RESIDENT_LCN ((CLST)-2) +#define COMPRESSED_LCN ((CLST)-3) + +#define COMPRESSION_UNIT 4 +#define COMPRESS_MAX_CLUSTER 0x1000 +#define MFT_INCREASE_CHUNK 1024 + +enum RECORD_NUM { + MFT_REC_MFT = 0, + MFT_REC_MIRR = 1, + MFT_REC_LOG = 2, + MFT_REC_VOL = 3, + MFT_REC_ATTR = 4, + MFT_REC_ROOT = 5, + MFT_REC_BITMAP = 6, + MFT_REC_BOOT = 7, + MFT_REC_BADCLUST = 8, + //MFT_REC_QUOTA = 9, + MFT_REC_SECURE = 9, // NTFS 3.0 + MFT_REC_UPCASE = 10, + MFT_REC_EXTEND = 11, // NTFS 3.0 + MFT_REC_RESERVED = 11, + MFT_REC_FREE = 16, + MFT_REC_USER = 24, +}; + +enum ATTR_TYPE { + ATTR_ZERO = cpu_to_le32(0x00), + ATTR_STD = cpu_to_le32(0x10), + ATTR_LIST = cpu_to_le32(0x20), + ATTR_NAME = cpu_to_le32(0x30), + // ATTR_VOLUME_VERSION on Nt4 + ATTR_ID = cpu_to_le32(0x40), + ATTR_SECURE = cpu_to_le32(0x50), + ATTR_LABEL = cpu_to_le32(0x60), + ATTR_VOL_INFO = cpu_to_le32(0x70), + ATTR_DATA = cpu_to_le32(0x80), + ATTR_ROOT = cpu_to_le32(0x90), + ATTR_ALLOC = cpu_to_le32(0xA0), + ATTR_BITMAP = cpu_to_le32(0xB0), + // ATTR_SYMLINK on Nt4 + ATTR_REPARSE = cpu_to_le32(0xC0), + ATTR_EA_INFO = cpu_to_le32(0xD0), + ATTR_EA = cpu_to_le32(0xE0), + ATTR_PROPERTYSET = cpu_to_le32(0xF0), + ATTR_LOGGED_UTILITY_STREAM = cpu_to_le32(0x100), + ATTR_END = cpu_to_le32(0xFFFFFFFF) +}; + +static_assert(sizeof(enum ATTR_TYPE) == 4); + +enum FILE_ATTRIBUTE { + FILE_ATTRIBUTE_READONLY = cpu_to_le32(0x00000001), + FILE_ATTRIBUTE_HIDDEN = cpu_to_le32(0x00000002), + FILE_ATTRIBUTE_SYSTEM = cpu_to_le32(0x00000004), + FILE_ATTRIBUTE_ARCHIVE = cpu_to_le32(0x00000020), + FILE_ATTRIBUTE_DEVICE = cpu_to_le32(0x00000040), + FILE_ATTRIBUTE_TEMPORARY = cpu_to_le32(0x00000100), + FILE_ATTRIBUTE_SPARSE_FILE = cpu_to_le32(0x00000200), + FILE_ATTRIBUTE_REPARSE_POINT = cpu_to_le32(0x00000400), + FILE_ATTRIBUTE_COMPRESSED = cpu_to_le32(0x00000800), + FILE_ATTRIBUTE_OFFLINE = cpu_to_le32(0x00001000), + FILE_ATTRIBUTE_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), + FILE_ATTRIBUTE_ENCRYPTED = cpu_to_le32(0x00004000), + FILE_ATTRIBUTE_VALID_FLAGS = cpu_to_le32(0x00007fb7), + FILE_ATTRIBUTE_DIRECTORY = cpu_to_le32(0x10000000), +}; + +static_assert(sizeof(enum FILE_ATTRIBUTE) == 4); + +extern const struct cpu_str NAME_MFT; +extern const struct cpu_str NAME_MIRROR; +extern const struct cpu_str NAME_LOGFILE; +extern const struct cpu_str NAME_VOLUME; +extern const struct cpu_str NAME_ATTRDEF; +extern const struct cpu_str NAME_ROOT; +extern const struct cpu_str NAME_BITMAP; +extern const struct cpu_str NAME_BOOT; +extern const struct cpu_str NAME_BADCLUS; +extern const struct cpu_str NAME_QUOTA; +extern const struct cpu_str NAME_SECURE; +extern const struct cpu_str NAME_UPCASE; +extern const struct cpu_str NAME_EXTEND; +extern const struct cpu_str NAME_OBJID; +extern const struct cpu_str NAME_REPARSE; +extern const struct cpu_str NAME_USNJRNL; + +extern const __le16 I30_NAME[4]; +extern const __le16 SII_NAME[4]; +extern const __le16 SDH_NAME[4]; +extern const __le16 SO_NAME[2]; +extern const __le16 SQ_NAME[2]; +extern const __le16 SR_NAME[2]; + +extern const __le16 BAD_NAME[4]; +extern const __le16 SDS_NAME[4]; +extern const __le16 WOF_NAME[17]; /* WofCompressedData */ + +/* MFT record number structure. */ +struct MFT_REF { + __le32 low; // The low part of the number. + __le16 high; // The high part of the number. + __le16 seq; // The sequence number of MFT record. +}; + +static_assert(sizeof(__le64) == sizeof(struct MFT_REF)); + +static inline CLST ino_get(const struct MFT_REF *ref) +{ +#ifdef CONFIG_NTFS3_64BIT_CLUSTER + return le32_to_cpu(ref->low) | ((u64)le16_to_cpu(ref->high) << 32); +#else + return le32_to_cpu(ref->low); +#endif +} + +struct NTFS_BOOT { + u8 jump_code[3]; // 0x00: Jump to boot code. + u8 system_id[8]; // 0x03: System ID, equals "NTFS " + + // NOTE: This member is not aligned(!) + // bytes_per_sector[0] must be 0. + // bytes_per_sector[1] must be multiplied by 256. + u8 bytes_per_sector[2]; // 0x0B: Bytes per sector. + + u8 sectors_per_clusters;// 0x0D: Sectors per cluster. + u8 unused1[7]; + u8 media_type; // 0x15: Media type (0xF8 - harddisk) + u8 unused2[2]; + __le16 sct_per_track; // 0x18: number of sectors per track. + __le16 heads; // 0x1A: number of heads per cylinder. + __le32 hidden_sectors; // 0x1C: number of 'hidden' sectors. + u8 unused3[4]; + u8 bios_drive_num; // 0x24: BIOS drive number =0x80. + u8 unused4; + u8 signature_ex; // 0x26: Extended BOOT signature =0x80. + u8 unused5; + __le64 sectors_per_volume;// 0x28: Size of volume in sectors. + __le64 mft_clst; // 0x30: First cluster of $MFT + __le64 mft2_clst; // 0x38: First cluster of $MFTMirr + s8 record_size; // 0x40: Size of MFT record in clusters(sectors). + u8 unused6[3]; + s8 index_size; // 0x44: Size of INDX record in clusters(sectors). + u8 unused7[3]; + __le64 serial_num; // 0x48: Volume serial number + __le32 check_sum; // 0x50: Simple additive checksum of all + // of the u32's which precede the 'check_sum'. + + u8 boot_code[0x200 - 0x50 - 2 - 4]; // 0x54: + u8 boot_magic[2]; // 0x1FE: Boot signature =0x55 + 0xAA +}; + +static_assert(sizeof(struct NTFS_BOOT) == 0x200); + +enum NTFS_SIGNATURE { + NTFS_FILE_SIGNATURE = cpu_to_le32(0x454C4946), // 'FILE' + NTFS_INDX_SIGNATURE = cpu_to_le32(0x58444E49), // 'INDX' + NTFS_CHKD_SIGNATURE = cpu_to_le32(0x444B4843), // 'CHKD' + NTFS_RSTR_SIGNATURE = cpu_to_le32(0x52545352), // 'RSTR' + NTFS_RCRD_SIGNATURE = cpu_to_le32(0x44524352), // 'RCRD' + NTFS_BAAD_SIGNATURE = cpu_to_le32(0x44414142), // 'BAAD' + NTFS_HOLE_SIGNATURE = cpu_to_le32(0x454C4F48), // 'HOLE' + NTFS_FFFF_SIGNATURE = cpu_to_le32(0xffffffff), +}; + +static_assert(sizeof(enum NTFS_SIGNATURE) == 4); + +/* MFT Record header structure. */ +struct NTFS_RECORD_HEADER { + /* Record magic number, equals 'FILE'/'INDX'/'RSTR'/'RCRD'. */ + enum NTFS_SIGNATURE sign; // 0x00: + __le16 fix_off; // 0x04: + __le16 fix_num; // 0x06: + __le64 lsn; // 0x08: Log file sequence number, +}; + +static_assert(sizeof(struct NTFS_RECORD_HEADER) == 0x10); + +static inline int is_baad(const struct NTFS_RECORD_HEADER *hdr) +{ + return hdr->sign == NTFS_BAAD_SIGNATURE; +} + +/* Possible bits in struct MFT_REC.flags. */ +enum RECORD_FLAG { + RECORD_FLAG_IN_USE = cpu_to_le16(0x0001), + RECORD_FLAG_DIR = cpu_to_le16(0x0002), + RECORD_FLAG_SYSTEM = cpu_to_le16(0x0004), + RECORD_FLAG_UNKNOWN = cpu_to_le16(0x0008), +}; + +/* MFT Record structure. */ +struct MFT_REC { + struct NTFS_RECORD_HEADER rhdr; // 'FILE' + + __le16 seq; // 0x10: Sequence number for this record. + __le16 hard_links; // 0x12: The number of hard links to record. + __le16 attr_off; // 0x14: Offset to attributes. + __le16 flags; // 0x16: See RECORD_FLAG. + __le32 used; // 0x18: The size of used part. + __le32 total; // 0x1C: Total record size. + + struct MFT_REF parent_ref; // 0x20: Parent MFT record. + __le16 next_attr_id; // 0x28: The next attribute Id. + + __le16 res; // 0x2A: High part of MFT record? + __le32 mft_record; // 0x2C: Current MFT record number. + __le16 fixups[]; // 0x30: +}; + +#define MFTRECORD_FIXUP_OFFSET_1 offsetof(struct MFT_REC, res) +#define MFTRECORD_FIXUP_OFFSET_3 offsetof(struct MFT_REC, fixups) + +static_assert(MFTRECORD_FIXUP_OFFSET_1 == 0x2A); +static_assert(MFTRECORD_FIXUP_OFFSET_3 == 0x30); + +static inline bool is_rec_base(const struct MFT_REC *rec) +{ + const struct MFT_REF *r = &rec->parent_ref; + + return !r->low && !r->high && !r->seq; +} + +static inline bool is_mft_rec5(const struct MFT_REC *rec) +{ + return le16_to_cpu(rec->rhdr.fix_off) >= + offsetof(struct MFT_REC, fixups); +} + +static inline bool is_rec_inuse(const struct MFT_REC *rec) +{ + return rec->flags & RECORD_FLAG_IN_USE; +} + +static inline bool clear_rec_inuse(struct MFT_REC *rec) +{ + return rec->flags &= ~RECORD_FLAG_IN_USE; +} + +/* Possible values of ATTR_RESIDENT.flags */ +#define RESIDENT_FLAG_INDEXED 0x01 + +struct ATTR_RESIDENT { + __le32 data_size; // 0x10: The size of data. + __le16 data_off; // 0x14: Offset to data. + u8 flags; // 0x16: Resident flags ( 1 - indexed ). + u8 res; // 0x17: +}; // sizeof() = 0x18 + +struct ATTR_NONRESIDENT { + __le64 svcn; // 0x10: Starting VCN of this segment. + __le64 evcn; // 0x18: End VCN of this segment. + __le16 run_off; // 0x20: Offset to packed runs. + // Unit of Compression size for this stream, expressed + // as a log of the cluster size. + // + // 0 means file is not compressed + // 1, 2, 3, and 4 are potentially legal values if the + // stream is compressed, however the implementation + // may only choose to use 4, or possibly 3. Note + // that 4 means cluster size time 16. If convenient + // the implementation may wish to accept a + // reasonable range of legal values here (1-5?), + // even if the implementation only generates + // a smaller set of values itself. + u8 c_unit; // 0x22: + u8 res1[5]; // 0x23: + __le64 alloc_size; // 0x28: The allocated size of attribute in bytes. + // (multiple of cluster size) + __le64 data_size; // 0x30: The size of attribute in bytes <= alloc_size. + __le64 valid_size; // 0x38: The size of valid part in bytes <= data_size. + __le64 total_size; // 0x40: The sum of the allocated clusters for a file. + // (present only for the first segment (0 == vcn) + // of compressed attribute) + +}; // sizeof()=0x40 or 0x48 (if compressed) + +/* Possible values of ATTRIB.flags: */ +#define ATTR_FLAG_COMPRESSED cpu_to_le16(0x0001) +#define ATTR_FLAG_COMPRESSED_MASK cpu_to_le16(0x00FF) +#define ATTR_FLAG_ENCRYPTED cpu_to_le16(0x4000) +#define ATTR_FLAG_SPARSED cpu_to_le16(0x8000) + +struct ATTRIB { + enum ATTR_TYPE type; // 0x00: The type of this attribute. + __le32 size; // 0x04: The size of this attribute. + u8 non_res; // 0x08: Is this attribute non-resident? + u8 name_len; // 0x09: This attribute name length. + __le16 name_off; // 0x0A: Offset to the attribute name. + __le16 flags; // 0x0C: See ATTR_FLAG_XXX. + __le16 id; // 0x0E: Unique id (per record). + + union { + struct ATTR_RESIDENT res; // 0x10 + struct ATTR_NONRESIDENT nres; // 0x10 + }; +}; + +/* Define attribute sizes. */ +#define SIZEOF_RESIDENT 0x18 +#define SIZEOF_NONRESIDENT_EX 0x48 +#define SIZEOF_NONRESIDENT 0x40 + +#define SIZEOF_RESIDENT_LE cpu_to_le16(0x18) +#define SIZEOF_NONRESIDENT_EX_LE cpu_to_le16(0x48) +#define SIZEOF_NONRESIDENT_LE cpu_to_le16(0x40) + +static inline u64 attr_ondisk_size(const struct ATTRIB *attr) +{ + return attr->non_res ? ((attr->flags & + (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ? + le64_to_cpu(attr->nres.total_size) : + le64_to_cpu(attr->nres.alloc_size)) + : ALIGN(le32_to_cpu(attr->res.data_size), 8); +} + +static inline u64 attr_size(const struct ATTRIB *attr) +{ + return attr->non_res ? le64_to_cpu(attr->nres.data_size) : + le32_to_cpu(attr->res.data_size); +} + +static inline bool is_attr_encrypted(const struct ATTRIB *attr) +{ + return attr->flags & ATTR_FLAG_ENCRYPTED; +} + +static inline bool is_attr_sparsed(const struct ATTRIB *attr) +{ + return attr->flags & ATTR_FLAG_SPARSED; +} + +static inline bool is_attr_compressed(const struct ATTRIB *attr) +{ + return attr->flags & ATTR_FLAG_COMPRESSED; +} + +static inline bool is_attr_ext(const struct ATTRIB *attr) +{ + return attr->flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED); +} + +static inline bool is_attr_indexed(const struct ATTRIB *attr) +{ + return !attr->non_res && (attr->res.flags & RESIDENT_FLAG_INDEXED); +} + +static inline __le16 const *attr_name(const struct ATTRIB *attr) +{ + return Add2Ptr(attr, le16_to_cpu(attr->name_off)); +} + +static inline u64 attr_svcn(const struct ATTRIB *attr) +{ + return attr->non_res ? le64_to_cpu(attr->nres.svcn) : 0; +} + +static_assert(sizeof(struct ATTRIB) == 0x48); +static_assert(sizeof(((struct ATTRIB *)NULL)->res) == 0x08); +static_assert(sizeof(((struct ATTRIB *)NULL)->nres) == 0x38); + +static inline void *resident_data_ex(const struct ATTRIB *attr, u32 datasize) +{ + u32 asize, rsize; + u16 off; + + if (attr->non_res) + return NULL; + + asize = le32_to_cpu(attr->size); + off = le16_to_cpu(attr->res.data_off); + + if (asize < datasize + off) + return NULL; + + rsize = le32_to_cpu(attr->res.data_size); + if (rsize < datasize) + return NULL; + + return Add2Ptr(attr, off); +} + +static inline void *resident_data(const struct ATTRIB *attr) +{ + return Add2Ptr(attr, le16_to_cpu(attr->res.data_off)); +} + +static inline void *attr_run(const struct ATTRIB *attr) +{ + return Add2Ptr(attr, le16_to_cpu(attr->nres.run_off)); +} + +/* Standard information attribute (0x10). */ +struct ATTR_STD_INFO { + __le64 cr_time; // 0x00: File creation file. + __le64 m_time; // 0x08: File modification time. + __le64 c_time; // 0x10: Last time any attribute was modified. + __le64 a_time; // 0x18: File last access time. + enum FILE_ATTRIBUTE fa; // 0x20: Standard DOS attributes & more. + __le32 max_ver_num; // 0x24: Maximum Number of Versions. + __le32 ver_num; // 0x28: Version Number. + __le32 class_id; // 0x2C: Class Id from bidirectional Class Id index. +}; + +static_assert(sizeof(struct ATTR_STD_INFO) == 0x30); + +#define SECURITY_ID_INVALID 0x00000000 +#define SECURITY_ID_FIRST 0x00000100 + +struct ATTR_STD_INFO5 { + __le64 cr_time; // 0x00: File creation file. + __le64 m_time; // 0x08: File modification time. + __le64 c_time; // 0x10: Last time any attribute was modified. + __le64 a_time; // 0x18: File last access time. + enum FILE_ATTRIBUTE fa; // 0x20: Standard DOS attributes & more. + __le32 max_ver_num; // 0x24: Maximum Number of Versions. + __le32 ver_num; // 0x28: Version Number. + __le32 class_id; // 0x2C: Class Id from bidirectional Class Id index. + + __le32 owner_id; // 0x30: Owner Id of the user owning the file. + __le32 security_id; // 0x34: The Security Id is a key in the $SII Index and $SDS. + __le64 quota_charge; // 0x38: + __le64 usn; // 0x40: Last Update Sequence Number of the file. This is a direct + // index into the file $UsnJrnl. If zero, the USN Journal is + // disabled. +}; + +static_assert(sizeof(struct ATTR_STD_INFO5) == 0x48); + +/* Attribute list entry structure (0x20) */ +struct ATTR_LIST_ENTRY { + enum ATTR_TYPE type; // 0x00: The type of attribute. + __le16 size; // 0x04: The size of this record. + u8 name_len; // 0x06: The length of attribute name. + u8 name_off; // 0x07: The offset to attribute name. + __le64 vcn; // 0x08: Starting VCN of this attribute. + struct MFT_REF ref; // 0x10: MFT record number with attribute. + __le16 id; // 0x18: struct ATTRIB ID. + __le16 name[3]; // 0x1A: Just to align. To get real name can use bNameOffset. + +}; // sizeof(0x20) + +static_assert(sizeof(struct ATTR_LIST_ENTRY) == 0x20); + +static inline u32 le_size(u8 name_len) +{ + return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) + + name_len * sizeof(short), 8); +} + +/* Returns 0 if 'attr' has the same type and name. */ +static inline int le_cmp(const struct ATTR_LIST_ENTRY *le, + const struct ATTRIB *attr) +{ + return le->type != attr->type || le->name_len != attr->name_len || + (!le->name_len && + memcmp(Add2Ptr(le, le->name_off), + Add2Ptr(attr, le16_to_cpu(attr->name_off)), + le->name_len * sizeof(short))); +} + +static inline __le16 const *le_name(const struct ATTR_LIST_ENTRY *le) +{ + return Add2Ptr(le, le->name_off); +} + +/* File name types (the field type in struct ATTR_FILE_NAME). */ +#define FILE_NAME_POSIX 0 +#define FILE_NAME_UNICODE 1 +#define FILE_NAME_DOS 2 +#define FILE_NAME_UNICODE_AND_DOS (FILE_NAME_DOS | FILE_NAME_UNICODE) + +/* Filename attribute structure (0x30). */ +struct NTFS_DUP_INFO { + __le64 cr_time; // 0x00: File creation file. + __le64 m_time; // 0x08: File modification time. + __le64 c_time; // 0x10: Last time any attribute was modified. + __le64 a_time; // 0x18: File last access time. + __le64 alloc_size; // 0x20: Data attribute allocated size, multiple of cluster size. + __le64 data_size; // 0x28: Data attribute size <= Dataalloc_size. + enum FILE_ATTRIBUTE fa; // 0x30: Standard DOS attributes & more. + __le16 ea_size; // 0x34: Packed EAs. + __le16 reparse; // 0x36: Used by Reparse. + +}; // 0x38 + +struct ATTR_FILE_NAME { + struct MFT_REF home; // 0x00: MFT record for directory. + struct NTFS_DUP_INFO dup;// 0x08: + u8 name_len; // 0x40: File name length in words. + u8 type; // 0x41: File name type. + __le16 name[]; // 0x42: File name. +}; + +static_assert(sizeof(((struct ATTR_FILE_NAME *)NULL)->dup) == 0x38); +static_assert(offsetof(struct ATTR_FILE_NAME, name) == 0x42); +#define SIZEOF_ATTRIBUTE_FILENAME 0x44 +#define SIZEOF_ATTRIBUTE_FILENAME_MAX (0x42 + 255 * 2) + +static inline struct ATTRIB *attr_from_name(struct ATTR_FILE_NAME *fname) +{ + return (struct ATTRIB *)((char *)fname - SIZEOF_RESIDENT); +} + +static inline u16 fname_full_size(const struct ATTR_FILE_NAME *fname) +{ + /* Don't return struct_size(fname, name, fname->name_len); */ + return offsetof(struct ATTR_FILE_NAME, name) + + fname->name_len * sizeof(short); +} + +static inline u8 paired_name(u8 type) +{ + if (type == FILE_NAME_UNICODE) + return FILE_NAME_DOS; + if (type == FILE_NAME_DOS) + return FILE_NAME_UNICODE; + return FILE_NAME_POSIX; +} + +/* Index entry defines ( the field flags in NtfsDirEntry ). */ +#define NTFS_IE_HAS_SUBNODES cpu_to_le16(1) +#define NTFS_IE_LAST cpu_to_le16(2) + +/* Directory entry structure. */ +struct NTFS_DE { + union { + struct MFT_REF ref; // 0x00: MFT record number with this file. + struct { + __le16 data_off; // 0x00: + __le16 data_size; // 0x02: + __le32 res; // 0x04: Must be 0. + } view; + }; + __le16 size; // 0x08: The size of this entry. + __le16 key_size; // 0x0A: The size of File name length in bytes + 0x42. + __le16 flags; // 0x0C: Entry flags: NTFS_IE_XXX. + __le16 res; // 0x0E: + + // Here any indexed attribute can be placed. + // One of them is: + // struct ATTR_FILE_NAME AttrFileName; + // + + // The last 8 bytes of this structure contains + // the VBN of subnode. + // !!! Note !!! + // This field is presented only if (flags & NTFS_IE_HAS_SUBNODES) + // __le64 vbn; +}; + +static_assert(sizeof(struct NTFS_DE) == 0x10); + +static inline void de_set_vbn_le(struct NTFS_DE *e, __le64 vcn) +{ + __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); + + *v = vcn; +} + +static inline void de_set_vbn(struct NTFS_DE *e, CLST vcn) +{ + __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); + + *v = cpu_to_le64(vcn); +} + +static inline __le64 de_get_vbn_le(const struct NTFS_DE *e) +{ + return *(__le64 *)Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); +} + +static inline CLST de_get_vbn(const struct NTFS_DE *e) +{ + __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); + + return le64_to_cpu(*v); +} + +static inline struct NTFS_DE *de_get_next(const struct NTFS_DE *e) +{ + return Add2Ptr(e, le16_to_cpu(e->size)); +} + +static inline struct ATTR_FILE_NAME *de_get_fname(const struct NTFS_DE *e) +{ + return le16_to_cpu(e->key_size) >= SIZEOF_ATTRIBUTE_FILENAME ? + Add2Ptr(e, sizeof(struct NTFS_DE)) : + NULL; +} + +static inline bool de_is_last(const struct NTFS_DE *e) +{ + return e->flags & NTFS_IE_LAST; +} + +static inline bool de_has_vcn(const struct NTFS_DE *e) +{ + return e->flags & NTFS_IE_HAS_SUBNODES; +} + +static inline bool de_has_vcn_ex(const struct NTFS_DE *e) +{ + return (e->flags & NTFS_IE_HAS_SUBNODES) && + (u64)(-1) != *((u64 *)Add2Ptr(e, le16_to_cpu(e->size) - + sizeof(__le64))); +} + +#define MAX_BYTES_PER_NAME_ENTRY \ + ALIGN(sizeof(struct NTFS_DE) + \ + offsetof(struct ATTR_FILE_NAME, name) + \ + NTFS_NAME_LEN * sizeof(short), 8) + +struct INDEX_HDR { + __le32 de_off; // 0x00: The offset from the start of this structure + // to the first NTFS_DE. + __le32 used; // 0x04: The size of this structure plus all + // entries (quad-word aligned). + __le32 total; // 0x08: The allocated size of for this structure plus all entries. + u8 flags; // 0x0C: 0x00 = Small directory, 0x01 = Large directory. + u8 res[3]; + + // + // de_off + used <= total + // +}; + +static_assert(sizeof(struct INDEX_HDR) == 0x10); + +static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr) +{ + u32 de_off = le32_to_cpu(hdr->de_off); + u32 used = le32_to_cpu(hdr->used); + struct NTFS_DE *e = Add2Ptr(hdr, de_off); + u16 esize; + + if (de_off >= used || de_off >= le32_to_cpu(hdr->total)) + return NULL; + + esize = le16_to_cpu(e->size); + if (esize < sizeof(struct NTFS_DE) || de_off + esize > used) + return NULL; + + return e; +} + +static inline struct NTFS_DE *hdr_next_de(const struct INDEX_HDR *hdr, + const struct NTFS_DE *e) +{ + size_t off = PtrOffset(hdr, e); + u32 used = le32_to_cpu(hdr->used); + u16 esize; + + if (off >= used) + return NULL; + + esize = le16_to_cpu(e->size); + + if (esize < sizeof(struct NTFS_DE) || + off + esize + sizeof(struct NTFS_DE) > used) + return NULL; + + return Add2Ptr(e, esize); +} + +static inline bool hdr_has_subnode(const struct INDEX_HDR *hdr) +{ + return hdr->flags & 1; +} + +struct INDEX_BUFFER { + struct NTFS_RECORD_HEADER rhdr; // 'INDX' + __le64 vbn; // 0x10: vcn if index >= cluster or vsn id index < cluster + struct INDEX_HDR ihdr; // 0x18: +}; + +static_assert(sizeof(struct INDEX_BUFFER) == 0x28); + +static inline bool ib_is_empty(const struct INDEX_BUFFER *ib) +{ + const struct NTFS_DE *first = hdr_first_de(&ib->ihdr); + + return !first || de_is_last(first); +} + +static inline bool ib_is_leaf(const struct INDEX_BUFFER *ib) +{ + return !(ib->ihdr.flags & 1); +} + +/* Index root structure ( 0x90 ). */ +enum COLLATION_RULE { + NTFS_COLLATION_TYPE_BINARY = cpu_to_le32(0), + // $I30 + NTFS_COLLATION_TYPE_FILENAME = cpu_to_le32(0x01), + // $SII of $Secure and $Q of Quota + NTFS_COLLATION_TYPE_UINT = cpu_to_le32(0x10), + // $O of Quota + NTFS_COLLATION_TYPE_SID = cpu_to_le32(0x11), + // $SDH of $Secure + NTFS_COLLATION_TYPE_SECURITY_HASH = cpu_to_le32(0x12), + // $O of ObjId and "$R" for Reparse + NTFS_COLLATION_TYPE_UINTS = cpu_to_le32(0x13) +}; + +static_assert(sizeof(enum COLLATION_RULE) == 4); + +// +struct INDEX_ROOT { + enum ATTR_TYPE type; // 0x00: The type of attribute to index on. + enum COLLATION_RULE rule; // 0x04: The rule. + __le32 index_block_size;// 0x08: The size of index record. + u8 index_block_clst; // 0x0C: The number of clusters or sectors per index. + u8 res[3]; + struct INDEX_HDR ihdr; // 0x10: +}; + +static_assert(sizeof(struct INDEX_ROOT) == 0x20); +static_assert(offsetof(struct INDEX_ROOT, ihdr) == 0x10); + +#define VOLUME_FLAG_DIRTY cpu_to_le16(0x0001) +#define VOLUME_FLAG_RESIZE_LOG_FILE cpu_to_le16(0x0002) + +struct VOLUME_INFO { + __le64 res1; // 0x00 + u8 major_ver; // 0x08: NTFS major version number (before .) + u8 minor_ver; // 0x09: NTFS minor version number (after .) + __le16 flags; // 0x0A: Volume flags, see VOLUME_FLAG_XXX + +}; // sizeof=0xC + +#define SIZEOF_ATTRIBUTE_VOLUME_INFO 0xc + +#define NTFS_LABEL_MAX_LENGTH (0x100 / sizeof(short)) +#define NTFS_ATTR_INDEXABLE cpu_to_le32(0x00000002) +#define NTFS_ATTR_DUPALLOWED cpu_to_le32(0x00000004) +#define NTFS_ATTR_MUST_BE_INDEXED cpu_to_le32(0x00000010) +#define NTFS_ATTR_MUST_BE_NAMED cpu_to_le32(0x00000020) +#define NTFS_ATTR_MUST_BE_RESIDENT cpu_to_le32(0x00000040) +#define NTFS_ATTR_LOG_ALWAYS cpu_to_le32(0x00000080) + +/* $AttrDef file entry. */ +struct ATTR_DEF_ENTRY { + __le16 name[0x40]; // 0x00: Attr name. + enum ATTR_TYPE type; // 0x80: struct ATTRIB type. + __le32 res; // 0x84: + enum COLLATION_RULE rule; // 0x88: + __le32 flags; // 0x8C: NTFS_ATTR_XXX (see above). + __le64 min_sz; // 0x90: Minimum attribute data size. + __le64 max_sz; // 0x98: Maximum attribute data size. +}; + +static_assert(sizeof(struct ATTR_DEF_ENTRY) == 0xa0); + +/* Object ID (0x40) */ +struct OBJECT_ID { + struct GUID ObjId; // 0x00: Unique Id assigned to file. + struct GUID BirthVolumeId; // 0x10: Birth Volume Id is the Object Id of the Volume on. + // which the Object Id was allocated. It never changes. + struct GUID BirthObjectId; // 0x20: Birth Object Id is the first Object Id that was + // ever assigned to this MFT Record. I.e. If the Object Id + // is changed for some reason, this field will reflect the + // original value of the Object Id. + struct GUID DomainId; // 0x30: Domain Id is currently unused but it is intended to be + // used in a network environment where the local machine is + // part of a Windows 2000 Domain. This may be used in a Windows + // 2000 Advanced Server managed domain. +}; + +static_assert(sizeof(struct OBJECT_ID) == 0x40); + +/* O Directory entry structure ( rule = 0x13 ) */ +struct NTFS_DE_O { + struct NTFS_DE de; + struct GUID ObjId; // 0x10: Unique Id assigned to file. + struct MFT_REF ref; // 0x20: MFT record number with this file. + struct GUID BirthVolumeId; // 0x28: Birth Volume Id is the Object Id of the Volume on + // which the Object Id was allocated. It never changes. + struct GUID BirthObjectId; // 0x38: Birth Object Id is the first Object Id that was + // ever assigned to this MFT Record. I.e. If the Object Id + // is changed for some reason, this field will reflect the + // original value of the Object Id. + // This field is valid if data_size == 0x48. + struct GUID BirthDomainId; // 0x48: Domain Id is currently unused but it is intended + // to be used in a network environment where the local + // machine is part of a Windows 2000 Domain. This may be + // used in a Windows 2000 Advanced Server managed domain. +}; + +static_assert(sizeof(struct NTFS_DE_O) == 0x58); + +#define NTFS_OBJECT_ENTRY_DATA_SIZE1 \ + 0x38 // struct NTFS_DE_O.BirthDomainId is not used +#define NTFS_OBJECT_ENTRY_DATA_SIZE2 \ + 0x48 // struct NTFS_DE_O.BirthDomainId is used + +/* Q Directory entry structure ( rule = 0x11 ) */ +struct NTFS_DE_Q { + struct NTFS_DE de; + __le32 owner_id; // 0x10: Unique Id assigned to file + __le32 Version; // 0x14: 0x02 + __le32 flags2; // 0x18: Quota flags, see above + __le64 BytesUsed; // 0x1C: + __le64 ChangeTime; // 0x24: + __le64 WarningLimit; // 0x28: + __le64 HardLimit; // 0x34: + __le64 ExceededTime; // 0x3C: + + // SID is placed here +}; // sizeof() = 0x44 + +#define SIZEOF_NTFS_DE_Q 0x44 + +#define SecurityDescriptorsBlockSize 0x40000 // 256K +#define SecurityDescriptorMaxSize 0x20000 // 128K +#define Log2OfSecurityDescriptorsBlockSize 18 + +struct SECURITY_KEY { + __le32 hash; // Hash value for descriptor + __le32 sec_id; // Security Id (guaranteed unique) +}; + +/* Security descriptors (the content of $Secure::SDS data stream) */ +struct SECURITY_HDR { + struct SECURITY_KEY key; // 0x00: Security Key. + __le64 off; // 0x08: Offset of this entry in the file. + __le32 size; // 0x10: Size of this entry, 8 byte aligned. + /* + * Security descriptor itself is placed here. + * Total size is 16 byte aligned. + */ +} __packed; + +#define SIZEOF_SECURITY_HDR 0x14 + +/* SII Directory entry structure */ +struct NTFS_DE_SII { + struct NTFS_DE de; + __le32 sec_id; // 0x10: Key: sizeof(security_id) = wKeySize + struct SECURITY_HDR sec_hdr; // 0x14: +} __packed; + +#define SIZEOF_SII_DIRENTRY 0x28 + +/* SDH Directory entry structure */ +struct NTFS_DE_SDH { + struct NTFS_DE de; + struct SECURITY_KEY key; // 0x10: Key + struct SECURITY_HDR sec_hdr; // 0x18: Data + __le16 magic[2]; // 0x2C: 0x00490049 "I I" +}; + +#define SIZEOF_SDH_DIRENTRY 0x30 + +struct REPARSE_KEY { + __le32 ReparseTag; // 0x00: Reparse Tag + struct MFT_REF ref; // 0x04: MFT record number with this file +}; // sizeof() = 0x0C + +static_assert(offsetof(struct REPARSE_KEY, ref) == 0x04); +#define SIZEOF_REPARSE_KEY 0x0C + +/* Reparse Directory entry structure */ +struct NTFS_DE_R { + struct NTFS_DE de; + struct REPARSE_KEY key; // 0x10: Reparse Key. + u32 zero; // 0x1c: +}; // sizeof() = 0x20 + +static_assert(sizeof(struct NTFS_DE_R) == 0x20); + +/* CompressReparseBuffer.WofVersion */ +#define WOF_CURRENT_VERSION cpu_to_le32(1) +/* CompressReparseBuffer.WofProvider */ +#define WOF_PROVIDER_WIM cpu_to_le32(1) +/* CompressReparseBuffer.WofProvider */ +#define WOF_PROVIDER_SYSTEM cpu_to_le32(2) +/* CompressReparseBuffer.ProviderVer */ +#define WOF_PROVIDER_CURRENT_VERSION cpu_to_le32(1) + +#define WOF_COMPRESSION_XPRESS4K cpu_to_le32(0) // 4k +#define WOF_COMPRESSION_LZX32K cpu_to_le32(1) // 32k +#define WOF_COMPRESSION_XPRESS8K cpu_to_le32(2) // 8k +#define WOF_COMPRESSION_XPRESS16K cpu_to_le32(3) // 16k + +/* + * ATTR_REPARSE (0xC0) + * + * The reparse struct GUID structure is used by all 3rd party layered drivers to + * store data in a reparse point. For non-Microsoft tags, The struct GUID field + * cannot be GUID_NULL. + * The constraints on reparse tags are defined below. + * Microsoft tags can also be used with this format of the reparse point buffer. + */ +struct REPARSE_POINT { + __le32 ReparseTag; // 0x00: + __le16 ReparseDataLength;// 0x04: + __le16 Reserved; + + struct GUID Guid; // 0x08: + + // + // Here GenericReparseBuffer is placed + // +}; + +static_assert(sizeof(struct REPARSE_POINT) == 0x18); + +/* Maximum allowed size of the reparse data. */ +#define MAXIMUM_REPARSE_DATA_BUFFER_SIZE (16 * 1024) + +/* + * The value of the following constant needs to satisfy the following + * conditions: + * (1) Be at least as large as the largest of the reserved tags. + * (2) Be strictly smaller than all the tags in use. + */ +#define IO_REPARSE_TAG_RESERVED_RANGE 1 + +/* + * The reparse tags are a ULONG. The 32 bits are laid out as follows: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-----------------------+-------------------------------+ + * |M|R|N|R| Reserved bits | Reparse Tag Value | + * +-+-+-+-+-----------------------+-------------------------------+ + * + * M is the Microsoft bit. When set to 1, it denotes a tag owned by Microsoft. + * All ISVs must use a tag with a 0 in this position. + * Note: If a Microsoft tag is used by non-Microsoft software, the + * behavior is not defined. + * + * R is reserved. Must be zero for non-Microsoft tags. + * + * N is name surrogate. When set to 1, the file represents another named + * entity in the system. + * + * The M and N bits are OR-able. + * The following macros check for the M and N bit values: + */ + +/* + * Macro to determine whether a reparse point tag corresponds to a tag + * owned by Microsoft. + */ +#define IsReparseTagMicrosoft(_tag) (((_tag)&IO_REPARSE_TAG_MICROSOFT)) + +/* Macro to determine whether a reparse point tag is a name surrogate. */ +#define IsReparseTagNameSurrogate(_tag) (((_tag)&IO_REPARSE_TAG_NAME_SURROGATE)) + +/* + * The following constant represents the bits that are valid to use in + * reparse tags. + */ +#define IO_REPARSE_TAG_VALID_VALUES 0xF000FFFF + +/* + * Macro to determine whether a reparse tag is a valid tag. + */ +#define IsReparseTagValid(_tag) \ + (!((_tag) & ~IO_REPARSE_TAG_VALID_VALUES) && \ + ((_tag) > IO_REPARSE_TAG_RESERVED_RANGE)) + +/* Microsoft tags for reparse points. */ + +enum IO_REPARSE_TAG { + IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0), + IO_REPARSE_TAG_NAME_SURROGATE = cpu_to_le32(0x20000000), + IO_REPARSE_TAG_MICROSOFT = cpu_to_le32(0x80000000), + IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0xA0000003), + IO_REPARSE_TAG_SYMLINK = cpu_to_le32(0xA000000C), + IO_REPARSE_TAG_HSM = cpu_to_le32(0xC0000004), + IO_REPARSE_TAG_SIS = cpu_to_le32(0x80000007), + IO_REPARSE_TAG_DEDUP = cpu_to_le32(0x80000013), + IO_REPARSE_TAG_COMPRESS = cpu_to_le32(0x80000017), + + /* + * The reparse tag 0x80000008 is reserved for Microsoft internal use. + * May be published in the future. + */ + + /* Microsoft reparse tag reserved for DFS */ + IO_REPARSE_TAG_DFS = cpu_to_le32(0x8000000A), + + /* Microsoft reparse tag reserved for the file system filter manager. */ + IO_REPARSE_TAG_FILTER_MANAGER = cpu_to_le32(0x8000000B), + + /* Non-Microsoft tags for reparse points */ + + /* Tag allocated to CONGRUENT, May 2000. Used by IFSTEST. */ + IO_REPARSE_TAG_IFSTEST_CONGRUENT = cpu_to_le32(0x00000009), + + /* Tag allocated to ARKIVIO. */ + IO_REPARSE_TAG_ARKIVIO = cpu_to_le32(0x0000000C), + + /* Tag allocated to SOLUTIONSOFT. */ + IO_REPARSE_TAG_SOLUTIONSOFT = cpu_to_le32(0x2000000D), + + /* Tag allocated to COMMVAULT. */ + IO_REPARSE_TAG_COMMVAULT = cpu_to_le32(0x0000000E), + + /* OneDrive?? */ + IO_REPARSE_TAG_CLOUD = cpu_to_le32(0x9000001A), + IO_REPARSE_TAG_CLOUD_1 = cpu_to_le32(0x9000101A), + IO_REPARSE_TAG_CLOUD_2 = cpu_to_le32(0x9000201A), + IO_REPARSE_TAG_CLOUD_3 = cpu_to_le32(0x9000301A), + IO_REPARSE_TAG_CLOUD_4 = cpu_to_le32(0x9000401A), + IO_REPARSE_TAG_CLOUD_5 = cpu_to_le32(0x9000501A), + IO_REPARSE_TAG_CLOUD_6 = cpu_to_le32(0x9000601A), + IO_REPARSE_TAG_CLOUD_7 = cpu_to_le32(0x9000701A), + IO_REPARSE_TAG_CLOUD_8 = cpu_to_le32(0x9000801A), + IO_REPARSE_TAG_CLOUD_9 = cpu_to_le32(0x9000901A), + IO_REPARSE_TAG_CLOUD_A = cpu_to_le32(0x9000A01A), + IO_REPARSE_TAG_CLOUD_B = cpu_to_le32(0x9000B01A), + IO_REPARSE_TAG_CLOUD_C = cpu_to_le32(0x9000C01A), + IO_REPARSE_TAG_CLOUD_D = cpu_to_le32(0x9000D01A), + IO_REPARSE_TAG_CLOUD_E = cpu_to_le32(0x9000E01A), + IO_REPARSE_TAG_CLOUD_F = cpu_to_le32(0x9000F01A), + +}; + +#define SYMLINK_FLAG_RELATIVE 1 + +/* Microsoft reparse buffer. (see DDK for details) */ +struct REPARSE_DATA_BUFFER { + __le32 ReparseTag; // 0x00: + __le16 ReparseDataLength; // 0x04: + __le16 Reserved; + + union { + /* If ReparseTag == 0xA0000003 (IO_REPARSE_TAG_MOUNT_POINT) */ + struct { + __le16 SubstituteNameOffset; // 0x08 + __le16 SubstituteNameLength; // 0x0A + __le16 PrintNameOffset; // 0x0C + __le16 PrintNameLength; // 0x0E + __le16 PathBuffer[]; // 0x10 + } MountPointReparseBuffer; + + /* + * If ReparseTag == 0xA000000C (IO_REPARSE_TAG_SYMLINK) + * https://msdn.microsoft.com/en-us/library/cc232006.aspx + */ + struct { + __le16 SubstituteNameOffset; // 0x08 + __le16 SubstituteNameLength; // 0x0A + __le16 PrintNameOffset; // 0x0C + __le16 PrintNameLength; // 0x0E + // 0-absolute path 1- relative path, SYMLINK_FLAG_RELATIVE + __le32 Flags; // 0x10 + __le16 PathBuffer[]; // 0x14 + } SymbolicLinkReparseBuffer; + + /* If ReparseTag == 0x80000017U */ + struct { + __le32 WofVersion; // 0x08 == 1 + /* + * 1 - WIM backing provider ("WIMBoot"), + * 2 - System compressed file provider + */ + __le32 WofProvider; // 0x0C: + __le32 ProviderVer; // 0x10: == 1 WOF_FILE_PROVIDER_CURRENT_VERSION == 1 + __le32 CompressionFormat; // 0x14: 0, 1, 2, 3. See WOF_COMPRESSION_XXX + } CompressReparseBuffer; + + struct { + u8 DataBuffer[1]; // 0x08: + } GenericReparseBuffer; + }; +}; + +/* ATTR_EA_INFO (0xD0) */ + +#define FILE_NEED_EA 0x80 // See ntifs.h +/* + *FILE_NEED_EA, indicates that the file to which the EA belongs cannot be + * interpreted without understanding the associated extended attributes. + */ +struct EA_INFO { + __le16 size_pack; // 0x00: Size of buffer to hold in packed form. + __le16 count; // 0x02: Count of EA's with FILE_NEED_EA bit set. + __le32 size; // 0x04: Size of buffer to hold in unpacked form. +}; + +static_assert(sizeof(struct EA_INFO) == 8); + +/* ATTR_EA (0xE0) */ +struct EA_FULL { + __le32 size; // 0x00: (not in packed) + u8 flags; // 0x04: + u8 name_len; // 0x05: + __le16 elength; // 0x06: + u8 name[]; // 0x08: +}; + +static_assert(offsetof(struct EA_FULL, name) == 8); + +#define ACL_REVISION 2 +#define ACL_REVISION_DS 4 + +#define SE_SELF_RELATIVE cpu_to_le16(0x8000) + +struct SECURITY_DESCRIPTOR_RELATIVE { + u8 Revision; + u8 Sbz1; + __le16 Control; + __le32 Owner; + __le32 Group; + __le32 Sacl; + __le32 Dacl; +}; +static_assert(sizeof(struct SECURITY_DESCRIPTOR_RELATIVE) == 0x14); + +struct ACE_HEADER { + u8 AceType; + u8 AceFlags; + __le16 AceSize; +}; +static_assert(sizeof(struct ACE_HEADER) == 4); + +struct ACL { + u8 AclRevision; + u8 Sbz1; + __le16 AclSize; + __le16 AceCount; + __le16 Sbz2; +}; +static_assert(sizeof(struct ACL) == 8); + +struct SID { + u8 Revision; + u8 SubAuthorityCount; + u8 IdentifierAuthority[6]; + __le32 SubAuthority[]; +}; +static_assert(offsetof(struct SID, SubAuthority) == 8); + +#endif /* _LINUX_NTFS3_NTFS_H */ +// clang-format on diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h new file mode 100644 index 000000000..8c9abaf13 --- /dev/null +++ b/fs/ntfs3/ntfs_fs.h @@ -0,0 +1,1142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +// clang-format off +#ifndef _LINUX_NTFS3_NTFS_FS_H +#define _LINUX_NTFS3_NTFS_FS_H + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/page-flags.h> +#include <linux/pagemap.h> +#include <linux/rbtree.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/time64.h> +#include <linux/types.h> +#include <linux/uidgid.h> +#include <asm/div64.h> +#include <asm/page.h> + +#include "debug.h" +#include "ntfs.h" + +struct dentry; +struct fiemap_extent_info; +struct user_namespace; +struct page; +struct writeback_control; +enum utf16_endian; + + +#define MINUS_ONE_T ((size_t)(-1)) +/* Biggest MFT / smallest cluster */ +#define MAXIMUM_BYTES_PER_MFT 4096 +#define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512) + +#define MAXIMUM_BYTES_PER_INDEX 4096 +#define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512) + +/* NTFS specific error code when fixup failed. */ +#define E_NTFS_FIXUP 555 +/* NTFS specific error code about resident->nonresident. */ +#define E_NTFS_NONRESIDENT 556 +/* NTFS specific error code about punch hole. */ +#define E_NTFS_NOTALIGNED 557 +/* NTFS specific error code when on-disk struct is corrupted. */ +#define E_NTFS_CORRUPT 558 + + +/* sbi->flags */ +#define NTFS_FLAGS_NODISCARD 0x00000001 +/* Set when LogFile is replaying. */ +#define NTFS_FLAGS_LOG_REPLAYING 0x00000008 +/* Set when we changed first MFT's which copy must be updated in $MftMirr. */ +#define NTFS_FLAGS_MFTMIRR 0x00001000 +#define NTFS_FLAGS_NEED_REPLAY 0x04000000 + + +/* ni->ni_flags */ +/* + * Data attribute is external compressed (LZX/Xpress) + * 1 - WOF_COMPRESSION_XPRESS4K + * 2 - WOF_COMPRESSION_XPRESS8K + * 3 - WOF_COMPRESSION_XPRESS16K + * 4 - WOF_COMPRESSION_LZX32K + */ +#define NI_FLAG_COMPRESSED_MASK 0x0000000f +/* Data attribute is deduplicated. */ +#define NI_FLAG_DEDUPLICATED 0x00000010 +#define NI_FLAG_EA 0x00000020 +#define NI_FLAG_DIR 0x00000040 +#define NI_FLAG_RESIDENT 0x00000080 +#define NI_FLAG_UPDATE_PARENT 0x00000100 +// clang-format on + +struct ntfs_mount_options { + char *nls_name; + struct nls_table *nls; + + kuid_t fs_uid; + kgid_t fs_gid; + u16 fs_fmask_inv; + u16 fs_dmask_inv; + + unsigned fmask : 1; /* fmask was set. */ + unsigned dmask : 1; /*dmask was set. */ + unsigned sys_immutable : 1; /* Immutable system files. */ + unsigned discard : 1; /* Issue discard requests on deletions. */ + unsigned sparse : 1; /* Create sparse files. */ + unsigned showmeta : 1; /* Show meta files. */ + unsigned nohidden : 1; /* Do not show hidden files. */ + unsigned force : 1; /* RW mount dirty volume. */ + unsigned noacsrules : 1; /* Exclude acs rules. */ + unsigned prealloc : 1; /* Preallocate space when file is growing. */ +}; + +/* Special value to unpack and deallocate. */ +#define RUN_DEALLOCATE ((struct runs_tree *)(size_t)1) + +/* TODO: Use rb tree instead of array. */ +struct runs_tree { + struct ntfs_run *runs; + size_t count; /* Currently used size a ntfs_run storage. */ + size_t allocated; /* Currently allocated ntfs_run storage size. */ +}; + +struct ntfs_buffers { + /* Biggest MFT / smallest cluster = 4096 / 512 = 8 */ + /* Biggest index / smallest cluster = 4096 / 512 = 8 */ + struct buffer_head *bh[PAGE_SIZE >> SECTOR_SHIFT]; + u32 bytes; + u32 nbufs; + u32 off; +}; + +enum ALLOCATE_OPT { + ALLOCATE_DEF = 0, // Allocate all clusters. + ALLOCATE_MFT = 1, // Allocate for MFT. +}; + +enum bitmap_mutex_classes { + BITMAP_MUTEX_CLUSTERS = 0, + BITMAP_MUTEX_MFT = 1, +}; + +struct wnd_bitmap { + struct super_block *sb; + struct rw_semaphore rw_lock; + + struct runs_tree run; + size_t nbits; + + size_t total_zeroes; // Total number of free bits. + u16 *free_bits; // Free bits in each window. + size_t nwnd; + u32 bits_last; // Bits in last window. + + struct rb_root start_tree; // Extents, sorted by 'start'. + struct rb_root count_tree; // Extents, sorted by 'count + start'. + size_t count; // Extents count. + + /* + * -1 Tree is activated but not updated (too many fragments). + * 0 - Tree is not activated. + * 1 - Tree is activated and updated. + */ + int uptodated; + size_t extent_min; // Minimal extent used while building. + size_t extent_max; // Upper estimate of biggest free block. + + /* Zone [bit, end) */ + size_t zone_bit; + size_t zone_end; + + bool set_tail; // Not necessary in driver. + bool inited; +}; + +typedef int (*NTFS_CMP_FUNC)(const void *key1, size_t len1, const void *key2, + size_t len2, const void *param); + +enum index_mutex_classed { + INDEX_MUTEX_I30 = 0, + INDEX_MUTEX_SII = 1, + INDEX_MUTEX_SDH = 2, + INDEX_MUTEX_SO = 3, + INDEX_MUTEX_SQ = 4, + INDEX_MUTEX_SR = 5, + INDEX_MUTEX_TOTAL +}; + +/* ntfs_index - Allocation unit inside directory. */ +struct ntfs_index { + struct runs_tree bitmap_run; + struct runs_tree alloc_run; + /* read/write access to 'bitmap_run'/'alloc_run' while ntfs_readdir */ + struct rw_semaphore run_lock; + + /*TODO: Remove 'cmp'. */ + NTFS_CMP_FUNC cmp; + + u8 index_bits; // log2(root->index_block_size) + u8 idx2vbn_bits; // log2(root->index_block_clst) + u8 vbn2vbo_bits; // index_block_size < cluster? 9 : cluster_bits + u8 type; // index_mutex_classed +}; + +/* Minimum MFT zone. */ +#define NTFS_MIN_MFT_ZONE 100 + +/* Ntfs file system in-core superblock data. */ +struct ntfs_sb_info { + struct super_block *sb; + + u32 discard_granularity; + u64 discard_granularity_mask_inv; // ~(discard_granularity_mask_inv-1) + + u32 cluster_size; // bytes per cluster + u32 cluster_mask; // == cluster_size - 1 + u64 cluster_mask_inv; // ~(cluster_size - 1) + u32 block_mask; // sb->s_blocksize - 1 + u32 blocks_per_cluster; // cluster_size / sb->s_blocksize + + u32 record_size; + u32 index_size; + + u8 cluster_bits; + u8 record_bits; + + u64 maxbytes; // Maximum size for normal files. + u64 maxbytes_sparse; // Maximum size for sparse file. + + u32 flags; // See NTFS_FLAGS_XXX. + + CLST zone_max; // Maximum MFT zone length in clusters + CLST bad_clusters; // The count of marked bad clusters. + + u16 max_bytes_per_attr; // Maximum attribute size in record. + u16 attr_size_tr; // Attribute size threshold (320 bytes). + + /* Records in $Extend. */ + CLST objid_no; + CLST quota_no; + CLST reparse_no; + CLST usn_jrnl_no; + + struct ATTR_DEF_ENTRY *def_table; // Attribute definition table. + u32 def_entries; + u32 ea_max_size; + + struct MFT_REC *new_rec; + + u16 *upcase; + + struct { + u64 lbo, lbo2; + struct ntfs_inode *ni; + struct wnd_bitmap bitmap; // $MFT::Bitmap + /* + * MFT records [11-24) used to expand MFT itself. + * They always marked as used in $MFT::Bitmap + * 'reserved_bitmap' contains real bitmap of these records. + */ + ulong reserved_bitmap; // Bitmap of used records [11 - 24) + size_t next_free; // The next record to allocate from + size_t used; // MFT valid size in records. + u32 recs_mirr; // Number of records in MFTMirr + u8 next_reserved; + u8 reserved_bitmap_inited; + } mft; + + struct { + struct wnd_bitmap bitmap; // $Bitmap::Data + CLST next_free_lcn; + } used; + + struct { + u64 size; // In bytes. + u64 blocks; // In blocks. + u64 ser_num; + struct ntfs_inode *ni; + __le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY. + u8 major_ver; + u8 minor_ver; + char label[65]; + bool real_dirty; // Real fs state. + } volume; + + struct { + struct ntfs_index index_sii; + struct ntfs_index index_sdh; + struct ntfs_inode *ni; + u32 next_id; + u64 next_off; + + __le32 def_security_id; + } security; + + struct { + struct ntfs_index index_r; + struct ntfs_inode *ni; + u64 max_size; // 16K + } reparse; + + struct { + struct ntfs_index index_o; + struct ntfs_inode *ni; + } objid; + + struct { + struct mutex mtx_lznt; + struct lznt *lznt; +#ifdef CONFIG_NTFS3_LZX_XPRESS + struct mutex mtx_xpress; + struct xpress_decompressor *xpress; + struct mutex mtx_lzx; + struct lzx_decompressor *lzx; +#endif + } compress; + + struct ntfs_mount_options *options; + struct ratelimit_state msg_ratelimit; +}; + +/* One MFT record(usually 1024 bytes), consists of attributes. */ +struct mft_inode { + struct rb_node node; + struct ntfs_sb_info *sbi; + + struct MFT_REC *mrec; + struct ntfs_buffers nb; + + CLST rno; + bool dirty; +}; + +/* Nested class for ntfs_inode::ni_lock. */ +enum ntfs_inode_mutex_lock_class { + NTFS_INODE_MUTEX_DIRTY, + NTFS_INODE_MUTEX_SECURITY, + NTFS_INODE_MUTEX_OBJID, + NTFS_INODE_MUTEX_REPARSE, + NTFS_INODE_MUTEX_NORMAL, + NTFS_INODE_MUTEX_PARENT, +}; + +/* + * sturct ntfs_inode + * + * Ntfs inode - extends linux inode. consists of one or more MFT inodes. + */ +struct ntfs_inode { + struct mft_inode mi; // base record + + /* + * Valid size: [0 - i_valid) - these range in file contains valid data. + * Range [i_valid - inode->i_size) - contains 0. + * Usually i_valid <= inode->i_size. + */ + u64 i_valid; + struct timespec64 i_crtime; + + struct mutex ni_lock; + + /* File attributes from std. */ + enum FILE_ATTRIBUTE std_fa; + __le32 std_security_id; + + /* + * Tree of mft_inode. + * Not empty when primary MFT record (usually 1024 bytes) can't save all attributes + * e.g. file becomes too fragmented or contains a lot of names. + */ + struct rb_root mi_tree; + + /* + * This member is used in ntfs_readdir to ensure that all subrecords are loaded + */ + u8 mi_loaded; + + union { + struct ntfs_index dir; + struct { + struct rw_semaphore run_lock; + struct runs_tree run; +#ifdef CONFIG_NTFS3_LZX_XPRESS + struct page *offs_page; +#endif + } file; + }; + + struct { + struct runs_tree run; + struct ATTR_LIST_ENTRY *le; // 1K aligned memory. + size_t size; + bool dirty; + } attr_list; + + size_t ni_flags; // NI_FLAG_XXX + + struct inode vfs_inode; +}; + +struct indx_node { + struct ntfs_buffers nb; + struct INDEX_BUFFER *index; +}; + +struct ntfs_fnd { + int level; + struct indx_node *nodes[20]; + struct NTFS_DE *de[20]; + struct NTFS_DE *root_de; +}; + +enum REPARSE_SIGN { + REPARSE_NONE = 0, + REPARSE_COMPRESSED = 1, + REPARSE_DEDUPLICATED = 2, + REPARSE_LINK = 3 +}; + +/* Functions from attrib.c */ +int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, + CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, + enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, + CLST *new_lcn); +int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, + u64 new_size, struct runs_tree *run, + struct ATTRIB **ins_attr, struct page *page); +int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + u64 new_size, const u64 *new_valid, bool keep_prealloc, + struct ATTRIB **ret); +int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, + CLST *len, bool *new); +int attr_data_read_resident(struct ntfs_inode *ni, struct page *page); +int attr_data_write_resident(struct ntfs_inode *ni, struct page *page); +int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + CLST vcn); +int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + u64 from, u64 to); +int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, + struct runs_tree *run, u64 frame, u64 frames, + u8 frame_bits, u32 *ondisk_size, u64 *vbo_data); +int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, + CLST frame, CLST *clst_data); +int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, + u64 new_valid); +int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); +int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); +int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size); + +/* Functions from attrlist.c */ +void al_destroy(struct ntfs_inode *ni); +bool al_verify(struct ntfs_inode *ni); +int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr); +struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le); +struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le, + const struct ATTRIB *attr); +struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, const CLST *vcn); +int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, + u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref, + struct ATTR_LIST_ENTRY **new_le); +bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); +bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, + const __le16 *name, size_t name_len, + const struct MFT_REF *ref); +int al_update(struct ntfs_inode *ni, int sync); +static inline size_t al_aligned(size_t size) +{ + return (size + 1023) & ~(size_t)1023; +} + +/* Globals from bitfunc.c */ +bool are_bits_clear(const ulong *map, size_t bit, size_t nbits); +bool are_bits_set(const ulong *map, size_t bit, size_t nbits); +size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits); + +/* Globals from dir.c */ +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, + u8 *buf, int buf_len); +int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, + struct cpu_str *uni, u32 max_ulen, + enum utf16_endian endian); +struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni, + struct ntfs_fnd *fnd); +bool dir_is_empty(struct inode *dir); +extern const struct file_operations ntfs_dir_operations; + +/* Globals from file.c */ +int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, u32 flags); +void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, + CLST len); +int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); +int ntfs_file_open(struct inode *inode, struct file *file); +int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern const struct inode_operations ntfs_special_inode_operations; +extern const struct inode_operations ntfs_file_inode_operations; +extern const struct file_operations ntfs_file_operations; + +/* Globals from frecord.c */ +void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi); +struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni); +struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni); +void ni_clear(struct ntfs_inode *ni); +int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); +int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le, + struct mft_inode **mi); +struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY **entry_o, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, const CLST *vcn, + struct mft_inode **mi); +struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY **le, + struct mft_inode **mi); +struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, CLST vcn, + struct mft_inode **pmi); +int ni_load_all_mi(struct ntfs_inode *ni); +bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); +int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, size_t name_len, bool base_only, + const __le16 *id); +int ni_create_attr_list(struct ntfs_inode *ni); +int ni_expand_list(struct ntfs_inode *ni); +int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, + const struct runs_tree *run, CLST svcn, CLST len, + __le16 flags, struct ATTRIB **new_attr, + struct mft_inode **mi, struct ATTR_LIST_ENTRY **le); +int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, + enum ATTR_TYPE type, const __le16 *name, u8 name_len, + struct ATTRIB **new_attr, struct mft_inode **mi, + struct ATTR_LIST_ENTRY **le); +void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr, + struct mft_inode *mi, struct ATTR_LIST_ENTRY *le); +int ni_delete_all(struct ntfs_inode *ni); +struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, + const struct cpu_str *uni, + const struct MFT_REF *home, + struct mft_inode **mi, + struct ATTR_LIST_ENTRY **entry); +struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, + struct mft_inode **mi, + struct ATTR_LIST_ENTRY **entry); +int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa); +enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, + struct REPARSE_DATA_BUFFER *buffer); +int ni_write_inode(struct inode *inode, int sync, const char *hint); +#define _ni_write_inode(i, w) ni_write_inode(i, w, __func__) +int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, + __u64 vbo, __u64 len); +int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page); +int ni_decompress_file(struct ntfs_inode *ni); +int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, + u32 pages_per_frame); +int ni_write_frame(struct ntfs_inode *ni, struct page **pages, + u32 pages_per_frame); +int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step); + +bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de, struct NTFS_DE *de2, + int undo_step); + +int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de); + +int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, + struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de, + bool *is_bad); + +bool ni_is_dirty(struct inode *inode); + +/* Globals from fslog.c */ +bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); +int log_replay(struct ntfs_inode *ni, bool *initialized); + +/* Globals from fsntfs.c */ +bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes); +int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, + bool simple); +int ntfs_extend_init(struct ntfs_sb_info *sbi); +int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi); +const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi, + enum ATTR_TYPE Type); +int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, + CLST *new_lcn, CLST *new_len, + enum ALLOCATE_OPT opt); +int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, + struct ntfs_inode *ni, struct mft_inode **mi); +void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft); +int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to); +int ntfs_refresh_zone(struct ntfs_sb_info *sbi); +void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait); +void ntfs_bad_inode(struct inode *inode, const char *hint); +#define _ntfs_bad_inode(i) ntfs_bad_inode(i, __func__) +enum NTFS_DIRTY_FLAGS { + NTFS_DIRTY_CLEAR = 0, + NTFS_DIRTY_DIRTY = 1, + NTFS_DIRTY_ERROR = 2, +}; +int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty); +int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer); +int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, + const void *buffer, int wait); +int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, const void *buf, size_t bytes, int sync); +struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, + const struct runs_tree *run, u64 vbo); +int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb); +int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, + struct NTFS_RECORD_HEADER *rhdr, u32 bytes, + struct ntfs_buffers *nb); +int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, + u32 bytes, struct ntfs_buffers *nb); +int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, + struct ntfs_buffers *nb, int sync); +int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, + struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, + enum req_op op); +int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run); +int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, u64 *lbo, u64 *bytes); +struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST nRec, + bool dir); +extern const u8 s_default_security[0x50]; +bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len); +int ntfs_security_init(struct ntfs_sb_info *sbi); +int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id, + struct SECURITY_DESCRIPTOR_RELATIVE **sd, + size_t *size); +int ntfs_insert_security(struct ntfs_sb_info *sbi, + const struct SECURITY_DESCRIPTOR_RELATIVE *sd, + u32 size, __le32 *security_id, bool *inserted); +int ntfs_reparse_init(struct ntfs_sb_info *sbi); +int ntfs_objid_init(struct ntfs_sb_info *sbi); +int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid); +int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag, + const struct MFT_REF *ref); +int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag, + const struct MFT_REF *ref); +void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim); +int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim); + +/* Globals from index.c */ +int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit); +void fnd_clear(struct ntfs_fnd *fnd); +static inline struct ntfs_fnd *fnd_get(void) +{ + return kzalloc(sizeof(struct ntfs_fnd), GFP_NOFS); +} +static inline void fnd_put(struct ntfs_fnd *fnd) +{ + if (fnd) { + fnd_clear(fnd); + kfree(fnd); + } +} +void indx_clear(struct ntfs_index *idx); +int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, + const struct ATTRIB *attr, enum index_mutex_classed type); +struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, + struct ATTRIB **attr, struct mft_inode **mi); +int indx_read(struct ntfs_index *idx, struct ntfs_inode *ni, CLST vbn, + struct indx_node **node); +int indx_find(struct ntfs_index *indx, struct ntfs_inode *dir, + const struct INDEX_ROOT *root, const void *Key, size_t KeyLen, + const void *param, int *diff, struct NTFS_DE **entry, + struct ntfs_fnd *fnd); +int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct INDEX_ROOT *root, struct NTFS_DE **entry, + struct ntfs_fnd *fnd); +int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct INDEX_ROOT *root, struct NTFS_DE **entry, + size_t *off, struct ntfs_fnd *fnd); +int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct NTFS_DE *new_de, const void *param, + struct ntfs_fnd *fnd, bool undo); +int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, + const void *key, u32 key_len, const void *param); +int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, + const struct ATTR_FILE_NAME *fname, + const struct NTFS_DUP_INFO *dup, int sync); + +/* Globals from inode.c */ +struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, + const struct cpu_str *name); +int ntfs_set_size(struct inode *inode, u64 new_size); +int reset_log_file(struct inode *inode); +int ntfs_get_block(struct inode *inode, sector_t vbn, + struct buffer_head *bh_result, int create); +int ntfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, u32 len, struct page **pagep, void **fsdata); +int ntfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, u32 len, u32 copied, struct page *page, + void *fsdata); +int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); +int ntfs_sync_inode(struct inode *inode); +int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, + struct inode *i2); +int inode_write_data(struct inode *inode, const void *data, size_t bytes); +struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const struct cpu_str *uni, umode_t mode, + dev_t dev, const char *symname, u32 size, + struct ntfs_fnd *fnd); +int ntfs_link_inode(struct inode *inode, struct dentry *dentry); +int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry); +void ntfs_evict_inode(struct inode *inode); +extern const struct inode_operations ntfs_link_inode_operations; +extern const struct address_space_operations ntfs_aops; +extern const struct address_space_operations ntfs_aops_cmpr; + +/* Globals from name_i.c */ +int fill_name_de(struct ntfs_sb_info *sbi, void *buf, const struct qstr *name, + const struct cpu_str *uni); +struct dentry *ntfs3_get_parent(struct dentry *child); + +extern const struct inode_operations ntfs_dir_inode_operations; +extern const struct inode_operations ntfs_special_inode_operations; + +/* Globals from record.c */ +int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi); +void mi_put(struct mft_inode *mi); +int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno); +int mi_read(struct mft_inode *mi, bool is_mft); +struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr); +// TODO: id? +struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, + enum ATTR_TYPE type, const __le16 *name, + size_t name_len, const __le16 *id); +static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec, + struct ATTR_LIST_ENTRY *le) +{ + return mi_find_attr(rec, NULL, le->type, le_name(le), le->name_len, + &le->id); +} +int mi_write(struct mft_inode *mi, int wait); +int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, + __le16 flags, bool is_mft); +struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, u32 asize, + u16 name_off); + +bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr); +bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes); +int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr, + struct runs_tree *run, CLST len); +static inline bool mi_is_ref(const struct mft_inode *mi, + const struct MFT_REF *ref) +{ + if (le32_to_cpu(ref->low) != mi->rno) + return false; + if (ref->seq != mi->mrec->seq) + return false; + +#ifdef CONFIG_NTFS3_64BIT_CLUSTER + return le16_to_cpu(ref->high) == (mi->rno >> 32); +#else + return !ref->high; +#endif +} + +static inline void mi_get_ref(const struct mft_inode *mi, struct MFT_REF *ref) +{ + ref->low = cpu_to_le32(mi->rno); +#ifdef CONFIG_NTFS3_64BIT_CLUSTER + ref->high = cpu_to_le16(mi->rno >> 32); +#else + ref->high = 0; +#endif + ref->seq = mi->mrec->seq; +} + +/* Globals from run.c */ +bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn, + CLST *len, size_t *index); +void run_truncate(struct runs_tree *run, CLST vcn); +void run_truncate_head(struct runs_tree *run, CLST vcn); +void run_truncate_around(struct runs_tree *run, CLST vcn); +bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len, + bool is_mft); +bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len); +bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len); +bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn, + CLST *lcn, CLST *len); +bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn); + +int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, + u32 run_buf_size, CLST *packed_vcns); +int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, + CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, + int run_buf_size); + +#ifdef NTFS3_CHECK_FREE_CLST +int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, + CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, + int run_buf_size); +#else +#define run_unpack_ex run_unpack +#endif +int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn); +int run_clone(const struct runs_tree *run, struct runs_tree *new_run); + +/* Globals from super.c */ +void *ntfs_set_shared(void *ptr, u32 bytes); +void *ntfs_put_shared(void *ptr); +void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len); +int ntfs_discard(struct ntfs_sb_info *sbi, CLST Lcn, CLST Len); + +/* Globals from bitmap.c*/ +int __init ntfs3_init_bitmap(void); +void ntfs3_exit_bitmap(void); +void wnd_close(struct wnd_bitmap *wnd); +static inline size_t wnd_zeroes(const struct wnd_bitmap *wnd) +{ + return wnd->total_zeroes; +} +int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits); +int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits); +int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits); +bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits); +bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits); + +/* Possible values for 'flags' 'wnd_find'. */ +#define BITMAP_FIND_MARK_AS_USED 0x01 +#define BITMAP_FIND_FULL 0x02 +size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, + size_t flags, size_t *allocated); +int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits); +void wnd_zone_set(struct wnd_bitmap *wnd, size_t Lcn, size_t Len); +int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range); + +/* Globals from upcase.c */ +int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2, + const u16 *upcase, bool bothcase); +int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2, + const u16 *upcase, bool bothcase); + +/* globals from xattr.c */ +#ifdef CONFIG_NTFS3_FS_POSIX_ACL +struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu); +int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); +int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct inode *dir); +#else +#define ntfs_get_acl NULL +#define ntfs_set_acl NULL +#endif + +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode); +int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); +ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); +extern const struct xattr_handler *ntfs_xattr_handlers[]; + +int ntfs_save_wsl_perm(struct inode *inode); +void ntfs_get_wsl_perm(struct inode *inode); + +/* globals from lznt.c */ +struct lznt *get_lznt_ctx(int level); +size_t compress_lznt(const void *uncompressed, size_t uncompressed_size, + void *compressed, size_t compressed_size, + struct lznt *ctx); +ssize_t decompress_lznt(const void *compressed, size_t compressed_size, + void *uncompressed, size_t uncompressed_size); + +static inline bool is_ntfs3(struct ntfs_sb_info *sbi) +{ + return sbi->volume.major_ver >= 3; +} + +/* (sb->s_flags & SB_ACTIVE) */ +static inline bool is_mounted(struct ntfs_sb_info *sbi) +{ + return !!sbi->sb->s_root; +} + +static inline bool ntfs_is_meta_file(struct ntfs_sb_info *sbi, CLST rno) +{ + return rno < MFT_REC_FREE || rno == sbi->objid_no || + rno == sbi->quota_no || rno == sbi->reparse_no || + rno == sbi->usn_jrnl_no; +} + +static inline void ntfs_unmap_page(struct page *page) +{ + kunmap(page); + put_page(page); +} + +static inline struct page *ntfs_map_page(struct address_space *mapping, + unsigned long index) +{ + struct page *page = read_mapping_page(mapping, index, NULL); + + if (!IS_ERR(page)) + kmap(page); + return page; +} + +static inline size_t wnd_zone_bit(const struct wnd_bitmap *wnd) +{ + return wnd->zone_bit; +} + +static inline size_t wnd_zone_len(const struct wnd_bitmap *wnd) +{ + return wnd->zone_end - wnd->zone_bit; +} + +static inline void run_init(struct runs_tree *run) +{ + run->runs = NULL; + run->count = 0; + run->allocated = 0; +} + +static inline struct runs_tree *run_alloc(void) +{ + return kzalloc(sizeof(struct runs_tree), GFP_NOFS); +} + +static inline void run_close(struct runs_tree *run) +{ + kvfree(run->runs); + memset(run, 0, sizeof(*run)); +} + +static inline void run_free(struct runs_tree *run) +{ + if (run) { + kvfree(run->runs); + kfree(run); + } +} + +static inline bool run_is_empty(struct runs_tree *run) +{ + return !run->count; +} + +/* NTFS uses quad aligned bitmaps. */ +static inline size_t bitmap_size(size_t bits) +{ + return ALIGN((bits + 7) >> 3, 8); +} + +#define _100ns2seconds 10000000 +#define SecondsToStartOf1970 0x00000002B6109100 + +#define NTFS_TIME_GRAN 100 + +/* + * kernel2nt - Converts in-memory kernel timestamp into nt time. + */ +static inline __le64 kernel2nt(const struct timespec64 *ts) +{ + // 10^7 units of 100 nanoseconds one second + return cpu_to_le64(_100ns2seconds * + (ts->tv_sec + SecondsToStartOf1970) + + ts->tv_nsec / NTFS_TIME_GRAN); +} + +/* + * nt2kernel - Converts on-disk nt time into kernel timestamp. + */ +static inline void nt2kernel(const __le64 tm, struct timespec64 *ts) +{ + u64 t = le64_to_cpu(tm) - _100ns2seconds * SecondsToStartOf1970; + + // WARNING: do_div changes its first argument(!) + ts->tv_nsec = do_div(t, _100ns2seconds) * 100; + ts->tv_sec = t; +} + +static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * ntfs_up_cluster - Align up on cluster boundary. + */ +static inline u64 ntfs_up_cluster(const struct ntfs_sb_info *sbi, u64 size) +{ + return (size + sbi->cluster_mask) & sbi->cluster_mask_inv; +} + +/* + * ntfs_up_block - Align up on cluster boundary. + */ +static inline u64 ntfs_up_block(const struct super_block *sb, u64 size) +{ + return (size + sb->s_blocksize - 1) & ~(u64)(sb->s_blocksize - 1); +} + +static inline CLST bytes_to_cluster(const struct ntfs_sb_info *sbi, u64 size) +{ + return (size + sbi->cluster_mask) >> sbi->cluster_bits; +} + +static inline u64 bytes_to_block(const struct super_block *sb, u64 size) +{ + return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; +} + +static inline struct buffer_head *ntfs_bread(struct super_block *sb, + sector_t block) +{ + struct buffer_head *bh = sb_bread(sb, block); + + if (bh) + return bh; + + ntfs_err(sb, "failed to read volume at offset 0x%llx", + (u64)block << sb->s_blocksize_bits); + return NULL; +} + +static inline struct ntfs_inode *ntfs_i(struct inode *inode) +{ + return container_of(inode, struct ntfs_inode, vfs_inode); +} + +static inline bool is_compressed(const struct ntfs_inode *ni) +{ + return (ni->std_fa & FILE_ATTRIBUTE_COMPRESSED) || + (ni->ni_flags & NI_FLAG_COMPRESSED_MASK); +} + +static inline int ni_ext_compress_bits(const struct ntfs_inode *ni) +{ + return 0xb + (ni->ni_flags & NI_FLAG_COMPRESSED_MASK); +} + +/* Bits - 0xc, 0xd, 0xe, 0xf, 0x10 */ +static inline void ni_set_ext_compress_bits(struct ntfs_inode *ni, u8 bits) +{ + ni->ni_flags |= (bits - 0xb) & NI_FLAG_COMPRESSED_MASK; +} + +static inline bool is_dedup(const struct ntfs_inode *ni) +{ + return ni->ni_flags & NI_FLAG_DEDUPLICATED; +} + +static inline bool is_encrypted(const struct ntfs_inode *ni) +{ + return ni->std_fa & FILE_ATTRIBUTE_ENCRYPTED; +} + +static inline bool is_sparsed(const struct ntfs_inode *ni) +{ + return ni->std_fa & FILE_ATTRIBUTE_SPARSE_FILE; +} + +static inline int is_resident(struct ntfs_inode *ni) +{ + return ni->ni_flags & NI_FLAG_RESIDENT; +} + +static inline void le16_sub_cpu(__le16 *var, u16 val) +{ + *var = cpu_to_le16(le16_to_cpu(*var) - val); +} + +static inline void le32_sub_cpu(__le32 *var, u32 val) +{ + *var = cpu_to_le32(le32_to_cpu(*var) - val); +} + +static inline void nb_put(struct ntfs_buffers *nb) +{ + u32 i, nbufs = nb->nbufs; + + if (!nbufs) + return; + + for (i = 0; i < nbufs; i++) + put_bh(nb->bh[i]); + nb->nbufs = 0; +} + +static inline void put_indx_node(struct indx_node *in) +{ + if (!in) + return; + + kfree(in->index); + nb_put(&in->nb); + kfree(in); +} + +static inline void mi_clear(struct mft_inode *mi) +{ + nb_put(&mi->nb); + kfree(mi->mrec); + mi->mrec = NULL; +} + +static inline void ni_lock(struct ntfs_inode *ni) +{ + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_NORMAL); +} + +static inline void ni_lock_dir(struct ntfs_inode *ni) +{ + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT); +} + +static inline void ni_unlock(struct ntfs_inode *ni) +{ + mutex_unlock(&ni->ni_lock); +} + +static inline int ni_trylock(struct ntfs_inode *ni) +{ + return mutex_trylock(&ni->ni_lock); +} + +static inline int attr_load_runs_attr(struct ntfs_inode *ni, + struct ATTRIB *attr, + struct runs_tree *run, CLST vcn) +{ + return attr_load_runs_vcn(ni, attr->type, attr_name(attr), + attr->name_len, run, vcn); +} + +static inline void le64_sub_cpu(__le64 *var, u64 val) +{ + *var = cpu_to_le64(le64_to_cpu(*var) - val); +} + +#endif /* _LINUX_NTFS3_NTFS_FS_H */ diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c new file mode 100644 index 000000000..ba336c728 --- /dev/null +++ b/fs/ntfs3/record.c @@ -0,0 +1,595 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/fs.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +static inline int compare_attr(const struct ATTRIB *left, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, + const u16 *upcase) +{ + /* First, compare the type codes. */ + int diff = le32_to_cpu(left->type) - le32_to_cpu(type); + + if (diff) + return diff; + + /* They have the same type code, so we have to compare the names. */ + return ntfs_cmp_names(attr_name(left), left->name_len, name, name_len, + upcase, true); +} + +/* + * mi_new_attt_id + * + * Return: Unused attribute id that is less than mrec->next_attr_id. + */ +static __le16 mi_new_attt_id(struct mft_inode *mi) +{ + u16 free_id, max_id, t16; + struct MFT_REC *rec = mi->mrec; + struct ATTRIB *attr; + __le16 id; + + id = rec->next_attr_id; + free_id = le16_to_cpu(id); + if (free_id < 0x7FFF) { + rec->next_attr_id = cpu_to_le16(free_id + 1); + return id; + } + + /* One record can store up to 1024/24 ~= 42 attributes. */ + free_id = 0; + max_id = 0; + + attr = NULL; + + for (;;) { + attr = mi_enum_attr(mi, attr); + if (!attr) { + rec->next_attr_id = cpu_to_le16(max_id + 1); + mi->dirty = true; + return cpu_to_le16(free_id); + } + + t16 = le16_to_cpu(attr->id); + if (t16 == free_id) { + free_id += 1; + attr = NULL; + } else if (max_id < t16) + max_id = t16; + } +} + +int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi) +{ + int err; + struct mft_inode *m = kzalloc(sizeof(struct mft_inode), GFP_NOFS); + + if (!m) + return -ENOMEM; + + err = mi_init(m, sbi, rno); + if (err) { + kfree(m); + return err; + } + + err = mi_read(m, false); + if (err) { + mi_put(m); + return err; + } + + *mi = m; + return 0; +} + +void mi_put(struct mft_inode *mi) +{ + mi_clear(mi); + kfree(mi); +} + +int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno) +{ + mi->sbi = sbi; + mi->rno = rno; + mi->mrec = kmalloc(sbi->record_size, GFP_NOFS); + if (!mi->mrec) + return -ENOMEM; + + return 0; +} + +/* + * mi_read - Read MFT data. + */ +int mi_read(struct mft_inode *mi, bool is_mft) +{ + int err; + struct MFT_REC *rec = mi->mrec; + struct ntfs_sb_info *sbi = mi->sbi; + u32 bpr = sbi->record_size; + u64 vbo = (u64)mi->rno << sbi->record_bits; + struct ntfs_inode *mft_ni = sbi->mft.ni; + struct runs_tree *run = mft_ni ? &mft_ni->file.run : NULL; + struct rw_semaphore *rw_lock = NULL; + + if (is_mounted(sbi)) { + if (!is_mft && mft_ni) { + rw_lock = &mft_ni->file.run_lock; + down_read(rw_lock); + } + } + + err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb); + if (rw_lock) + up_read(rw_lock); + if (!err) + goto ok; + + if (err == -E_NTFS_FIXUP) { + mi->dirty = true; + goto ok; + } + + if (err != -ENOENT) + goto out; + + if (rw_lock) { + ni_lock(mft_ni); + down_write(rw_lock); + } + err = attr_load_runs_vcn(mft_ni, ATTR_DATA, NULL, 0, run, + vbo >> sbi->cluster_bits); + if (rw_lock) { + up_write(rw_lock); + ni_unlock(mft_ni); + } + if (err) + goto out; + + if (rw_lock) + down_read(rw_lock); + err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb); + if (rw_lock) + up_read(rw_lock); + + if (err == -E_NTFS_FIXUP) { + mi->dirty = true; + goto ok; + } + if (err) + goto out; + +ok: + /* Check field 'total' only here. */ + if (le32_to_cpu(rec->total) != bpr) { + err = -EINVAL; + goto out; + } + + return 0; + +out: + if (err == -E_NTFS_CORRUPT) { + ntfs_err(sbi->sb, "mft corrupted"); + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + err = -EINVAL; + } + + return err; +} + +struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) +{ + const struct MFT_REC *rec = mi->mrec; + u32 used = le32_to_cpu(rec->used); + u32 t32, off, asize; + u16 t16; + + if (!attr) { + u32 total = le32_to_cpu(rec->total); + + off = le16_to_cpu(rec->attr_off); + + if (used > total) + return NULL; + + if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 || + !IS_ALIGNED(off, 4)) { + return NULL; + } + + /* Skip non-resident records. */ + if (!is_rec_inuse(rec)) + return NULL; + + attr = Add2Ptr(rec, off); + } else { + /* Check if input attr inside record. */ + off = PtrOffset(rec, attr); + if (off >= used) + return NULL; + + asize = le32_to_cpu(attr->size); + if (asize < SIZEOF_RESIDENT) { + /* Impossible 'cause we should not return such attribute. */ + return NULL; + } + + if (off + asize < off) { + /* overflow check */ + return NULL; + } + + attr = Add2Ptr(attr, asize); + off += asize; + } + + asize = le32_to_cpu(attr->size); + + /* Can we use the first field (attr->type). */ + if (off + 8 > used) { + static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8); + return NULL; + } + + if (attr->type == ATTR_END) { + /* End of enumeration. */ + return NULL; + } + + /* 0x100 is last known attribute for now. */ + t32 = le32_to_cpu(attr->type); + if ((t32 & 0xf) || (t32 > 0x100)) + return NULL; + + /* Check boundary. */ + if (off + asize > used) + return NULL; + + /* Check size of attribute. */ + if (!attr->non_res) { + if (asize < SIZEOF_RESIDENT) + return NULL; + + t16 = le16_to_cpu(attr->res.data_off); + + if (t16 > asize) + return NULL; + + t32 = le32_to_cpu(attr->res.data_size); + if (t16 + t32 > asize) + return NULL; + + if (attr->name_len && + le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len > t16) { + return NULL; + } + + return attr; + } + + /* Check some nonresident fields. */ + if (attr->name_len && + le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len > + le16_to_cpu(attr->nres.run_off)) { + return NULL; + } + + if (attr->nres.svcn || !is_attr_ext(attr)) { + if (asize + 8 < SIZEOF_NONRESIDENT) + return NULL; + + if (attr->nres.c_unit) + return NULL; + } else if (asize + 8 < SIZEOF_NONRESIDENT_EX) + return NULL; + + return attr; +} + +/* + * mi_find_attr - Find the attribute by type and name and id. + */ +struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, + enum ATTR_TYPE type, const __le16 *name, + size_t name_len, const __le16 *id) +{ + u32 type_in = le32_to_cpu(type); + u32 atype; + +next_attr: + attr = mi_enum_attr(mi, attr); + if (!attr) + return NULL; + + atype = le32_to_cpu(attr->type); + if (atype > type_in) + return NULL; + + if (atype < type_in) + goto next_attr; + + if (attr->name_len != name_len) + goto next_attr; + + if (name_len && memcmp(attr_name(attr), name, name_len * sizeof(short))) + goto next_attr; + + if (id && *id != attr->id) + goto next_attr; + + return attr; +} + +int mi_write(struct mft_inode *mi, int wait) +{ + struct MFT_REC *rec; + int err; + struct ntfs_sb_info *sbi; + + if (!mi->dirty) + return 0; + + sbi = mi->sbi; + rec = mi->mrec; + + err = ntfs_write_bh(sbi, &rec->rhdr, &mi->nb, wait); + if (err) + return err; + + if (mi->rno < sbi->mft.recs_mirr) + sbi->flags |= NTFS_FLAGS_MFTMIRR; + + mi->dirty = false; + + return 0; +} + +int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, + __le16 flags, bool is_mft) +{ + int err; + u16 seq = 1; + struct MFT_REC *rec; + u64 vbo = (u64)rno << sbi->record_bits; + + err = mi_init(mi, sbi, rno); + if (err) + return err; + + rec = mi->mrec; + + if (rno == MFT_REC_MFT) { + ; + } else if (rno < MFT_REC_FREE) { + seq = rno; + } else if (rno >= sbi->mft.used) { + ; + } else if (mi_read(mi, is_mft)) { + ; + } else if (rec->rhdr.sign == NTFS_FILE_SIGNATURE) { + /* Record is reused. Update its sequence number. */ + seq = le16_to_cpu(rec->seq) + 1; + if (!seq) + seq = 1; + } + + memcpy(rec, sbi->new_rec, sbi->record_size); + + rec->seq = cpu_to_le16(seq); + rec->flags = RECORD_FLAG_IN_USE | flags; + + mi->dirty = true; + + if (!mi->nb.nbufs) { + struct ntfs_inode *ni = sbi->mft.ni; + bool lock = false; + + if (is_mounted(sbi) && !is_mft) { + down_read(&ni->file.run_lock); + lock = true; + } + + err = ntfs_get_bh(sbi, &ni->file.run, vbo, sbi->record_size, + &mi->nb); + if (lock) + up_read(&ni->file.run_lock); + } + + return err; +} + +/* + * mi_insert_attr - Reserve space for new attribute. + * + * Return: Not full constructed attribute or NULL if not possible to create. + */ +struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, u32 asize, + u16 name_off) +{ + size_t tail; + struct ATTRIB *attr; + __le16 id; + struct MFT_REC *rec = mi->mrec; + struct ntfs_sb_info *sbi = mi->sbi; + u32 used = le32_to_cpu(rec->used); + const u16 *upcase = sbi->upcase; + int diff; + + /* Can we insert mi attribute? */ + if (used + asize > mi->sbi->record_size) + return NULL; + + /* + * Scan through the list of attributes to find the point + * at which we should insert it. + */ + attr = NULL; + while ((attr = mi_enum_attr(mi, attr))) { + diff = compare_attr(attr, type, name, name_len, upcase); + + if (diff < 0) + continue; + + if (!diff && !is_attr_indexed(attr)) + return NULL; + break; + } + + if (!attr) { + tail = 8; /* Not used, just to suppress warning. */ + attr = Add2Ptr(rec, used - 8); + } else { + tail = used - PtrOffset(rec, attr); + } + + id = mi_new_attt_id(mi); + + memmove(Add2Ptr(attr, asize), attr, tail); + memset(attr, 0, asize); + + attr->type = type; + attr->size = cpu_to_le32(asize); + attr->name_len = name_len; + attr->name_off = cpu_to_le16(name_off); + attr->id = id; + + memmove(Add2Ptr(attr, name_off), name, name_len * sizeof(short)); + rec->used = cpu_to_le32(used + asize); + + mi->dirty = true; + + return attr; +} + +/* + * mi_remove_attr - Remove the attribute from record. + * + * NOTE: The source attr will point to next attribute. + */ +bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr) +{ + struct MFT_REC *rec = mi->mrec; + u32 aoff = PtrOffset(rec, attr); + u32 used = le32_to_cpu(rec->used); + u32 asize = le32_to_cpu(attr->size); + + if (aoff + asize > used) + return false; + + if (ni && is_attr_indexed(attr)) { + le16_add_cpu(&ni->mi.mrec->hard_links, -1); + ni->mi.dirty = true; + } + + used -= asize; + memmove(attr, Add2Ptr(attr, asize), used - aoff); + rec->used = cpu_to_le32(used); + mi->dirty = true; + + return true; +} + +/* bytes = "new attribute size" - "old attribute size" */ +bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes) +{ + struct MFT_REC *rec = mi->mrec; + u32 aoff = PtrOffset(rec, attr); + u32 total, used = le32_to_cpu(rec->used); + u32 nsize, asize = le32_to_cpu(attr->size); + u32 rsize = le32_to_cpu(attr->res.data_size); + int tail = (int)(used - aoff - asize); + int dsize; + char *next; + + if (tail < 0 || aoff >= used) + return false; + + if (!bytes) + return true; + + total = le32_to_cpu(rec->total); + next = Add2Ptr(attr, asize); + + if (bytes > 0) { + dsize = ALIGN(bytes, 8); + if (used + dsize > total) + return false; + nsize = asize + dsize; + /* Move tail */ + memmove(next + dsize, next, tail); + memset(next, 0, dsize); + used += dsize; + rsize += dsize; + } else { + dsize = ALIGN(-bytes, 8); + if (dsize > asize) + return false; + nsize = asize - dsize; + memmove(next - dsize, next, tail); + used -= dsize; + rsize -= dsize; + } + + rec->used = cpu_to_le32(used); + attr->size = cpu_to_le32(nsize); + if (!attr->non_res) + attr->res.data_size = cpu_to_le32(rsize); + mi->dirty = true; + + return true; +} + +int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr, + struct runs_tree *run, CLST len) +{ + int err = 0; + struct ntfs_sb_info *sbi = mi->sbi; + u32 new_run_size; + CLST plen; + struct MFT_REC *rec = mi->mrec; + CLST svcn = le64_to_cpu(attr->nres.svcn); + u32 used = le32_to_cpu(rec->used); + u32 aoff = PtrOffset(rec, attr); + u32 asize = le32_to_cpu(attr->size); + char *next = Add2Ptr(attr, asize); + u16 run_off = le16_to_cpu(attr->nres.run_off); + u32 run_size = asize - run_off; + u32 tail = used - aoff - asize; + u32 dsize = sbi->record_size - used; + + /* Make a maximum gap in current record. */ + memmove(next + dsize, next, tail); + + /* Pack as much as possible. */ + err = run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size + dsize, + &plen); + if (err < 0) { + memmove(next, next + dsize, tail); + return err; + } + + new_run_size = ALIGN(err, 8); + + memmove(next + new_run_size - run_size, next + dsize, tail); + + attr->size = cpu_to_le32(asize + new_run_size - run_size); + attr->nres.evcn = cpu_to_le64(svcn + plen - 1); + rec->used = cpu_to_le32(used + new_run_size - run_size); + mi->dirty = true; + + return 0; +} diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c new file mode 100644 index 000000000..12d8682f3 --- /dev/null +++ b/fs/ntfs3/run.c @@ -0,0 +1,1186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * TODO: try to use extents tree (instead of array) + */ + +#include <linux/blkdev.h> +#include <linux/fs.h> +#include <linux/log2.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* runs_tree is a continues memory. Try to avoid big size. */ +#define NTFS3_RUN_MAX_BYTES 0x10000 + +struct ntfs_run { + CLST vcn; /* Virtual cluster number. */ + CLST len; /* Length in clusters. */ + CLST lcn; /* Logical cluster number. */ +}; + +/* + * run_lookup - Lookup the index of a MCB entry that is first <= vcn. + * + * Case of success it will return non-zero value and set + * @index parameter to index of entry been found. + * Case of entry missing from list 'index' will be set to + * point to insertion position for the entry question. + */ +static bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index) +{ + size_t min_idx, max_idx, mid_idx; + struct ntfs_run *r; + + if (!run->count) { + *index = 0; + return false; + } + + min_idx = 0; + max_idx = run->count - 1; + + /* Check boundary cases specially, 'cause they cover the often requests. */ + r = run->runs; + if (vcn < r->vcn) { + *index = 0; + return false; + } + + if (vcn < r->vcn + r->len) { + *index = 0; + return true; + } + + r += max_idx; + if (vcn >= r->vcn + r->len) { + *index = run->count; + return false; + } + + if (vcn >= r->vcn) { + *index = max_idx; + return true; + } + + do { + mid_idx = min_idx + ((max_idx - min_idx) >> 1); + r = run->runs + mid_idx; + + if (vcn < r->vcn) { + max_idx = mid_idx - 1; + if (!mid_idx) + break; + } else if (vcn >= r->vcn + r->len) { + min_idx = mid_idx + 1; + } else { + *index = mid_idx; + return true; + } + } while (min_idx <= max_idx); + + *index = max_idx + 1; + return false; +} + +/* + * run_consolidate - Consolidate runs starting from a given one. + */ +static void run_consolidate(struct runs_tree *run, size_t index) +{ + size_t i; + struct ntfs_run *r = run->runs + index; + + while (index + 1 < run->count) { + /* + * I should merge current run with next + * if start of the next run lies inside one being tested. + */ + struct ntfs_run *n = r + 1; + CLST end = r->vcn + r->len; + CLST dl; + + /* Stop if runs are not aligned one to another. */ + if (n->vcn > end) + break; + + dl = end - n->vcn; + + /* + * If range at index overlaps with next one + * then I will either adjust it's start position + * or (if completely matches) dust remove one from the list. + */ + if (dl > 0) { + if (n->len <= dl) + goto remove_next_range; + + n->len -= dl; + n->vcn += dl; + if (n->lcn != SPARSE_LCN) + n->lcn += dl; + dl = 0; + } + + /* + * Stop if sparse mode does not match + * both current and next runs. + */ + if ((n->lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) { + index += 1; + r = n; + continue; + } + + /* + * Check if volume block + * of a next run lcn does not match + * last volume block of the current run. + */ + if (n->lcn != SPARSE_LCN && n->lcn != r->lcn + r->len) + break; + + /* + * Next and current are siblings. + * Eat/join. + */ + r->len += n->len - dl; + +remove_next_range: + i = run->count - (index + 1); + if (i > 1) + memmove(n, n + 1, sizeof(*n) * (i - 1)); + + run->count -= 1; + } +} + +/* + * run_is_mapped_full + * + * Return: True if range [svcn - evcn] is mapped. + */ +bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn) +{ + size_t i; + const struct ntfs_run *r, *end; + CLST next_vcn; + + if (!run_lookup(run, svcn, &i)) + return false; + + end = run->runs + run->count; + r = run->runs + i; + + for (;;) { + next_vcn = r->vcn + r->len; + if (next_vcn > evcn) + return true; + + if (++r >= end) + return false; + + if (r->vcn != next_vcn) + return false; + } +} + +bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn, + CLST *len, size_t *index) +{ + size_t idx; + CLST gap; + struct ntfs_run *r; + + /* Fail immediately if nrun was not touched yet. */ + if (!run->runs) + return false; + + if (!run_lookup(run, vcn, &idx)) + return false; + + r = run->runs + idx; + + if (vcn >= r->vcn + r->len) + return false; + + gap = vcn - r->vcn; + if (r->len <= gap) + return false; + + *lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + gap); + + if (len) + *len = r->len - gap; + if (index) + *index = idx; + + return true; +} + +/* + * run_truncate_head - Decommit the range before vcn. + */ +void run_truncate_head(struct runs_tree *run, CLST vcn) +{ + size_t index; + struct ntfs_run *r; + + if (run_lookup(run, vcn, &index)) { + r = run->runs + index; + + if (vcn > r->vcn) { + CLST dlen = vcn - r->vcn; + + r->vcn = vcn; + r->len -= dlen; + if (r->lcn != SPARSE_LCN) + r->lcn += dlen; + } + + if (!index) + return; + } + r = run->runs; + memmove(r, r + index, sizeof(*r) * (run->count - index)); + + run->count -= index; + + if (!run->count) { + kvfree(run->runs); + run->runs = NULL; + run->allocated = 0; + } +} + +/* + * run_truncate - Decommit the range after vcn. + */ +void run_truncate(struct runs_tree *run, CLST vcn) +{ + size_t index; + + /* + * If I hit the range then + * I have to truncate one. + * If range to be truncated is becoming empty + * then it will entirely be removed. + */ + if (run_lookup(run, vcn, &index)) { + struct ntfs_run *r = run->runs + index; + + r->len = vcn - r->vcn; + + if (r->len > 0) + index += 1; + } + + /* + * At this point 'index' is set to position that + * should be thrown away (including index itself) + * Simple one - just set the limit. + */ + run->count = index; + + /* Do not reallocate array 'runs'. Only free if possible. */ + if (!index) { + kvfree(run->runs); + run->runs = NULL; + run->allocated = 0; + } +} + +/* + * run_truncate_around - Trim head and tail if necessary. + */ +void run_truncate_around(struct runs_tree *run, CLST vcn) +{ + run_truncate_head(run, vcn); + + if (run->count >= NTFS3_RUN_MAX_BYTES / sizeof(struct ntfs_run) / 2) + run_truncate(run, (run->runs + (run->count >> 1))->vcn); +} + +/* + * run_add_entry + * + * Sets location to known state. + * Run to be added may overlap with existing location. + * + * Return: false if of memory. + */ +bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len, + bool is_mft) +{ + size_t used, index; + struct ntfs_run *r; + bool inrange; + CLST tail_vcn = 0, tail_len = 0, tail_lcn = 0; + bool should_add_tail = false; + + /* + * Lookup the insertion point. + * + * Execute bsearch for the entry containing + * start position question. + */ + inrange = run_lookup(run, vcn, &index); + + /* + * Shortcut here would be case of + * range not been found but one been added + * continues previous run. + * This case I can directly make use of + * existing range as my start point. + */ + if (!inrange && index > 0) { + struct ntfs_run *t = run->runs + index - 1; + + if (t->vcn + t->len == vcn && + (t->lcn == SPARSE_LCN) == (lcn == SPARSE_LCN) && + (lcn == SPARSE_LCN || lcn == t->lcn + t->len)) { + inrange = true; + index -= 1; + } + } + + /* + * At this point 'index' either points to the range + * containing start position or to the insertion position + * for a new range. + * So first let's check if range I'm probing is here already. + */ + if (!inrange) { +requires_new_range: + /* + * Range was not found. + * Insert at position 'index' + */ + used = run->count * sizeof(struct ntfs_run); + + /* + * Check allocated space. + * If one is not enough to get one more entry + * then it will be reallocated. + */ + if (run->allocated < used + sizeof(struct ntfs_run)) { + size_t bytes; + struct ntfs_run *new_ptr; + + /* Use power of 2 for 'bytes'. */ + if (!used) { + bytes = 64; + } else if (used <= 16 * PAGE_SIZE) { + if (is_power_of_2(run->allocated)) + bytes = run->allocated << 1; + else + bytes = (size_t)1 + << (2 + blksize_bits(used)); + } else { + bytes = run->allocated + (16 * PAGE_SIZE); + } + + WARN_ON(!is_mft && bytes > NTFS3_RUN_MAX_BYTES); + + new_ptr = kvmalloc(bytes, GFP_KERNEL); + + if (!new_ptr) + return false; + + r = new_ptr + index; + memcpy(new_ptr, run->runs, + index * sizeof(struct ntfs_run)); + memcpy(r + 1, run->runs + index, + sizeof(struct ntfs_run) * (run->count - index)); + + kvfree(run->runs); + run->runs = new_ptr; + run->allocated = bytes; + + } else { + size_t i = run->count - index; + + r = run->runs + index; + + /* memmove appears to be a bottle neck here... */ + if (i > 0) + memmove(r + 1, r, sizeof(struct ntfs_run) * i); + } + + r->vcn = vcn; + r->lcn = lcn; + r->len = len; + run->count += 1; + } else { + r = run->runs + index; + + /* + * If one of ranges was not allocated then we + * have to split location we just matched and + * insert current one. + * A common case this requires tail to be reinserted + * a recursive call. + */ + if (((lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) || + (lcn != SPARSE_LCN && lcn != r->lcn + (vcn - r->vcn))) { + CLST to_eat = vcn - r->vcn; + CLST Tovcn = to_eat + len; + + should_add_tail = Tovcn < r->len; + + if (should_add_tail) { + tail_lcn = r->lcn == SPARSE_LCN + ? SPARSE_LCN + : (r->lcn + Tovcn); + tail_vcn = r->vcn + Tovcn; + tail_len = r->len - Tovcn; + } + + if (to_eat > 0) { + r->len = to_eat; + inrange = false; + index += 1; + goto requires_new_range; + } + + /* lcn should match one were going to add. */ + r->lcn = lcn; + } + + /* + * If existing range fits then were done. + * Otherwise extend found one and fall back to range jocode. + */ + if (r->vcn + r->len < vcn + len) + r->len += len - ((r->vcn + r->len) - vcn); + } + + /* + * And normalize it starting from insertion point. + * It's possible that no insertion needed case if + * start point lies within the range of an entry + * that 'index' points to. + */ + if (inrange && index > 0) + index -= 1; + run_consolidate(run, index); + run_consolidate(run, index + 1); + + /* + * A special case. + * We have to add extra range a tail. + */ + if (should_add_tail && + !run_add_entry(run, tail_vcn, tail_lcn, tail_len, is_mft)) + return false; + + return true; +} + +/* run_collapse_range + * + * Helper for attr_collapse_range(), + * which is helper for fallocate(collapse_range). + */ +bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len) +{ + size_t index, eat; + struct ntfs_run *r, *e, *eat_start, *eat_end; + CLST end; + + if (WARN_ON(!run_lookup(run, vcn, &index))) + return true; /* Should never be here. */ + + e = run->runs + run->count; + r = run->runs + index; + end = vcn + len; + + if (vcn > r->vcn) { + if (r->vcn + r->len <= end) { + /* Collapse tail of run .*/ + r->len = vcn - r->vcn; + } else if (r->lcn == SPARSE_LCN) { + /* Collapse a middle part of sparsed run. */ + r->len -= len; + } else { + /* Collapse a middle part of normal run, split. */ + if (!run_add_entry(run, vcn, SPARSE_LCN, len, false)) + return false; + return run_collapse_range(run, vcn, len); + } + + r += 1; + } + + eat_start = r; + eat_end = r; + + for (; r < e; r++) { + CLST d; + + if (r->vcn >= end) { + r->vcn -= len; + continue; + } + + if (r->vcn + r->len <= end) { + /* Eat this run. */ + eat_end = r + 1; + continue; + } + + d = end - r->vcn; + if (r->lcn != SPARSE_LCN) + r->lcn += d; + r->len -= d; + r->vcn -= len - d; + } + + eat = eat_end - eat_start; + memmove(eat_start, eat_end, (e - eat_end) * sizeof(*r)); + run->count -= eat; + + return true; +} + +/* run_insert_range + * + * Helper for attr_insert_range(), + * which is helper for fallocate(insert_range). + */ +bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len) +{ + size_t index; + struct ntfs_run *r, *e; + + if (WARN_ON(!run_lookup(run, vcn, &index))) + return false; /* Should never be here. */ + + e = run->runs + run->count; + r = run->runs + index; + + if (vcn > r->vcn) + r += 1; + + for (; r < e; r++) + r->vcn += len; + + r = run->runs + index; + + if (vcn > r->vcn) { + /* split fragment. */ + CLST len1 = vcn - r->vcn; + CLST len2 = r->len - len1; + CLST lcn2 = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len1); + + r->len = len1; + + if (!run_add_entry(run, vcn + len, lcn2, len2, false)) + return false; + } + + if (!run_add_entry(run, vcn, SPARSE_LCN, len, false)) + return false; + + return true; +} + +/* + * run_get_entry - Return index-th mapped region. + */ +bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn, + CLST *lcn, CLST *len) +{ + const struct ntfs_run *r; + + if (index >= run->count) + return false; + + r = run->runs + index; + + if (!r->len) + return false; + + if (vcn) + *vcn = r->vcn; + if (lcn) + *lcn = r->lcn; + if (len) + *len = r->len; + return true; +} + +/* + * run_packed_size - Calculate the size of packed int64. + */ +#ifdef __BIG_ENDIAN +static inline int run_packed_size(const s64 n) +{ + const u8 *p = (const u8 *)&n + sizeof(n) - 1; + + if (n >= 0) { + if (p[-7] || p[-6] || p[-5] || p[-4]) + p -= 4; + if (p[-3] || p[-2]) + p -= 2; + if (p[-1]) + p -= 1; + if (p[0] & 0x80) + p -= 1; + } else { + if (p[-7] != 0xff || p[-6] != 0xff || p[-5] != 0xff || + p[-4] != 0xff) + p -= 4; + if (p[-3] != 0xff || p[-2] != 0xff) + p -= 2; + if (p[-1] != 0xff) + p -= 1; + if (!(p[0] & 0x80)) + p -= 1; + } + return (const u8 *)&n + sizeof(n) - p; +} + +/* Full trusted function. It does not check 'size' for errors. */ +static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v) +{ + const u8 *p = (u8 *)&v; + + switch (size) { + case 8: + run_buf[7] = p[0]; + fallthrough; + case 7: + run_buf[6] = p[1]; + fallthrough; + case 6: + run_buf[5] = p[2]; + fallthrough; + case 5: + run_buf[4] = p[3]; + fallthrough; + case 4: + run_buf[3] = p[4]; + fallthrough; + case 3: + run_buf[2] = p[5]; + fallthrough; + case 2: + run_buf[1] = p[6]; + fallthrough; + case 1: + run_buf[0] = p[7]; + } +} + +/* Full trusted function. It does not check 'size' for errors. */ +static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v) +{ + u8 *p = (u8 *)&v; + + switch (size) { + case 8: + p[0] = run_buf[7]; + fallthrough; + case 7: + p[1] = run_buf[6]; + fallthrough; + case 6: + p[2] = run_buf[5]; + fallthrough; + case 5: + p[3] = run_buf[4]; + fallthrough; + case 4: + p[4] = run_buf[3]; + fallthrough; + case 3: + p[5] = run_buf[2]; + fallthrough; + case 2: + p[6] = run_buf[1]; + fallthrough; + case 1: + p[7] = run_buf[0]; + } + return v; +} + +#else + +static inline int run_packed_size(const s64 n) +{ + const u8 *p = (const u8 *)&n; + + if (n >= 0) { + if (p[7] || p[6] || p[5] || p[4]) + p += 4; + if (p[3] || p[2]) + p += 2; + if (p[1]) + p += 1; + if (p[0] & 0x80) + p += 1; + } else { + if (p[7] != 0xff || p[6] != 0xff || p[5] != 0xff || + p[4] != 0xff) + p += 4; + if (p[3] != 0xff || p[2] != 0xff) + p += 2; + if (p[1] != 0xff) + p += 1; + if (!(p[0] & 0x80)) + p += 1; + } + + return 1 + p - (const u8 *)&n; +} + +/* Full trusted function. It does not check 'size' for errors. */ +static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v) +{ + const u8 *p = (u8 *)&v; + + /* memcpy( run_buf, &v, size); Is it faster? */ + switch (size) { + case 8: + run_buf[7] = p[7]; + fallthrough; + case 7: + run_buf[6] = p[6]; + fallthrough; + case 6: + run_buf[5] = p[5]; + fallthrough; + case 5: + run_buf[4] = p[4]; + fallthrough; + case 4: + run_buf[3] = p[3]; + fallthrough; + case 3: + run_buf[2] = p[2]; + fallthrough; + case 2: + run_buf[1] = p[1]; + fallthrough; + case 1: + run_buf[0] = p[0]; + } +} + +/* full trusted function. It does not check 'size' for errors */ +static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v) +{ + u8 *p = (u8 *)&v; + + /* memcpy( &v, run_buf, size); Is it faster? */ + switch (size) { + case 8: + p[7] = run_buf[7]; + fallthrough; + case 7: + p[6] = run_buf[6]; + fallthrough; + case 6: + p[5] = run_buf[5]; + fallthrough; + case 5: + p[4] = run_buf[4]; + fallthrough; + case 4: + p[3] = run_buf[3]; + fallthrough; + case 3: + p[2] = run_buf[2]; + fallthrough; + case 2: + p[1] = run_buf[1]; + fallthrough; + case 1: + p[0] = run_buf[0]; + } + return v; +} +#endif + +/* + * run_pack - Pack runs into buffer. + * + * packed_vcns - How much runs we have packed. + * packed_size - How much bytes we have used run_buf. + */ +int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, + u32 run_buf_size, CLST *packed_vcns) +{ + CLST next_vcn, vcn, lcn; + CLST prev_lcn = 0; + CLST evcn1 = svcn + len; + const struct ntfs_run *r, *r_end; + int packed_size = 0; + size_t i; + s64 dlcn; + int offset_size, size_size, tmp; + + *packed_vcns = 0; + + if (!len) + goto out; + + /* Check all required entries [svcn, encv1) available. */ + if (!run_lookup(run, svcn, &i)) + return -ENOENT; + + r_end = run->runs + run->count; + r = run->runs + i; + + for (next_vcn = r->vcn + r->len; next_vcn < evcn1; + next_vcn = r->vcn + r->len) { + if (++r >= r_end || r->vcn != next_vcn) + return -ENOENT; + } + + /* Repeat cycle above and pack runs. Assume no errors. */ + r = run->runs + i; + len = svcn - r->vcn; + vcn = svcn; + lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len); + len = r->len - len; + + for (;;) { + next_vcn = vcn + len; + if (next_vcn > evcn1) + len = evcn1 - vcn; + + /* How much bytes required to pack len. */ + size_size = run_packed_size(len); + + /* offset_size - How much bytes is packed dlcn. */ + if (lcn == SPARSE_LCN) { + offset_size = 0; + dlcn = 0; + } else { + /* NOTE: lcn can be less than prev_lcn! */ + dlcn = (s64)lcn - prev_lcn; + offset_size = run_packed_size(dlcn); + prev_lcn = lcn; + } + + tmp = run_buf_size - packed_size - 2 - offset_size; + if (tmp <= 0) + goto out; + + /* Can we store this entire run. */ + if (tmp < size_size) + goto out; + + if (run_buf) { + /* Pack run header. */ + run_buf[0] = ((u8)(size_size | (offset_size << 4))); + run_buf += 1; + + /* Pack the length of run. */ + run_pack_s64(run_buf, size_size, len); + + run_buf += size_size; + /* Pack the offset from previous LCN. */ + run_pack_s64(run_buf, offset_size, dlcn); + run_buf += offset_size; + } + + packed_size += 1 + offset_size + size_size; + *packed_vcns += len; + + if (packed_size + 1 >= run_buf_size || next_vcn >= evcn1) + goto out; + + r += 1; + vcn = r->vcn; + lcn = r->lcn; + len = r->len; + } + +out: + /* Store last zero. */ + if (run_buf) + run_buf[0] = 0; + + return packed_size + 1; +} + +/* + * run_unpack - Unpack packed runs from @run_buf. + * + * Return: Error if negative, or real used bytes. + */ +int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, + CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, + int run_buf_size) +{ + u64 prev_lcn, vcn64, lcn, next_vcn; + const u8 *run_last, *run_0; + bool is_mft = ino == MFT_REC_MFT; + + if (run_buf_size < 0) + return -EINVAL; + + /* Check for empty. */ + if (evcn + 1 == svcn) + return 0; + + if (evcn < svcn) + return -EINVAL; + + run_0 = run_buf; + run_last = run_buf + run_buf_size; + prev_lcn = 0; + vcn64 = svcn; + + /* Read all runs the chain. */ + /* size_size - How much bytes is packed len. */ + while (run_buf < run_last) { + /* size_size - How much bytes is packed len. */ + u8 size_size = *run_buf & 0xF; + /* offset_size - How much bytes is packed dlcn. */ + u8 offset_size = *run_buf++ >> 4; + u64 len; + + if (!size_size) + break; + + /* + * Unpack runs. + * NOTE: Runs are stored little endian order + * "len" is unsigned value, "dlcn" is signed. + * Large positive number requires to store 5 bytes + * e.g.: 05 FF 7E FF FF 00 00 00 + */ + if (size_size > 8) + return -EINVAL; + + len = run_unpack_s64(run_buf, size_size, 0); + /* Skip size_size. */ + run_buf += size_size; + + if (!len) + return -EINVAL; + + if (!offset_size) + lcn = SPARSE_LCN64; + else if (offset_size <= 8) { + s64 dlcn; + + /* Initial value of dlcn is -1 or 0. */ + dlcn = (run_buf[offset_size - 1] & 0x80) ? (s64)-1 : 0; + dlcn = run_unpack_s64(run_buf, offset_size, dlcn); + /* Skip offset_size. */ + run_buf += offset_size; + + if (!dlcn) + return -EINVAL; + lcn = prev_lcn + dlcn; + prev_lcn = lcn; + } else + return -EINVAL; + + next_vcn = vcn64 + len; + /* Check boundary. */ + if (next_vcn > evcn + 1) + return -EINVAL; + +#ifndef CONFIG_NTFS3_64BIT_CLUSTER + if (next_vcn > 0x100000000ull || (lcn + len) > 0x100000000ull) { + ntfs_err( + sbi->sb, + "This driver is compiled without CONFIG_NTFS3_64BIT_CLUSTER (like windows driver).\n" + "Volume contains 64 bits run: vcn %llx, lcn %llx, len %llx.\n" + "Activate CONFIG_NTFS3_64BIT_CLUSTER to process this case", + vcn64, lcn, len); + return -EOPNOTSUPP; + } +#endif + if (lcn != SPARSE_LCN64 && lcn + len > sbi->used.bitmap.nbits) { + /* LCN range is out of volume. */ + return -EINVAL; + } + + if (!run) + ; /* Called from check_attr(fslog.c) to check run. */ + else if (run == RUN_DEALLOCATE) { + /* + * Called from ni_delete_all to free clusters + * without storing in run. + */ + if (lcn != SPARSE_LCN64) + mark_as_free_ex(sbi, lcn, len, true); + } else if (vcn64 >= vcn) { + if (!run_add_entry(run, vcn64, lcn, len, is_mft)) + return -ENOMEM; + } else if (next_vcn > vcn) { + u64 dlen = vcn - vcn64; + + if (!run_add_entry(run, vcn, lcn + dlen, len - dlen, + is_mft)) + return -ENOMEM; + } + + vcn64 = next_vcn; + } + + if (vcn64 != evcn + 1) { + /* Not expected length of unpacked runs. */ + return -EINVAL; + } + + return run_buf - run_0; +} + +#ifdef NTFS3_CHECK_FREE_CLST +/* + * run_unpack_ex - Unpack packed runs from "run_buf". + * + * Checks unpacked runs to be used in bitmap. + * + * Return: Error if negative, or real used bytes. + */ +int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, + CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, + int run_buf_size) +{ + int ret, err; + CLST next_vcn, lcn, len; + size_t index; + bool ok; + struct wnd_bitmap *wnd; + + ret = run_unpack(run, sbi, ino, svcn, evcn, vcn, run_buf, run_buf_size); + if (ret <= 0) + return ret; + + if (!sbi->used.bitmap.sb || !run || run == RUN_DEALLOCATE) + return ret; + + if (ino == MFT_REC_BADCLUST) + return ret; + + next_vcn = vcn = svcn; + wnd = &sbi->used.bitmap; + + for (ok = run_lookup_entry(run, vcn, &lcn, &len, &index); + next_vcn <= evcn; + ok = run_get_entry(run, ++index, &vcn, &lcn, &len)) { + if (!ok || next_vcn != vcn) + return -EINVAL; + + next_vcn = vcn + len; + + if (lcn == SPARSE_LCN) + continue; + + if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) + continue; + + down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); + /* Check for free blocks. */ + ok = wnd_is_used(wnd, lcn, len); + up_read(&wnd->rw_lock); + if (ok) + continue; + + /* Looks like volume is corrupted. */ + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + + if (down_write_trylock(&wnd->rw_lock)) { + /* Mark all zero bits as used in range [lcn, lcn+len). */ + CLST i, lcn_f = 0, len_f = 0; + + err = 0; + for (i = 0; i < len; i++) { + if (wnd_is_free(wnd, lcn + i, 1)) { + if (!len_f) + lcn_f = lcn + i; + len_f += 1; + } else if (len_f) { + err = wnd_set_used(wnd, lcn_f, len_f); + len_f = 0; + if (err) + break; + } + } + + if (len_f) + err = wnd_set_used(wnd, lcn_f, len_f); + + up_write(&wnd->rw_lock); + if (err) + return err; + } + } + + return ret; +} +#endif + +/* + * run_get_highest_vcn + * + * Return the highest vcn from a mapping pairs array + * it used while replaying log file. + */ +int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn) +{ + u64 vcn64 = vcn; + u8 size_size; + + while ((size_size = *run_buf & 0xF)) { + u8 offset_size = *run_buf++ >> 4; + u64 len; + + if (size_size > 8 || offset_size > 8) + return -EINVAL; + + len = run_unpack_s64(run_buf, size_size, 0); + if (!len) + return -EINVAL; + + run_buf += size_size + offset_size; + vcn64 += len; + +#ifndef CONFIG_NTFS3_64BIT_CLUSTER + if (vcn64 > 0x100000000ull) + return -EINVAL; +#endif + } + + *highest_vcn = vcn64 - 1; + return 0; +} + +/* + * run_clone + * + * Make a copy of run + */ +int run_clone(const struct runs_tree *run, struct runs_tree *new_run) +{ + size_t bytes = run->count * sizeof(struct ntfs_run); + + if (bytes > new_run->allocated) { + struct ntfs_run *new_ptr = kvmalloc(bytes, GFP_KERNEL); + + if (!new_ptr) + return -ENOMEM; + + kvfree(new_run->runs); + new_run->runs = new_ptr; + new_run->allocated = bytes; + } + + memcpy(new_run->runs, run->runs, bytes); + new_run->count = run->count; + return 0; +} diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c new file mode 100644 index 000000000..6066eea3f --- /dev/null +++ b/fs/ntfs3/super.c @@ -0,0 +1,1517 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * + * terminology + * + * cluster - allocation unit - 512,1K,2K,4K,...,2M + * vcn - virtual cluster number - Offset inside the file in clusters. + * vbo - virtual byte offset - Offset inside the file in bytes. + * lcn - logical cluster number - 0 based cluster in clusters heap. + * lbo - logical byte offset - Absolute position inside volume. + * run - maps VCN to LCN - Stored in attributes in packed form. + * attr - attribute segment - std/name/data etc records inside MFT. + * mi - MFT inode - One MFT record(usually 1024 bytes or 4K), consists of attributes. + * ni - NTFS inode - Extends linux inode. consists of one or more mft inodes. + * index - unit inside directory - 2K, 4K, <=page size, does not depend on cluster size. + * + * WSL - Windows Subsystem for Linux + * https://docs.microsoft.com/en-us/windows/wsl/file-permissions + * It stores uid/gid/mode/dev in xattr + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/exportfs.h> +#include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> +#include <linux/log2.h> +#include <linux/minmax.h> +#include <linux/module.h> +#include <linux/nls.h> +#include <linux/seq_file.h> +#include <linux/statfs.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" +#ifdef CONFIG_NTFS3_LZX_XPRESS +#include "lib/lib.h" +#endif + +#ifdef CONFIG_PRINTK +/* + * ntfs_printk - Trace warnings/notices/errors. + * + * Thanks Joe Perches <joe@perches.com> for implementation + */ +void ntfs_printk(const struct super_block *sb, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int level; + struct ntfs_sb_info *sbi = sb->s_fs_info; + + /* Should we use different ratelimits for warnings/notices/errors? */ + if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3")) + return; + + va_start(args, fmt); + + level = printk_get_level(fmt); + vaf.fmt = printk_skip_level(fmt); + vaf.va = &args; + printk("%c%cntfs3: %s: %pV\n", KERN_SOH_ASCII, level, sb->s_id, &vaf); + + va_end(args); +} + +static char s_name_buf[512]; +static atomic_t s_name_buf_cnt = ATOMIC_INIT(1); // 1 means 'free s_name_buf'. + +/* + * ntfs_inode_printk + * + * Print warnings/notices/errors about inode using name or inode number. + */ +void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) +{ + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + char *name; + va_list args; + struct va_format vaf; + int level; + + if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3")) + return; + + /* Use static allocated buffer, if possible. */ + name = atomic_dec_and_test(&s_name_buf_cnt) + ? s_name_buf + : kmalloc(sizeof(s_name_buf), GFP_NOFS); + + if (name) { + struct dentry *de = d_find_alias(inode); + const u32 name_len = ARRAY_SIZE(s_name_buf) - 1; + + if (de) { + spin_lock(&de->d_lock); + snprintf(name, name_len, " \"%s\"", de->d_name.name); + spin_unlock(&de->d_lock); + name[name_len] = 0; /* To be sure. */ + } else { + name[0] = 0; + } + dput(de); /* Cocci warns if placed in branch "if (de)" */ + } + + va_start(args, fmt); + + level = printk_get_level(fmt); + vaf.fmt = printk_skip_level(fmt); + vaf.va = &args; + + printk("%c%cntfs3: %s: ino=%lx,%s %pV\n", KERN_SOH_ASCII, level, + sb->s_id, inode->i_ino, name ? name : "", &vaf); + + va_end(args); + + atomic_inc(&s_name_buf_cnt); + if (name != s_name_buf) + kfree(name); +} +#endif + +/* + * Shared memory struct. + * + * On-disk ntfs's upcase table is created by ntfs formatter. + * 'upcase' table is 128K bytes of memory. + * We should read it into memory when mounting. + * Several ntfs volumes likely use the same 'upcase' table. + * It is good idea to share in-memory 'upcase' table between different volumes. + * Unfortunately winxp/vista/win7 use different upcase tables. + */ +static DEFINE_SPINLOCK(s_shared_lock); + +static struct { + void *ptr; + u32 len; + int cnt; +} s_shared[8]; + +/* + * ntfs_set_shared + * + * Return: + * * @ptr - If pointer was saved in shared memory. + * * NULL - If pointer was not shared. + */ +void *ntfs_set_shared(void *ptr, u32 bytes) +{ + void *ret = NULL; + int i, j = -1; + + spin_lock(&s_shared_lock); + for (i = 0; i < ARRAY_SIZE(s_shared); i++) { + if (!s_shared[i].cnt) { + j = i; + } else if (bytes == s_shared[i].len && + !memcmp(s_shared[i].ptr, ptr, bytes)) { + s_shared[i].cnt += 1; + ret = s_shared[i].ptr; + break; + } + } + + if (!ret && j != -1) { + s_shared[j].ptr = ptr; + s_shared[j].len = bytes; + s_shared[j].cnt = 1; + ret = ptr; + } + spin_unlock(&s_shared_lock); + + return ret; +} + +/* + * ntfs_put_shared + * + * Return: + * * @ptr - If pointer is not shared anymore. + * * NULL - If pointer is still shared. + */ +void *ntfs_put_shared(void *ptr) +{ + void *ret = ptr; + int i; + + spin_lock(&s_shared_lock); + for (i = 0; i < ARRAY_SIZE(s_shared); i++) { + if (s_shared[i].cnt && s_shared[i].ptr == ptr) { + if (--s_shared[i].cnt) + ret = NULL; + break; + } + } + spin_unlock(&s_shared_lock); + + return ret; +} + +static inline void put_mount_options(struct ntfs_mount_options *options) +{ + kfree(options->nls_name); + unload_nls(options->nls); + kfree(options); +} + +enum Opt { + Opt_uid, + Opt_gid, + Opt_umask, + Opt_dmask, + Opt_fmask, + Opt_immutable, + Opt_discard, + Opt_force, + Opt_sparse, + Opt_nohidden, + Opt_showmeta, + Opt_acl, + Opt_iocharset, + Opt_prealloc, + Opt_noacsrules, + Opt_err, +}; + +static const struct fs_parameter_spec ntfs_fs_parameters[] = { + fsparam_u32("uid", Opt_uid), + fsparam_u32("gid", Opt_gid), + fsparam_u32oct("umask", Opt_umask), + fsparam_u32oct("dmask", Opt_dmask), + fsparam_u32oct("fmask", Opt_fmask), + fsparam_flag_no("sys_immutable", Opt_immutable), + fsparam_flag_no("discard", Opt_discard), + fsparam_flag_no("force", Opt_force), + fsparam_flag_no("sparse", Opt_sparse), + fsparam_flag_no("hidden", Opt_nohidden), + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("showmeta", Opt_showmeta), + fsparam_flag_no("prealloc", Opt_prealloc), + fsparam_flag_no("acsrules", Opt_noacsrules), + fsparam_string("iocharset", Opt_iocharset), + {} +}; + +/* + * Load nls table or if @nls is utf8 then return NULL. + */ +static struct nls_table *ntfs_load_nls(char *nls) +{ + struct nls_table *ret; + + if (!nls) + nls = CONFIG_NLS_DEFAULT; + + if (strcmp(nls, "utf8") == 0) + return NULL; + + if (strcmp(nls, CONFIG_NLS_DEFAULT) == 0) + return load_nls_default(); + + ret = load_nls(nls); + if (ret) + return ret; + + return ERR_PTR(-EINVAL); +} + +static int ntfs_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct ntfs_mount_options *opts = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, ntfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->fs_uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(opts->fs_uid)) + return invalf(fc, "ntfs3: Invalid value for uid."); + break; + case Opt_gid: + opts->fs_gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(opts->fs_gid)) + return invalf(fc, "ntfs3: Invalid value for gid."); + break; + case Opt_umask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for umask."); + opts->fs_fmask_inv = ~result.uint_32; + opts->fs_dmask_inv = ~result.uint_32; + opts->fmask = 1; + opts->dmask = 1; + break; + case Opt_dmask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for dmask."); + opts->fs_dmask_inv = ~result.uint_32; + opts->dmask = 1; + break; + case Opt_fmask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for fmask."); + opts->fs_fmask_inv = ~result.uint_32; + opts->fmask = 1; + break; + case Opt_immutable: + opts->sys_immutable = result.negated ? 0 : 1; + break; + case Opt_discard: + opts->discard = result.negated ? 0 : 1; + break; + case Opt_force: + opts->force = result.negated ? 0 : 1; + break; + case Opt_sparse: + opts->sparse = result.negated ? 0 : 1; + break; + case Opt_nohidden: + opts->nohidden = result.negated ? 1 : 0; + break; + case Opt_acl: + if (!result.negated) +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#else + return invalf(fc, "ntfs3: Support for ACL not compiled in!"); +#endif + else + fc->sb_flags &= ~SB_POSIXACL; + break; + case Opt_showmeta: + opts->showmeta = result.negated ? 0 : 1; + break; + case Opt_iocharset: + kfree(opts->nls_name); + opts->nls_name = param->string; + param->string = NULL; + break; + case Opt_prealloc: + opts->prealloc = result.negated ? 0 : 1; + break; + case Opt_noacsrules: + opts->noacsrules = result.negated ? 1 : 0; + break; + default: + /* Should not be here unless we forget add case. */ + return -EINVAL; + } + return 0; +} + +static int ntfs_fs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_mount_options *new_opts = fc->fs_private; + int ro_rw; + + ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY); + if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) { + errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); + return -EINVAL; + } + + new_opts->nls = ntfs_load_nls(new_opts->nls_name); + if (IS_ERR(new_opts->nls)) { + new_opts->nls = NULL; + errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name); + return -EINVAL; + } + if (new_opts->nls != sbi->options->nls) + return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!"); + + sync_filesystem(sb); + + if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) && + !new_opts->force) { + errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!"); + return -EINVAL; + } + + swap(sbi->options, fc->fs_private); + + return 0; +} + +static struct kmem_cache *ntfs_inode_cachep; + +static struct inode *ntfs_alloc_inode(struct super_block *sb) +{ + struct ntfs_inode *ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS); + + if (!ni) + return NULL; + + memset(ni, 0, offsetof(struct ntfs_inode, vfs_inode)); + + mutex_init(&ni->ni_lock); + + return &ni->vfs_inode; +} + +static void ntfs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + struct ntfs_inode *ni = ntfs_i(inode); + + mutex_destroy(&ni->ni_lock); + + kmem_cache_free(ntfs_inode_cachep, ni); +} + +static void ntfs_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, ntfs_i_callback); +} + +static void init_once(void *foo) +{ + struct ntfs_inode *ni = foo; + + inode_init_once(&ni->vfs_inode); +} + +/* + * put_ntfs - Noinline to reduce binary size. + */ +static noinline void put_ntfs(struct ntfs_sb_info *sbi) +{ + kfree(sbi->new_rec); + kvfree(ntfs_put_shared(sbi->upcase)); + kfree(sbi->def_table); + + wnd_close(&sbi->mft.bitmap); + wnd_close(&sbi->used.bitmap); + + if (sbi->mft.ni) + iput(&sbi->mft.ni->vfs_inode); + + if (sbi->security.ni) + iput(&sbi->security.ni->vfs_inode); + + if (sbi->reparse.ni) + iput(&sbi->reparse.ni->vfs_inode); + + if (sbi->objid.ni) + iput(&sbi->objid.ni->vfs_inode); + + if (sbi->volume.ni) + iput(&sbi->volume.ni->vfs_inode); + + ntfs_update_mftmirr(sbi, 0); + + indx_clear(&sbi->security.index_sii); + indx_clear(&sbi->security.index_sdh); + indx_clear(&sbi->reparse.index_r); + indx_clear(&sbi->objid.index_o); + kfree(sbi->compress.lznt); +#ifdef CONFIG_NTFS3_LZX_XPRESS + xpress_free_decompressor(sbi->compress.xpress); + lzx_free_decompressor(sbi->compress.lzx); +#endif + kfree(sbi); +} + +static void ntfs_put_super(struct super_block *sb) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + + /* Mark rw ntfs as clear, if possible. */ + ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); + + put_mount_options(sbi->options); + put_ntfs(sbi); + sb->s_fs_info = NULL; + + sync_blockdev(sb->s_bdev); +} + +static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct wnd_bitmap *wnd = &sbi->used.bitmap; + + buf->f_type = sb->s_magic; + buf->f_bsize = sbi->cluster_size; + buf->f_blocks = wnd->nbits; + + buf->f_bfree = buf->f_bavail = wnd_zeroes(wnd); + buf->f_fsid.val[0] = sbi->volume.ser_num; + buf->f_fsid.val[1] = (sbi->volume.ser_num >> 32); + buf->f_namelen = NTFS_NAME_LEN; + + return 0; +} + +static int ntfs_show_options(struct seq_file *m, struct dentry *root) +{ + struct super_block *sb = root->d_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_mount_options *opts = sbi->options; + struct user_namespace *user_ns = seq_user_ns(m); + + seq_printf(m, ",uid=%u", + from_kuid_munged(user_ns, opts->fs_uid)); + seq_printf(m, ",gid=%u", + from_kgid_munged(user_ns, opts->fs_gid)); + if (opts->fmask) + seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv); + if (opts->dmask) + seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv); + if (opts->nls) + seq_printf(m, ",iocharset=%s", opts->nls->charset); + else + seq_puts(m, ",iocharset=utf8"); + if (opts->sys_immutable) + seq_puts(m, ",sys_immutable"); + if (opts->discard) + seq_puts(m, ",discard"); + if (opts->sparse) + seq_puts(m, ",sparse"); + if (opts->showmeta) + seq_puts(m, ",showmeta"); + if (opts->nohidden) + seq_puts(m, ",nohidden"); + if (opts->force) + seq_puts(m, ",force"); + if (opts->noacsrules) + seq_puts(m, ",noacsrules"); + if (opts->prealloc) + seq_puts(m, ",prealloc"); + if (sb->s_flags & SB_POSIXACL) + seq_puts(m, ",acl"); + + return 0; +} + +/* + * ntfs_sync_fs - super_operations::sync_fs + */ +static int ntfs_sync_fs(struct super_block *sb, int wait) +{ + int err = 0, err2; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_inode *ni; + struct inode *inode; + + ni = sbi->security.ni; + if (ni) { + inode = &ni->vfs_inode; + err2 = _ni_write_inode(inode, wait); + if (err2 && !err) + err = err2; + } + + ni = sbi->objid.ni; + if (ni) { + inode = &ni->vfs_inode; + err2 = _ni_write_inode(inode, wait); + if (err2 && !err) + err = err2; + } + + ni = sbi->reparse.ni; + if (ni) { + inode = &ni->vfs_inode; + err2 = _ni_write_inode(inode, wait); + if (err2 && !err) + err = err2; + } + + if (!err) + ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); + + ntfs_update_mftmirr(sbi, wait); + + return err; +} + +static const struct super_operations ntfs_sops = { + .alloc_inode = ntfs_alloc_inode, + .destroy_inode = ntfs_destroy_inode, + .evict_inode = ntfs_evict_inode, + .put_super = ntfs_put_super, + .statfs = ntfs_statfs, + .show_options = ntfs_show_options, + .sync_fs = ntfs_sync_fs, + .write_inode = ntfs3_write_inode, +}; + +static struct inode *ntfs_export_get_inode(struct super_block *sb, u64 ino, + u32 generation) +{ + struct MFT_REF ref; + struct inode *inode; + + ref.low = cpu_to_le32(ino); +#ifdef CONFIG_NTFS3_64BIT_CLUSTER + ref.high = cpu_to_le16(ino >> 32); +#else + ref.high = 0; +#endif + ref.seq = cpu_to_le16(generation); + + inode = ntfs_iget5(sb, &ref, NULL); + if (!IS_ERR(inode) && is_bad_inode(inode)) { + iput(inode); + inode = ERR_PTR(-ESTALE); + } + + return inode; +} + +static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + ntfs_export_get_inode); +} + +static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + ntfs_export_get_inode); +} + +/* TODO: == ntfs_sync_inode */ +static int ntfs_nfs_commit_metadata(struct inode *inode) +{ + return _ni_write_inode(inode, 1); +} + +static const struct export_operations ntfs_export_ops = { + .fh_to_dentry = ntfs_fh_to_dentry, + .fh_to_parent = ntfs_fh_to_parent, + .get_parent = ntfs3_get_parent, + .commit_metadata = ntfs_nfs_commit_metadata, +}; + +/* + * format_size_gb - Return Gb,Mb to print with "%u.%02u Gb". + */ +static u32 format_size_gb(const u64 bytes, u32 *mb) +{ + /* Do simple right 30 bit shift of 64 bit value. */ + u64 kbytes = bytes >> 10; + u32 kbytes32 = kbytes; + + *mb = (100 * (kbytes32 & 0xfffff) + 0x7ffff) >> 20; + if (*mb >= 100) + *mb = 99; + + return (kbytes32 >> 20) | (((u32)(kbytes >> 32)) << 12); +} + +static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) +{ + if (boot->sectors_per_clusters <= 0x80) + return boot->sectors_per_clusters; + if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ + return 1U << -(s8)boot->sectors_per_clusters; + return -EINVAL; +} + +/* + * ntfs_init_from_boot - Init internal info from on-disk boot sector. + */ +static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, + u64 dev_size) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + int err; + u32 mb, gb, boot_sector_size, sct_per_clst, record_size; + u64 sectors, clusters, mlcn, mlcn2; + struct NTFS_BOOT *boot; + struct buffer_head *bh; + struct MFT_REC *rec; + u16 fn, ao; + + sbi->volume.blocks = dev_size >> PAGE_SHIFT; + + bh = ntfs_bread(sb, 0); + if (!bh) + return -EIO; + + err = -EINVAL; + boot = (struct NTFS_BOOT *)bh->b_data; + + if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) + goto out; + + /* 0x55AA is not mandaroty. Thanks Maxim Suhanov*/ + /*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1]) + * goto out; + */ + + boot_sector_size = (u32)boot->bytes_per_sector[1] << 8; + if (boot->bytes_per_sector[0] || boot_sector_size < SECTOR_SIZE || + !is_power_of_2(boot_sector_size)) { + goto out; + } + + /* cluster size: 512, 1K, 2K, 4K, ... 2M */ + sct_per_clst = true_sectors_per_clst(boot); + if ((int)sct_per_clst < 0) + goto out; + if (!is_power_of_2(sct_per_clst)) + goto out; + + mlcn = le64_to_cpu(boot->mft_clst); + mlcn2 = le64_to_cpu(boot->mft2_clst); + sectors = le64_to_cpu(boot->sectors_per_volume); + + if (mlcn * sct_per_clst >= sectors) + goto out; + + if (mlcn2 * sct_per_clst >= sectors) + goto out; + + /* Check MFT record size. */ + if ((boot->record_size < 0 && + SECTOR_SIZE > (2U << (-boot->record_size))) || + (boot->record_size >= 0 && !is_power_of_2(boot->record_size))) { + goto out; + } + + /* Check index record size. */ + if ((boot->index_size < 0 && + SECTOR_SIZE > (2U << (-boot->index_size))) || + (boot->index_size >= 0 && !is_power_of_2(boot->index_size))) { + goto out; + } + + sbi->volume.size = sectors * boot_sector_size; + + gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb); + + /* + * - Volume formatted and mounted with the same sector size. + * - Volume formatted 4K and mounted as 512. + * - Volume formatted 512 and mounted as 4K. + */ + if (boot_sector_size != sector_size) { + ntfs_warn( + sb, + "Different NTFS' sector size (%u) and media sector size (%u)", + boot_sector_size, sector_size); + dev_size += sector_size - 1; + } + + sbi->cluster_size = boot_sector_size * sct_per_clst; + sbi->cluster_bits = blksize_bits(sbi->cluster_size); + + sbi->mft.lbo = mlcn << sbi->cluster_bits; + sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits; + + /* Compare boot's cluster and sector. */ + if (sbi->cluster_size < boot_sector_size) + goto out; + + /* Compare boot's cluster and media sector. */ + if (sbi->cluster_size < sector_size) { + /* No way to use ntfs_get_block in this case. */ + ntfs_err( + sb, + "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)", + sbi->cluster_size, sector_size); + goto out; + } + + sbi->cluster_mask = sbi->cluster_size - 1; + sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; + sbi->record_size = record_size = boot->record_size < 0 + ? 1 << (-boot->record_size) + : (u32)boot->record_size + << sbi->cluster_bits; + + if (record_size > MAXIMUM_BYTES_PER_MFT || record_size < SECTOR_SIZE) + goto out; + + sbi->record_bits = blksize_bits(record_size); + sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes + + sbi->max_bytes_per_attr = + record_size - ALIGN(MFTRECORD_FIXUP_OFFSET_1, 8) - + ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) - + ALIGN(sizeof(enum ATTR_TYPE), 8); + + sbi->index_size = boot->index_size < 0 + ? 1u << (-boot->index_size) + : (u32)boot->index_size << sbi->cluster_bits; + + sbi->volume.ser_num = le64_to_cpu(boot->serial_num); + + /* Warning if RAW volume. */ + if (dev_size < sbi->volume.size + boot_sector_size) { + u32 mb0, gb0; + + gb0 = format_size_gb(dev_size, &mb0); + ntfs_warn( + sb, + "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only", + gb, mb, gb0, mb0); + sb->s_flags |= SB_RDONLY; + } + + clusters = sbi->volume.size >> sbi->cluster_bits; +#ifndef CONFIG_NTFS3_64BIT_CLUSTER + /* 32 bits per cluster. */ + if (clusters >> 32) { + ntfs_notice( + sb, + "NTFS %u.%02u Gb is too big to use 32 bits per cluster", + gb, mb); + goto out; + } +#elif BITS_PER_LONG < 64 +#error "CONFIG_NTFS3_64BIT_CLUSTER incompatible in 32 bit OS" +#endif + + sbi->used.bitmap.nbits = clusters; + + rec = kzalloc(record_size, GFP_NOFS); + if (!rec) { + err = -ENOMEM; + goto out; + } + + sbi->new_rec = rec; + rec->rhdr.sign = NTFS_FILE_SIGNATURE; + rec->rhdr.fix_off = cpu_to_le16(MFTRECORD_FIXUP_OFFSET_1); + fn = (sbi->record_size >> SECTOR_SHIFT) + 1; + rec->rhdr.fix_num = cpu_to_le16(fn); + ao = ALIGN(MFTRECORD_FIXUP_OFFSET_1 + sizeof(short) * fn, 8); + rec->attr_off = cpu_to_le16(ao); + rec->used = cpu_to_le32(ao + ALIGN(sizeof(enum ATTR_TYPE), 8)); + rec->total = cpu_to_le32(sbi->record_size); + ((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END; + + sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE)); + + sbi->block_mask = sb->s_blocksize - 1; + sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits; + sbi->volume.blocks = sbi->volume.size >> sb->s_blocksize_bits; + + /* Maximum size for normal files. */ + sbi->maxbytes = (clusters << sbi->cluster_bits) - 1; + +#ifdef CONFIG_NTFS3_64BIT_CLUSTER + if (clusters >= (1ull << (64 - sbi->cluster_bits))) + sbi->maxbytes = -1; + sbi->maxbytes_sparse = -1; + sb->s_maxbytes = MAX_LFS_FILESIZE; +#else + /* Maximum size for sparse file. */ + sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1; + sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; +#endif + + /* + * Compute the MFT zone at two steps. + * It would be nice if we are able to allocate 1/8 of + * total clusters for MFT but not more then 512 MB. + */ + sbi->zone_max = min_t(CLST, 0x20000000 >> sbi->cluster_bits, clusters >> 3); + + err = 0; + +out: + brelse(bh); + + return err; +} + +/* + * ntfs_fill_super - Try to mount. + */ +static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) +{ + int err; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct block_device *bdev = sb->s_bdev; + struct inode *inode; + struct ntfs_inode *ni; + size_t i, tt; + CLST vcn, lcn, len; + struct ATTRIB *attr; + const struct VOLUME_INFO *info; + u32 idx, done, bytes; + struct ATTR_DEF_ENTRY *t; + u16 *shared; + struct MFT_REF ref; + + ref.high = 0; + + sbi->sb = sb; + sbi->options = fc->fs_private; + fc->fs_private = NULL; + sb->s_flags |= SB_NODIRATIME; + sb->s_magic = 0x7366746e; // "ntfs" + sb->s_op = &ntfs_sops; + sb->s_export_op = &ntfs_export_ops; + sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec + sb->s_xattr = ntfs_xattr_handlers; + + sbi->options->nls = ntfs_load_nls(sbi->options->nls_name); + if (IS_ERR(sbi->options->nls)) { + sbi->options->nls = NULL; + errorf(fc, "Cannot load nls %s", sbi->options->nls_name); + err = -EINVAL; + goto out; + } + + if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) { + sbi->discard_granularity = bdev_discard_granularity(bdev); + sbi->discard_granularity_mask_inv = + ~(u64)(sbi->discard_granularity - 1); + } + + /* Parse boot. */ + err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev), + bdev_nr_bytes(bdev)); + if (err) + goto out; + + /* + * Load $Volume. This should be done before $LogFile + * 'cause 'sbi->volume.ni' is used 'ntfs_set_state'. + */ + ref.low = cpu_to_le32(MFT_REC_VOL); + ref.seq = cpu_to_le16(MFT_REC_VOL); + inode = ntfs_iget5(sb, &ref, &NAME_VOLUME); + if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $Volume."); + err = PTR_ERR(inode); + goto out; + } + + ni = ntfs_i(inode); + + /* Load and save label (not necessary). */ + attr = ni_find_attr(ni, NULL, NULL, ATTR_LABEL, NULL, 0, NULL, NULL); + + if (!attr) { + /* It is ok if no ATTR_LABEL */ + } else if (!attr->non_res && !is_attr_ext(attr)) { + /* $AttrDef allows labels to be up to 128 symbols. */ + err = utf16s_to_utf8s(resident_data(attr), + le32_to_cpu(attr->res.data_size) >> 1, + UTF16_LITTLE_ENDIAN, sbi->volume.label, + sizeof(sbi->volume.label)); + if (err < 0) + sbi->volume.label[0] = 0; + } else { + /* Should we break mounting here? */ + //err = -EINVAL; + //goto put_inode_out; + } + + attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL); + if (!attr || is_attr_ext(attr)) { + err = -EINVAL; + goto put_inode_out; + } + + info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO); + if (!info) { + err = -EINVAL; + goto put_inode_out; + } + + sbi->volume.major_ver = info->major_ver; + sbi->volume.minor_ver = info->minor_ver; + sbi->volume.flags = info->flags; + sbi->volume.ni = ni; + + /* Load $MFTMirr to estimate recs_mirr. */ + ref.low = cpu_to_le32(MFT_REC_MIRR); + ref.seq = cpu_to_le16(MFT_REC_MIRR); + inode = ntfs_iget5(sb, &ref, &NAME_MIRROR); + if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $MFTMirr."); + err = PTR_ERR(inode); + goto out; + } + + sbi->mft.recs_mirr = + ntfs_up_cluster(sbi, inode->i_size) >> sbi->record_bits; + + iput(inode); + + /* Load LogFile to replay. */ + ref.low = cpu_to_le32(MFT_REC_LOG); + ref.seq = cpu_to_le16(MFT_REC_LOG); + inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE); + if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load \x24LogFile."); + err = PTR_ERR(inode); + goto out; + } + + ni = ntfs_i(inode); + + err = ntfs_loadlog_and_replay(ni, sbi); + if (err) + goto put_inode_out; + + iput(inode); + + if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) { + if (!sb_rdonly(sb)) { + ntfs_warn(sb, + "failed to replay log file. Can't mount rw!"); + err = -EINVAL; + goto out; + } + } else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) { + if (!sb_rdonly(sb) && !sbi->options->force) { + ntfs_warn( + sb, + "volume is dirty and \"force\" flag is not set!"); + err = -EINVAL; + goto out; + } + } + + /* Load $MFT. */ + ref.low = cpu_to_le32(MFT_REC_MFT); + ref.seq = cpu_to_le16(1); + + inode = ntfs_iget5(sb, &ref, &NAME_MFT); + if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $MFT."); + err = PTR_ERR(inode); + goto out; + } + + ni = ntfs_i(inode); + + sbi->mft.used = ni->i_valid >> sbi->record_bits; + tt = inode->i_size >> sbi->record_bits; + sbi->mft.next_free = MFT_REC_USER; + + err = wnd_init(&sbi->mft.bitmap, sb, tt); + if (err) + goto put_inode_out; + + err = ni_load_all_mi(ni); + if (err) + goto put_inode_out; + + sbi->mft.ni = ni; + + /* Load $BadClus. */ + ref.low = cpu_to_le32(MFT_REC_BADCLUST); + ref.seq = cpu_to_le16(MFT_REC_BADCLUST); + inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS); + if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $BadClus."); + err = PTR_ERR(inode); + goto out; + } + + ni = ntfs_i(inode); + + for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) { + if (lcn == SPARSE_LCN) + continue; + + if (!sbi->bad_clusters) + ntfs_notice(sb, "Volume contains bad blocks"); + + sbi->bad_clusters += len; + } + + iput(inode); + + /* Load $Bitmap. */ + ref.low = cpu_to_le32(MFT_REC_BITMAP); + ref.seq = cpu_to_le16(MFT_REC_BITMAP); + inode = ntfs_iget5(sb, &ref, &NAME_BITMAP); + if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $Bitmap."); + err = PTR_ERR(inode); + goto out; + } + +#ifndef CONFIG_NTFS3_64BIT_CLUSTER + if (inode->i_size >> 32) { + err = -EINVAL; + goto put_inode_out; + } +#endif + + /* Check bitmap boundary. */ + tt = sbi->used.bitmap.nbits; + if (inode->i_size < bitmap_size(tt)) { + err = -EINVAL; + goto put_inode_out; + } + + /* Not necessary. */ + sbi->used.bitmap.set_tail = true; + err = wnd_init(&sbi->used.bitmap, sb, tt); + if (err) + goto put_inode_out; + + iput(inode); + + /* Compute the MFT zone. */ + err = ntfs_refresh_zone(sbi); + if (err) + goto out; + + /* Load $AttrDef. */ + ref.low = cpu_to_le32(MFT_REC_ATTR); + ref.seq = cpu_to_le16(MFT_REC_ATTR); + inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF); + if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $AttrDef -> %d", err); + err = PTR_ERR(inode); + goto out; + } + + if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) { + err = -EINVAL; + goto put_inode_out; + } + bytes = inode->i_size; + sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL); + if (!t) { + err = -ENOMEM; + goto put_inode_out; + } + + for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) { + unsigned long tail = bytes - done; + struct page *page = ntfs_map_page(inode->i_mapping, idx); + + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto put_inode_out; + } + memcpy(Add2Ptr(t, done), page_address(page), + min(PAGE_SIZE, tail)); + ntfs_unmap_page(page); + + if (!idx && ATTR_STD != t->type) { + err = -EINVAL; + goto put_inode_out; + } + } + + t += 1; + sbi->def_entries = 1; + done = sizeof(struct ATTR_DEF_ENTRY); + sbi->reparse.max_size = MAXIMUM_REPARSE_DATA_BUFFER_SIZE; + sbi->ea_max_size = 0x10000; /* default formatter value */ + + while (done + sizeof(struct ATTR_DEF_ENTRY) <= bytes) { + u32 t32 = le32_to_cpu(t->type); + u64 sz = le64_to_cpu(t->max_sz); + + if ((t32 & 0xF) || le32_to_cpu(t[-1].type) >= t32) + break; + + if (t->type == ATTR_REPARSE) + sbi->reparse.max_size = sz; + else if (t->type == ATTR_EA) + sbi->ea_max_size = sz; + + done += sizeof(struct ATTR_DEF_ENTRY); + t += 1; + sbi->def_entries += 1; + } + iput(inode); + + /* Load $UpCase. */ + ref.low = cpu_to_le32(MFT_REC_UPCASE); + ref.seq = cpu_to_le16(MFT_REC_UPCASE); + inode = ntfs_iget5(sb, &ref, &NAME_UPCASE); + if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $UpCase."); + err = PTR_ERR(inode); + goto out; + } + + if (inode->i_size != 0x10000 * sizeof(short)) { + err = -EINVAL; + goto put_inode_out; + } + + for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) { + const __le16 *src; + u16 *dst = Add2Ptr(sbi->upcase, idx << PAGE_SHIFT); + struct page *page = ntfs_map_page(inode->i_mapping, idx); + + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto put_inode_out; + } + + src = page_address(page); + +#ifdef __BIG_ENDIAN + for (i = 0; i < PAGE_SIZE / sizeof(u16); i++) + *dst++ = le16_to_cpu(*src++); +#else + memcpy(dst, src, PAGE_SIZE); +#endif + ntfs_unmap_page(page); + } + + shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short)); + if (shared && sbi->upcase != shared) { + kvfree(sbi->upcase); + sbi->upcase = shared; + } + + iput(inode); + + if (is_ntfs3(sbi)) { + /* Load $Secure. */ + err = ntfs_security_init(sbi); + if (err) + goto out; + + /* Load $Extend. */ + err = ntfs_extend_init(sbi); + if (err) + goto load_root; + + /* Load $Extend\$Reparse. */ + err = ntfs_reparse_init(sbi); + if (err) + goto load_root; + + /* Load $Extend\$ObjId. */ + err = ntfs_objid_init(sbi); + if (err) + goto load_root; + } + +load_root: + /* Load root. */ + ref.low = cpu_to_le32(MFT_REC_ROOT); + ref.seq = cpu_to_le16(MFT_REC_ROOT); + inode = ntfs_iget5(sb, &ref, &NAME_ROOT); + if (IS_ERR(inode) || !inode->i_op) { + ntfs_err(sb, "Failed to load root."); + err = IS_ERR(inode) ? PTR_ERR(inode) : -EINVAL; + goto out; + } + + sb->s_root = d_make_root(inode); + if (!sb->s_root) { + err = -ENOMEM; + goto put_inode_out; + } + + return 0; + +put_inode_out: + iput(inode); +out: + /* + * Free resources here. + * ntfs_fs_free will be called with fc->s_fs_info = NULL + */ + put_mount_options(sbi->options); + put_ntfs(sbi); + sb->s_fs_info = NULL; + + return err; +} + +void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct block_device *bdev = sb->s_bdev; + sector_t devblock = (u64)lcn * sbi->blocks_per_cluster; + unsigned long blocks = (u64)len * sbi->blocks_per_cluster; + unsigned long cnt = 0; + unsigned long limit = global_zone_page_state(NR_FREE_PAGES) + << (PAGE_SHIFT - sb->s_blocksize_bits); + + if (limit >= 0x2000) + limit -= 0x1000; + else if (limit < 32) + limit = 32; + else + limit >>= 1; + + while (blocks--) { + clean_bdev_aliases(bdev, devblock++, 1); + if (cnt++ >= limit) { + sync_blockdev(bdev); + cnt = 0; + } + } +} + +/* + * ntfs_discard - Issue a discard request (trim for SSD). + */ +int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) +{ + int err; + u64 lbo, bytes, start, end; + struct super_block *sb; + + if (sbi->used.next_free_lcn == lcn + len) + sbi->used.next_free_lcn = lcn; + + if (sbi->flags & NTFS_FLAGS_NODISCARD) + return -EOPNOTSUPP; + + if (!sbi->options->discard) + return -EOPNOTSUPP; + + lbo = (u64)lcn << sbi->cluster_bits; + bytes = (u64)len << sbi->cluster_bits; + + /* Align up 'start' on discard_granularity. */ + start = (lbo + sbi->discard_granularity - 1) & + sbi->discard_granularity_mask_inv; + /* Align down 'end' on discard_granularity. */ + end = (lbo + bytes) & sbi->discard_granularity_mask_inv; + + sb = sbi->sb; + if (start >= end) + return 0; + + err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9, + GFP_NOFS); + + if (err == -EOPNOTSUPP) + sbi->flags |= NTFS_FLAGS_NODISCARD; + + return err; +} + +static int ntfs_fs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, ntfs_fill_super); +} + +/* + * ntfs_fs_free - Free fs_context. + * + * Note that this will be called after fill_super and reconfigure + * even when they pass. So they have to take pointers if they pass. + */ +static void ntfs_fs_free(struct fs_context *fc) +{ + struct ntfs_mount_options *opts = fc->fs_private; + struct ntfs_sb_info *sbi = fc->s_fs_info; + + if (sbi) + put_ntfs(sbi); + + if (opts) + put_mount_options(opts); +} + +static const struct fs_context_operations ntfs_context_ops = { + .parse_param = ntfs_fs_parse_param, + .get_tree = ntfs_fs_get_tree, + .reconfigure = ntfs_fs_reconfigure, + .free = ntfs_fs_free, +}; + +/* + * ntfs_init_fs_context - Initialize spi and opts + * + * This will called when mount/remount. We will first initialize + * options so that if remount we can use just that. + */ +static int ntfs_init_fs_context(struct fs_context *fc) +{ + struct ntfs_mount_options *opts; + struct ntfs_sb_info *sbi; + + opts = kzalloc(sizeof(struct ntfs_mount_options), GFP_NOFS); + if (!opts) + return -ENOMEM; + + /* Default options. */ + opts->fs_uid = current_uid(); + opts->fs_gid = current_gid(); + opts->fs_fmask_inv = ~current_umask(); + opts->fs_dmask_inv = ~current_umask(); + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + goto ok; + + sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS); + if (!sbi) + goto free_opts; + + sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL); + if (!sbi->upcase) + goto free_sbi; + + ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + mutex_init(&sbi->compress.mtx_lznt); +#ifdef CONFIG_NTFS3_LZX_XPRESS + mutex_init(&sbi->compress.mtx_xpress); + mutex_init(&sbi->compress.mtx_lzx); +#endif + + fc->s_fs_info = sbi; +ok: + fc->fs_private = opts; + fc->ops = &ntfs_context_ops; + + return 0; +free_sbi: + kfree(sbi); +free_opts: + kfree(opts); + return -ENOMEM; +} + +// clang-format off +static struct file_system_type ntfs_fs_type = { + .owner = THIS_MODULE, + .name = "ntfs3", + .init_fs_context = ntfs_init_fs_context, + .parameters = ntfs_fs_parameters, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, +}; +// clang-format on + +static int __init init_ntfs_fs(void) +{ + int err; + + pr_info("ntfs3: Max link count %u\n", NTFS_LINK_MAX); + + if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL)) + pr_info("ntfs3: Enabled Linux POSIX ACLs support\n"); + if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER)) + pr_notice("ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n"); + if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) + pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); + + err = ntfs3_init_bitmap(); + if (err) + return err; + + ntfs_inode_cachep = kmem_cache_create( + "ntfs_inode_cache", sizeof(struct ntfs_inode), 0, + (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT), + init_once); + if (!ntfs_inode_cachep) { + err = -ENOMEM; + goto out1; + } + + err = register_filesystem(&ntfs_fs_type); + if (err) + goto out; + + return 0; +out: + kmem_cache_destroy(ntfs_inode_cachep); +out1: + ntfs3_exit_bitmap(); + return err; +} + +static void __exit exit_ntfs_fs(void) +{ + if (ntfs_inode_cachep) { + rcu_barrier(); + kmem_cache_destroy(ntfs_inode_cachep); + } + + unregister_filesystem(&ntfs_fs_type); + ntfs3_exit_bitmap(); +} + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("ntfs3 read/write filesystem"); +#ifdef CONFIG_NTFS3_FS_POSIX_ACL +MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support"); +#endif +#ifdef CONFIG_NTFS3_64BIT_CLUSTER +MODULE_INFO(cluster, "Warning: Activated 64 bits per cluster. Windows does not support this"); +#endif +#ifdef CONFIG_NTFS3_LZX_XPRESS +MODULE_INFO(compression, "Read-only lzx/xpress compression included"); +#endif + +MODULE_AUTHOR("Konstantin Komarov"); +MODULE_ALIAS_FS("ntfs3"); + +module_init(init_ntfs_fs); +module_exit(exit_ntfs_fs); diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c new file mode 100644 index 000000000..b5e8256fd --- /dev/null +++ b/fs/ntfs3/upcase.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/kernel.h> +#include <linux/types.h> + +#include "ntfs_fs.h" + +static inline u16 upcase_unicode_char(const u16 *upcase, u16 chr) +{ + if (chr < 'a') + return chr; + + if (chr <= 'z') + return chr - ('a' - 'A'); + + return upcase[chr]; +} + +/* + * ntfs_cmp_names + * + * Thanks Kari Argillander <kari.argillander@gmail.com> for idea and implementation 'bothcase' + * + * Straight way to compare names: + * - Case insensitive + * - If name equals and 'bothcases' then + * - Case sensitive + * 'Straight way' code scans input names twice in worst case. + * Optimized code scans input names only once. + */ +int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2, + const u16 *upcase, bool bothcase) +{ + int diff1 = 0; + int diff2; + size_t len = min(l1, l2); + + if (!bothcase && upcase) + goto case_insentive; + + for (; len; s1++, s2++, len--) { + diff1 = le16_to_cpu(*s1) - le16_to_cpu(*s2); + if (diff1) { + if (bothcase && upcase) + goto case_insentive; + + return diff1; + } + } + return l1 - l2; + +case_insentive: + for (; len; s1++, s2++, len--) { + diff2 = upcase_unicode_char(upcase, le16_to_cpu(*s1)) - + upcase_unicode_char(upcase, le16_to_cpu(*s2)); + if (diff2) + return diff2; + } + + diff2 = l1 - l2; + return diff2 ? diff2 : diff1; +} + +int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2, + const u16 *upcase, bool bothcase) +{ + const u16 *s1 = uni1->name; + const __le16 *s2 = uni2->name; + size_t l1 = uni1->len; + size_t l2 = uni2->len; + size_t len = min(l1, l2); + int diff1 = 0; + int diff2; + + if (!bothcase && upcase) + goto case_insentive; + + for (; len; s1++, s2++, len--) { + diff1 = *s1 - le16_to_cpu(*s2); + if (diff1) { + if (bothcase && upcase) + goto case_insentive; + + return diff1; + } + } + return l1 - l2; + +case_insentive: + for (; len; s1++, s2++, len--) { + diff2 = upcase_unicode_char(upcase, *s1) - + upcase_unicode_char(upcase, le16_to_cpu(*s2)); + if (diff2) + return diff2; + } + + diff2 = l1 - l2; + return diff2 ? diff2 : diff1; +} diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c new file mode 100644 index 000000000..df15e00c2 --- /dev/null +++ b/fs/ntfs3/xattr.c @@ -0,0 +1,1051 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/fs.h> +#include <linux/posix_acl.h> +#include <linux/posix_acl_xattr.h> +#include <linux/xattr.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +// clang-format off +#define SYSTEM_DOS_ATTRIB "system.dos_attrib" +#define SYSTEM_NTFS_ATTRIB "system.ntfs_attrib" +#define SYSTEM_NTFS_SECURITY "system.ntfs_security" +// clang-format on + +static inline size_t unpacked_ea_size(const struct EA_FULL *ea) +{ + return ea->size ? le32_to_cpu(ea->size) + : ALIGN(struct_size(ea, name, + 1 + ea->name_len + + le16_to_cpu(ea->elength)), + 4); +} + +static inline size_t packed_ea_size(const struct EA_FULL *ea) +{ + return struct_size(ea, name, + 1 + ea->name_len + le16_to_cpu(ea->elength)) - + offsetof(struct EA_FULL, flags); +} + +/* + * find_ea + * + * Assume there is at least one xattr in the list. + */ +static inline bool find_ea(const struct EA_FULL *ea_all, u32 bytes, + const char *name, u8 name_len, u32 *off, u32 *ea_sz) +{ + u32 ea_size; + + *off = 0; + if (!ea_all) + return false; + + for (; *off < bytes; *off += ea_size) { + const struct EA_FULL *ea = Add2Ptr(ea_all, *off); + ea_size = unpacked_ea_size(ea); + if (ea->name_len == name_len && + !memcmp(ea->name, name, name_len)) { + if (ea_sz) + *ea_sz = ea_size; + return true; + } + } + + return false; +} + +/* + * ntfs_read_ea - Read all extended attributes. + * @ea: New allocated memory. + * @info: Pointer into resident data. + */ +static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, + size_t add_bytes, const struct EA_INFO **info) +{ + int err = -EINVAL; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTR_LIST_ENTRY *le = NULL; + struct ATTRIB *attr_info, *attr_ea; + void *ea_p; + u32 size, off, ea_size; + + static_assert(le32_to_cpu(ATTR_EA_INFO) < le32_to_cpu(ATTR_EA)); + + *ea = NULL; + *info = NULL; + + attr_info = + ni_find_attr(ni, NULL, &le, ATTR_EA_INFO, NULL, 0, NULL, NULL); + attr_ea = + ni_find_attr(ni, attr_info, &le, ATTR_EA, NULL, 0, NULL, NULL); + + if (!attr_ea || !attr_info) + return 0; + + *info = resident_data_ex(attr_info, sizeof(struct EA_INFO)); + if (!*info) + goto out; + + /* Check Ea limit. */ + size = le32_to_cpu((*info)->size); + if (size > sbi->ea_max_size) { + err = -EFBIG; + goto out; + } + + if (attr_size(attr_ea) > sbi->ea_max_size) { + err = -EFBIG; + goto out; + } + + if (!size) { + /* EA info persists, but xattr is empty. Looks like EA problem. */ + goto out; + } + + /* Allocate memory for packed Ea. */ + ea_p = kmalloc(size_add(size, add_bytes), GFP_NOFS); + if (!ea_p) + return -ENOMEM; + + if (attr_ea->non_res) { + struct runs_tree run; + + run_init(&run); + + err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &run, 0, size); + if (!err) + err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL); + run_close(&run); + + if (err) + goto out1; + } else { + void *p = resident_data_ex(attr_ea, size); + + if (!p) + goto out1; + memcpy(ea_p, p, size); + } + + memset(Add2Ptr(ea_p, size), 0, add_bytes); + + /* Check all attributes for consistency. */ + for (off = 0; off < size; off += ea_size) { + const struct EA_FULL *ef = Add2Ptr(ea_p, off); + u32 bytes = size - off; + + /* Check if we can use field ea->size. */ + if (bytes < sizeof(ef->size)) + goto out1; + + if (ef->size) { + ea_size = le32_to_cpu(ef->size); + if (ea_size > bytes) + goto out1; + continue; + } + + /* Check if we can use fields ef->name_len and ef->elength. */ + if (bytes < offsetof(struct EA_FULL, name)) + goto out1; + + ea_size = ALIGN(struct_size(ef, name, + 1 + ef->name_len + + le16_to_cpu(ef->elength)), + 4); + if (ea_size > bytes) + goto out1; + } + + *ea = ea_p; + return 0; + +out1: + kfree(ea_p); +out: + ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); + return err; +} + +/* + * ntfs_list_ea + * + * Copy a list of xattrs names into the buffer + * provided, or compute the buffer size required. + * + * Return: + * * Number of bytes used / required on + * * -ERRNO - on failure + */ +static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, + size_t bytes_per_buffer) +{ + const struct EA_INFO *info; + struct EA_FULL *ea_all = NULL; + const struct EA_FULL *ea; + u32 off, size; + int err; + int ea_size; + size_t ret; + + err = ntfs_read_ea(ni, &ea_all, 0, &info); + if (err) + return err; + + if (!info || !ea_all) + return 0; + + size = le32_to_cpu(info->size); + + /* Enumerate all xattrs. */ + ret = 0; + for (off = 0; off + sizeof(struct EA_FULL) < size; off += ea_size) { + ea = Add2Ptr(ea_all, off); + ea_size = unpacked_ea_size(ea); + + if (!ea->name_len) + break; + + if (buffer) { + /* Check if we can use field ea->name */ + if (off + ea_size > size) + break; + + if (ret + ea->name_len + 1 > bytes_per_buffer) { + err = -ERANGE; + goto out; + } + + memcpy(buffer + ret, ea->name, ea->name_len); + buffer[ret + ea->name_len] = 0; + } + + ret += ea->name_len + 1; + } + +out: + kfree(ea_all); + return err ? err : ret; +} + +static int ntfs_get_ea(struct inode *inode, const char *name, size_t name_len, + void *buffer, size_t size, size_t *required) +{ + struct ntfs_inode *ni = ntfs_i(inode); + const struct EA_INFO *info; + struct EA_FULL *ea_all = NULL; + const struct EA_FULL *ea; + u32 off, len; + int err; + + if (!(ni->ni_flags & NI_FLAG_EA)) + return -ENODATA; + + if (!required) + ni_lock(ni); + + len = 0; + + if (name_len > 255) { + err = -ENAMETOOLONG; + goto out; + } + + err = ntfs_read_ea(ni, &ea_all, 0, &info); + if (err) + goto out; + + if (!info) + goto out; + + /* Enumerate all xattrs. */ + if (!find_ea(ea_all, le32_to_cpu(info->size), name, name_len, &off, + NULL)) { + err = -ENODATA; + goto out; + } + ea = Add2Ptr(ea_all, off); + + len = le16_to_cpu(ea->elength); + if (!buffer) { + err = 0; + goto out; + } + + if (len > size) { + err = -ERANGE; + if (required) + *required = len; + goto out; + } + + memcpy(buffer, ea->name + ea->name_len + 1, len); + err = 0; + +out: + kfree(ea_all); + if (!required) + ni_unlock(ni); + + return err ? err : len; +} + +static noinline int ntfs_set_ea(struct inode *inode, const char *name, + size_t name_len, const void *value, + size_t val_size, int flags, bool locked) +{ + struct ntfs_inode *ni = ntfs_i(inode); + struct ntfs_sb_info *sbi = ni->mi.sbi; + int err; + struct EA_INFO ea_info; + const struct EA_INFO *info; + struct EA_FULL *new_ea; + struct EA_FULL *ea_all = NULL; + size_t add, new_pack; + u32 off, size, ea_sz; + __le16 size_pack; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + struct mft_inode *mi; + struct runs_tree ea_run; + u64 new_sz; + void *p; + + if (!locked) + ni_lock(ni); + + run_init(&ea_run); + + if (name_len > 255) { + err = -ENAMETOOLONG; + goto out; + } + + add = ALIGN(struct_size(ea_all, name, 1 + name_len + val_size), 4); + + err = ntfs_read_ea(ni, &ea_all, add, &info); + if (err) + goto out; + + if (!info) { + memset(&ea_info, 0, sizeof(ea_info)); + size = 0; + size_pack = 0; + } else { + memcpy(&ea_info, info, sizeof(ea_info)); + size = le32_to_cpu(ea_info.size); + size_pack = ea_info.size_pack; + } + + if (info && find_ea(ea_all, size, name, name_len, &off, &ea_sz)) { + struct EA_FULL *ea; + + if (flags & XATTR_CREATE) { + err = -EEXIST; + goto out; + } + + ea = Add2Ptr(ea_all, off); + + /* + * Check simple case when we try to insert xattr with the same value + * e.g. ntfs_save_wsl_perm + */ + if (val_size && le16_to_cpu(ea->elength) == val_size && + !memcmp(ea->name + ea->name_len + 1, value, val_size)) { + /* xattr already contains the required value. */ + goto out; + } + + /* Remove current xattr. */ + if (ea->flags & FILE_NEED_EA) + le16_add_cpu(&ea_info.count, -1); + + le16_add_cpu(&ea_info.size_pack, 0 - packed_ea_size(ea)); + + memmove(ea, Add2Ptr(ea, ea_sz), size - off - ea_sz); + + size -= ea_sz; + memset(Add2Ptr(ea_all, size), 0, ea_sz); + + ea_info.size = cpu_to_le32(size); + + if ((flags & XATTR_REPLACE) && !val_size) { + /* Remove xattr. */ + goto update_ea; + } + } else { + if (flags & XATTR_REPLACE) { + err = -ENODATA; + goto out; + } + + if (!ea_all) { + ea_all = kzalloc(add, GFP_NOFS); + if (!ea_all) { + err = -ENOMEM; + goto out; + } + } + } + + /* Append new xattr. */ + new_ea = Add2Ptr(ea_all, size); + new_ea->size = cpu_to_le32(add); + new_ea->flags = 0; + new_ea->name_len = name_len; + new_ea->elength = cpu_to_le16(val_size); + memcpy(new_ea->name, name, name_len); + new_ea->name[name_len] = 0; + memcpy(new_ea->name + name_len + 1, value, val_size); + new_pack = le16_to_cpu(ea_info.size_pack) + packed_ea_size(new_ea); + ea_info.size_pack = cpu_to_le16(new_pack); + /* New size of ATTR_EA. */ + size += add; + ea_info.size = cpu_to_le32(size); + + /* + * 1. Check ea_info.size_pack for overflow. + * 2. New attibute size must fit value from $AttrDef + */ + if (new_pack > 0xffff || size > sbi->ea_max_size) { + ntfs_inode_warn( + inode, + "The size of extended attributes must not exceed 64KiB"); + err = -EFBIG; // -EINVAL? + goto out; + } + +update_ea: + + if (!info) { + /* Create xattr. */ + if (!size) { + err = 0; + goto out; + } + + err = ni_insert_resident(ni, sizeof(struct EA_INFO), + ATTR_EA_INFO, NULL, 0, NULL, NULL, + NULL); + if (err) + goto out; + + err = ni_insert_resident(ni, 0, ATTR_EA, NULL, 0, NULL, NULL, + NULL); + if (err) + goto out; + } + + new_sz = size; + err = attr_set_size(ni, ATTR_EA, NULL, 0, &ea_run, new_sz, &new_sz, + false, NULL); + if (err) + goto out; + + le = NULL; + attr = ni_find_attr(ni, NULL, &le, ATTR_EA_INFO, NULL, 0, NULL, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + if (!size) { + /* Delete xattr, ATTR_EA_INFO */ + ni_remove_attr_le(ni, attr, mi, le); + } else { + p = resident_data_ex(attr, sizeof(struct EA_INFO)); + if (!p) { + err = -EINVAL; + goto out; + } + memcpy(p, &ea_info, sizeof(struct EA_INFO)); + mi->dirty = true; + } + + le = NULL; + attr = ni_find_attr(ni, NULL, &le, ATTR_EA, NULL, 0, NULL, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + if (!size) { + /* Delete xattr, ATTR_EA */ + ni_remove_attr_le(ni, attr, mi, le); + } else if (attr->non_res) { + err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &ea_run, 0, + size); + if (err) + goto out; + + err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0); + if (err) + goto out; + } else { + p = resident_data_ex(attr, size); + if (!p) { + err = -EINVAL; + goto out; + } + memcpy(p, ea_all, size); + mi->dirty = true; + } + + /* Check if we delete the last xattr. */ + if (size) + ni->ni_flags |= NI_FLAG_EA; + else + ni->ni_flags &= ~NI_FLAG_EA; + + if (ea_info.size_pack != size_pack) + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + mark_inode_dirty(&ni->vfs_inode); + +out: + if (!locked) + ni_unlock(ni); + + run_close(&ea_run); + kfree(ea_all); + + return err; +} + +#ifdef CONFIG_NTFS3_FS_POSIX_ACL +static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type, + int locked) +{ + struct ntfs_inode *ni = ntfs_i(inode); + const char *name; + size_t name_len; + struct posix_acl *acl; + size_t req; + int err; + void *buf; + + /* Allocate PATH_MAX bytes. */ + buf = __getname(); + if (!buf) + return ERR_PTR(-ENOMEM); + + /* Possible values of 'type' was already checked above. */ + if (type == ACL_TYPE_ACCESS) { + name = XATTR_NAME_POSIX_ACL_ACCESS; + name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; + } else { + name = XATTR_NAME_POSIX_ACL_DEFAULT; + name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1; + } + + if (!locked) + ni_lock(ni); + + err = ntfs_get_ea(inode, name, name_len, buf, PATH_MAX, &req); + + if (!locked) + ni_unlock(ni); + + /* Translate extended attribute to acl. */ + if (err >= 0) { + acl = posix_acl_from_xattr(&init_user_ns, buf, err); + } else if (err == -ENODATA) { + acl = NULL; + } else { + acl = ERR_PTR(err); + } + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + __putname(buf); + + return acl; +} + +/* + * ntfs_get_acl - inode_operations::get_acl + */ +struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) +{ + if (rcu) + return ERR_PTR(-ECHILD); + + return ntfs_get_acl_ex(inode, type, 0); +} + +static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, + int type, bool init_acl) +{ + const char *name; + size_t size, name_len; + void *value; + int err; + int flags; + umode_t mode; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + mode = inode->i_mode; + switch (type) { + case ACL_TYPE_ACCESS: + /* Do not change i_mode if we are in init_acl */ + if (acl && !init_acl) { + err = posix_acl_update_mode(mnt_userns, inode, &mode, + &acl); + if (err) + return err; + } + name = XATTR_NAME_POSIX_ACL_ACCESS; + name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; + break; + + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + name = XATTR_NAME_POSIX_ACL_DEFAULT; + name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1; + break; + + default: + return -EINVAL; + } + + if (!acl) { + /* Remove xattr if it can be presented via mode. */ + size = 0; + value = NULL; + flags = XATTR_REPLACE; + } else { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) + return -ENOMEM; + err = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (err < 0) + goto out; + flags = 0; + } + + err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); + if (err == -ENODATA && !size) + err = 0; /* Removing non existed xattr. */ + if (!err) { + set_cached_acl(inode, type, acl); + if (inode->i_mode != mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); + } + } + +out: + kfree(value); + + return err; +} + +/* + * ntfs_set_acl - inode_operations::set_acl + */ +int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) +{ + return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false); +} + +/* + * ntfs_init_acl - Initialize the ACLs of a new inode. + * + * Called from ntfs_create_inode(). + */ +int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int err; + + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; + + if (default_acl) { + err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, + ACL_TYPE_DEFAULT, true); + posix_acl_release(default_acl); + } else { + inode->i_default_acl = NULL; + } + + if (acl) { + if (!err) + err = ntfs_set_acl_ex(mnt_userns, inode, acl, + ACL_TYPE_ACCESS, true); + posix_acl_release(acl); + } else { + inode->i_acl = NULL; + } + + return err; +} +#endif + +/* + * ntfs_acl_chmod - Helper for ntfs3_setattr(). + */ +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!(sb->s_flags & SB_POSIXACL)) + return 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + return posix_acl_chmod(mnt_userns, inode, inode->i_mode); +} + +/* + * ntfs_permission - inode_operations::permission + */ +int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) +{ + if (ntfs_sb(inode->i_sb)->options->noacsrules) { + /* "No access rules" mode - Allow all changes. */ + return 0; + } + + return generic_permission(mnt_userns, inode, mask); +} + +/* + * ntfs_listxattr - inode_operations::listxattr + */ +ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct ntfs_inode *ni = ntfs_i(inode); + ssize_t ret; + + if (!(ni->ni_flags & NI_FLAG_EA)) { + /* no xattr in file */ + return 0; + } + + ni_lock(ni); + + ret = ntfs_list_ea(ni, buffer, size); + + ni_unlock(ni); + + return ret; +} + +static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, + struct inode *inode, const char *name, void *buffer, + size_t size) +{ + int err; + struct ntfs_inode *ni = ntfs_i(inode); + size_t name_len = strlen(name); + + /* Dispatch request. */ + if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 && + !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) { + /* system.dos_attrib */ + if (!buffer) { + err = sizeof(u8); + } else if (size < sizeof(u8)) { + err = -ENODATA; + } else { + err = sizeof(u8); + *(u8 *)buffer = le32_to_cpu(ni->std_fa); + } + goto out; + } + + if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 && + !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) { + /* system.ntfs_attrib */ + if (!buffer) { + err = sizeof(u32); + } else if (size < sizeof(u32)) { + err = -ENODATA; + } else { + err = sizeof(u32); + *(u32 *)buffer = le32_to_cpu(ni->std_fa); + } + goto out; + } + + if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 && + !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) { + /* system.ntfs_security*/ + struct SECURITY_DESCRIPTOR_RELATIVE *sd = NULL; + size_t sd_size = 0; + + if (!is_ntfs3(ni->mi.sbi)) { + /* We should get nt4 security. */ + err = -EINVAL; + goto out; + } else if (le32_to_cpu(ni->std_security_id) < + SECURITY_ID_FIRST) { + err = -ENOENT; + goto out; + } + + err = ntfs_get_security_by_id(ni->mi.sbi, ni->std_security_id, + &sd, &sd_size); + if (err) + goto out; + + if (!is_sd_valid(sd, sd_size)) { + ntfs_inode_warn( + inode, + "looks like you get incorrect security descriptor id=%u", + ni->std_security_id); + } + + if (!buffer) { + err = sd_size; + } else if (size < sd_size) { + err = -ENODATA; + } else { + err = sd_size; + memcpy(buffer, sd, sd_size); + } + kfree(sd); + goto out; + } + + /* Deal with NTFS extended attribute. */ + err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); + +out: + return err; +} + +/* + * ntfs_setxattr - inode_operations::setxattr + */ +static noinline int ntfs_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *de, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + int err = -EINVAL; + struct ntfs_inode *ni = ntfs_i(inode); + size_t name_len = strlen(name); + enum FILE_ATTRIBUTE new_fa; + + /* Dispatch request. */ + if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 && + !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) { + if (sizeof(u8) != size) + goto out; + new_fa = cpu_to_le32(*(u8 *)value); + goto set_new_fa; + } + + if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 && + !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) { + if (size != sizeof(u32)) + goto out; + new_fa = cpu_to_le32(*(u32 *)value); + + if (S_ISREG(inode->i_mode)) { + /* Process compressed/sparsed in special way. */ + ni_lock(ni); + err = ni_new_attr_flags(ni, new_fa); + ni_unlock(ni); + if (err) + goto out; + } +set_new_fa: + /* + * Thanks Mark Harmstone: + * Keep directory bit consistency. + */ + if (S_ISDIR(inode->i_mode)) + new_fa |= FILE_ATTRIBUTE_DIRECTORY; + else + new_fa &= ~FILE_ATTRIBUTE_DIRECTORY; + + if (ni->std_fa != new_fa) { + ni->std_fa = new_fa; + if (new_fa & FILE_ATTRIBUTE_READONLY) + inode->i_mode &= ~0222; + else + inode->i_mode |= 0222; + /* Std attribute always in primary record. */ + ni->mi.dirty = true; + mark_inode_dirty(inode); + } + err = 0; + + goto out; + } + + if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 && + !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) { + /* system.ntfs_security*/ + __le32 security_id; + bool inserted; + struct ATTR_STD_INFO5 *std; + + if (!is_ntfs3(ni->mi.sbi)) { + /* + * We should replace ATTR_SECURE. + * Skip this way cause it is nt4 feature. + */ + err = -EINVAL; + goto out; + } + + if (!is_sd_valid(value, size)) { + err = -EINVAL; + ntfs_inode_warn( + inode, + "you try to set invalid security descriptor"); + goto out; + } + + err = ntfs_insert_security(ni->mi.sbi, value, size, + &security_id, &inserted); + if (err) + goto out; + + ni_lock(ni); + std = ni_std5(ni); + if (!std) { + err = -EINVAL; + } else if (std->security_id != security_id) { + std->security_id = ni->std_security_id = security_id; + /* Std attribute always in primary record. */ + ni->mi.dirty = true; + mark_inode_dirty(&ni->vfs_inode); + } + ni_unlock(ni); + goto out; + } + + /* Deal with NTFS extended attribute. */ + err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); + +out: + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + + return err; +} + +/* + * ntfs_save_wsl_perm + * + * save uid/gid/mode in xattr + */ +int ntfs_save_wsl_perm(struct inode *inode) +{ + int err; + __le32 value; + struct ntfs_inode *ni = ntfs_i(inode); + + ni_lock(ni); + value = cpu_to_le32(i_uid_read(inode)); + err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, + sizeof(value), 0, true); /* true == already locked. */ + if (err) + goto out; + + value = cpu_to_le32(i_gid_read(inode)); + err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, + sizeof(value), 0, true); + if (err) + goto out; + + value = cpu_to_le32(inode->i_mode); + err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, + sizeof(value), 0, true); + if (err) + goto out; + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + value = cpu_to_le32(inode->i_rdev); + err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, + sizeof(value), 0, true); + if (err) + goto out; + } + +out: + ni_unlock(ni); + /* In case of error should we delete all WSL xattr? */ + return err; +} + +/* + * ntfs_get_wsl_perm + * + * get uid/gid/mode from xattr + * it is called from ntfs_iget5->ntfs_read_mft + */ +void ntfs_get_wsl_perm(struct inode *inode) +{ + size_t sz; + __le32 value[3]; + + if (ntfs_get_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value[0], + sizeof(value[0]), &sz) == sizeof(value[0]) && + ntfs_get_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value[1], + sizeof(value[1]), &sz) == sizeof(value[1]) && + ntfs_get_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value[2], + sizeof(value[2]), &sz) == sizeof(value[2])) { + i_uid_write(inode, (uid_t)le32_to_cpu(value[0])); + i_gid_write(inode, (gid_t)le32_to_cpu(value[1])); + inode->i_mode = le32_to_cpu(value[2]); + + if (ntfs_get_ea(inode, "$LXDEV", sizeof("$$LXDEV") - 1, + &value[0], sizeof(value), + &sz) == sizeof(value[0])) { + inode->i_rdev = le32_to_cpu(value[0]); + } + } +} + +static bool ntfs_xattr_user_list(struct dentry *dentry) +{ + return true; +} + +// clang-format off +static const struct xattr_handler ntfs_other_xattr_handler = { + .prefix = "", + .get = ntfs_getxattr, + .set = ntfs_setxattr, + .list = ntfs_xattr_user_list, +}; + +const struct xattr_handler *ntfs_xattr_handlers[] = { +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &ntfs_other_xattr_handler, + NULL, +}; +// clang-format on |