summaryrefslogtreecommitdiffstats
path: root/fs/ntfs
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 18:49:45 +0000
commit2c3c1048746a4622d8c89a29670120dc8fab93c4 (patch)
tree848558de17fb3008cdf4d861b01ac7781903ce39 /fs/ntfs
parentInitial commit. (diff)
downloadlinux-2c3c1048746a4622d8c89a29670120dc8fab93c4.tar.xz
linux-2c3c1048746a4622d8c89a29670120dc8fab93c4.zip
Adding upstream version 6.1.76.upstream/6.1.76
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--fs/ntfs/Kconfig80
-rw-r--r--fs/ntfs/Makefile15
-rw-r--r--fs/ntfs/aops.c1761
-rw-r--r--fs/ntfs/aops.h88
-rw-r--r--fs/ntfs/attrib.c2624
-rw-r--r--fs/ntfs/attrib.h102
-rw-r--r--fs/ntfs/bitmap.c179
-rw-r--r--fs/ntfs/bitmap.h104
-rw-r--r--fs/ntfs/collate.c110
-rw-r--r--fs/ntfs/collate.h36
-rw-r--r--fs/ntfs/compress.c950
-rw-r--r--fs/ntfs/debug.c159
-rw-r--r--fs/ntfs/debug.h57
-rw-r--r--fs/ntfs/dir.c1538
-rw-r--r--fs/ntfs/dir.h34
-rw-r--r--fs/ntfs/endian.h79
-rw-r--r--fs/ntfs/file.c2006
-rw-r--r--fs/ntfs/index.c440
-rw-r--r--fs/ntfs/index.h134
-rw-r--r--fs/ntfs/inode.c3100
-rw-r--r--fs/ntfs/inode.h310
-rw-r--r--fs/ntfs/layout.h2421
-rw-r--r--fs/ntfs/lcnalloc.c1000
-rw-r--r--fs/ntfs/lcnalloc.h131
-rw-r--r--fs/ntfs/logfile.c849
-rw-r--r--fs/ntfs/logfile.h295
-rw-r--r--fs/ntfs/malloc.h77
-rw-r--r--fs/ntfs/mft.c2906
-rw-r--r--fs/ntfs/mft.h110
-rw-r--r--fs/ntfs/mst.c189
-rw-r--r--fs/ntfs/namei.c391
-rw-r--r--fs/ntfs/ntfs.h150
-rw-r--r--fs/ntfs/quota.c103
-rw-r--r--fs/ntfs/quota.h21
-rw-r--r--fs/ntfs/runlist.c1893
-rw-r--r--fs/ntfs/runlist.h88
-rw-r--r--fs/ntfs/super.c3194
-rw-r--r--fs/ntfs/sysctl.c69
-rw-r--r--fs/ntfs/sysctl.h27
-rw-r--r--fs/ntfs/time.h89
-rw-r--r--fs/ntfs/types.h55
-rw-r--r--fs/ntfs/unistr.c384
-rw-r--r--fs/ntfs/upcase.c73
-rw-r--r--fs/ntfs/usnjrnl.c70
-rw-r--r--fs/ntfs/usnjrnl.h191
-rw-r--r--fs/ntfs/volume.h164
-rw-r--r--fs/ntfs3/Kconfig46
-rw-r--r--fs/ntfs3/Makefile36
-rw-r--r--fs/ntfs3/attrib.c2470
-rw-r--r--fs/ntfs3/attrlist.c473
-rw-r--r--fs/ntfs3/bitfunc.c128
-rw-r--r--fs/ntfs3/bitmap.c1485
-rw-r--r--fs/ntfs3/debug.h55
-rw-r--r--fs/ntfs3/dir.c597
-rw-r--r--fs/ntfs3/file.c1279
-rw-r--r--fs/ntfs3/frecord.c3368
-rw-r--r--fs/ntfs3/fslog.c5210
-rw-r--r--fs/ntfs3/fsntfs.c2503
-rw-r--r--fs/ntfs3/index.c2668
-rw-r--r--fs/ntfs3/inode.c1970
-rw-r--r--fs/ntfs3/lib/decompress_common.c319
-rw-r--r--fs/ntfs3/lib/decompress_common.h343
-rw-r--r--fs/ntfs3/lib/lib.h32
-rw-r--r--fs/ntfs3/lib/lzx_decompress.c670
-rw-r--r--fs/ntfs3/lib/xpress_decompress.c142
-rw-r--r--fs/ntfs3/lznt.c453
-rw-r--r--fs/ntfs3/namei.c395
-rw-r--r--fs/ntfs3/ntfs.h1221
-rw-r--r--fs/ntfs3/ntfs_fs.h1142
-rw-r--r--fs/ntfs3/record.c595
-rw-r--r--fs/ntfs3/run.c1186
-rw-r--r--fs/ntfs3/super.c1517
-rw-r--r--fs/ntfs3/upcase.c104
-rw-r--r--fs/ntfs3/xattr.c1051
74 files changed, 60304 insertions, 0 deletions
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
new file mode 100644
index 000000000..f93e69a61
--- /dev/null
+++ b/fs/ntfs/Kconfig
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NTFS_FS
+ tristate "NTFS file system support"
+ select NLS
+ help
+ NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
+
+ Saying Y or M here enables read support. There is partial, but
+ safe, write support available. For write support you must also
+ say Y to "NTFS write support" below.
+
+ There are also a number of user-space tools available, called
+ ntfsprogs. These include ntfsundelete and ntfsresize, that work
+ without NTFS support enabled in the kernel.
+
+ This is a rewrite from scratch of Linux NTFS support and replaced
+ the old NTFS code starting with Linux 2.5.11. A backport to
+ the Linux 2.4 kernel series is separately available as a patch
+ from the project web site.
+
+ For more information see <file:Documentation/filesystems/ntfs.rst>
+ and <http://www.linux-ntfs.org/>.
+
+ To compile this file system support as a module, choose M here: the
+ module will be called ntfs.
+
+ If you are not using Windows NT, 2000, XP or 2003 in addition to
+ Linux on your computer it is safe to say N.
+
+config NTFS_DEBUG
+ bool "NTFS debugging support"
+ depends on NTFS_FS
+ help
+ If you are experiencing any problems with the NTFS file system, say
+ Y here. This will result in additional consistency checks to be
+ performed by the driver as well as additional debugging messages to
+ be written to the system log. Note that debugging messages are
+ disabled by default. To enable them, supply the option debug_msgs=1
+ at the kernel command line when booting the kernel or as an option
+ to insmod when loading the ntfs module. Once the driver is active,
+ you can enable debugging messages by doing (as root):
+ echo 1 > /proc/sys/fs/ntfs-debug
+ Replacing the "1" with "0" would disable debug messages.
+
+ If you leave debugging messages disabled, this results in little
+ overhead, but enabling debug messages results in very significant
+ slowdown of the system.
+
+ When reporting bugs, please try to have available a full dump of
+ debugging messages while the misbehaviour was occurring.
+
+config NTFS_RW
+ bool "NTFS write support"
+ depends on NTFS_FS
+ depends on PAGE_SIZE_LESS_THAN_64KB
+ help
+ This enables the partial, but safe, write support in the NTFS driver.
+
+ The only supported operation is overwriting existing files, without
+ changing the file length. No file or directory creation, deletion or
+ renaming is possible. Note only non-resident files can be written to
+ so you may find that some very small files (<500 bytes or so) cannot
+ be written to.
+
+ While we cannot guarantee that it will not damage any data, we have
+ so far not received a single report where the driver would have
+ damaged someones data so we assume it is perfectly safe to use.
+
+ Note: While write support is safe in this version (a rewrite from
+ scratch of the NTFS support), it should be noted that the old NTFS
+ write support, included in Linux 2.5.10 and before (since 1997),
+ is not safe.
+
+ This is currently useful with TopologiLinux. TopologiLinux is run
+ on top of any DOS/Microsoft Windows system without partitioning your
+ hard disk. Unlike other Linux distributions TopologiLinux does not
+ need its own partition. For more information see
+ <http://topologi-linux.sourceforge.net/>
+
+ It is perfectly safe to say N here.
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
new file mode 100644
index 000000000..3e736572e
--- /dev/null
+++ b/fs/ntfs/Makefile
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+# Rules for making the NTFS driver.
+
+obj-$(CONFIG_NTFS_FS) += ntfs.o
+
+ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
+ index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
+ unistr.o upcase.o
+
+ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
+
+ccflags-y := -DNTFS_VERSION=\"2.1.32\"
+ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG
+ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW
+
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
new file mode 100644
index 000000000..9364d35b4
--- /dev/null
+++ b/fs/ntfs/aops.c
@@ -0,0 +1,1761 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/**
+ * aops.c - NTFS kernel address space operations and page cache handling.
+ *
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/buffer_head.h>
+#include <linux/writeback.h>
+#include <linux/bit_spinlock.h>
+#include <linux/bio.h>
+
+#include "aops.h"
+#include "attrib.h"
+#include "debug.h"
+#include "inode.h"
+#include "mft.h"
+#include "runlist.h"
+#include "types.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_end_buffer_async_read - async io completion for reading attributes
+ * @bh: buffer head on which io is completed
+ * @uptodate: whether @bh is now uptodate or not
+ *
+ * Asynchronous I/O completion handler for reading pages belonging to the
+ * attribute address space of an inode. The inodes can either be files or
+ * directories or they can be fake inodes describing some attribute.
+ *
+ * If NInoMstProtected(), perform the post read mst fixups when all IO on the
+ * page has been completed and mark the page uptodate or set the error bit on
+ * the page. To determine the size of the records that need fixing up, we
+ * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
+ * record size, and index_block_size_bits, to the log(base 2) of the ntfs
+ * record size.
+ */
+static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
+{
+ unsigned long flags;
+ struct buffer_head *first, *tmp;
+ struct page *page;
+ struct inode *vi;
+ ntfs_inode *ni;
+ int page_uptodate = 1;
+
+ page = bh->b_page;
+ vi = page->mapping->host;
+ ni = NTFS_I(vi);
+
+ if (likely(uptodate)) {
+ loff_t i_size;
+ s64 file_ofs, init_size;
+
+ set_buffer_uptodate(bh);
+
+ file_ofs = ((s64)page->index << PAGE_SHIFT) +
+ bh_offset(bh);
+ read_lock_irqsave(&ni->size_lock, flags);
+ init_size = ni->initialized_size;
+ i_size = i_size_read(vi);
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (unlikely(init_size > i_size)) {
+ /* Race with shrinking truncate. */
+ init_size = i_size;
+ }
+ /* Check for the current buffer head overflowing. */
+ if (unlikely(file_ofs + bh->b_size > init_size)) {
+ int ofs;
+ void *kaddr;
+
+ ofs = 0;
+ if (file_ofs < init_size)
+ ofs = init_size - file_ofs;
+ kaddr = kmap_atomic(page);
+ memset(kaddr + bh_offset(bh) + ofs, 0,
+ bh->b_size - ofs);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ }
+ } else {
+ clear_buffer_uptodate(bh);
+ SetPageError(page);
+ ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
+ "0x%llx.", (unsigned long long)bh->b_blocknr);
+ }
+ first = page_buffers(page);
+ spin_lock_irqsave(&first->b_uptodate_lock, flags);
+ clear_buffer_async_read(bh);
+ unlock_buffer(bh);
+ tmp = bh;
+ do {
+ if (!buffer_uptodate(tmp))
+ page_uptodate = 0;
+ if (buffer_async_read(tmp)) {
+ if (likely(buffer_locked(tmp)))
+ goto still_busy;
+ /* Async buffers must be locked. */
+ BUG();
+ }
+ tmp = tmp->b_this_page;
+ } while (tmp != bh);
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
+ /*
+ * If none of the buffers had errors then we can set the page uptodate,
+ * but we first have to perform the post read mst fixups, if the
+ * attribute is mst protected, i.e. if NInoMstProteced(ni) is true.
+ * Note we ignore fixup errors as those are detected when
+ * map_mft_record() is called which gives us per record granularity
+ * rather than per page granularity.
+ */
+ if (!NInoMstProtected(ni)) {
+ if (likely(page_uptodate && !PageError(page)))
+ SetPageUptodate(page);
+ } else {
+ u8 *kaddr;
+ unsigned int i, recs;
+ u32 rec_size;
+
+ rec_size = ni->itype.index.block_size;
+ recs = PAGE_SIZE / rec_size;
+ /* Should have been verified before we got here... */
+ BUG_ON(!recs);
+ kaddr = kmap_atomic(page);
+ for (i = 0; i < recs; i++)
+ post_read_mst_fixup((NTFS_RECORD*)(kaddr +
+ i * rec_size), rec_size);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(page);
+ if (likely(page_uptodate && !PageError(page)))
+ SetPageUptodate(page);
+ }
+ unlock_page(page);
+ return;
+still_busy:
+ spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
+ return;
+}
+
+/**
+ * ntfs_read_block - fill a @page of an address space with data
+ * @page: page cache page to fill with data
+ *
+ * Fill the page @page of the address space belonging to the @page->host inode.
+ * We read each buffer asynchronously and when all buffers are read in, our io
+ * completion handler ntfs_end_buffer_read_async(), if required, automatically
+ * applies the mst fixups to the page before finally marking it uptodate and
+ * unlocking it.
+ *
+ * We only enforce allocated_size limit because i_size is checked for in
+ * generic_file_read().
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Contains an adapted version of fs/buffer.c::block_read_full_folio().
+ */
+static int ntfs_read_block(struct page *page)
+{
+ loff_t i_size;
+ VCN vcn;
+ LCN lcn;
+ s64 init_size;
+ struct inode *vi;
+ ntfs_inode *ni;
+ ntfs_volume *vol;
+ runlist_element *rl;
+ struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+ sector_t iblock, lblock, zblock;
+ unsigned long flags;
+ unsigned int blocksize, vcn_ofs;
+ int i, nr;
+ unsigned char blocksize_bits;
+
+ vi = page->mapping->host;
+ ni = NTFS_I(vi);
+ vol = ni->vol;
+
+ /* $MFT/$DATA must have its complete runlist in memory at all times. */
+ BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
+
+ blocksize = vol->sb->s_blocksize;
+ blocksize_bits = vol->sb->s_blocksize_bits;
+
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, blocksize, 0);
+ if (unlikely(!page_has_buffers(page))) {
+ unlock_page(page);
+ return -ENOMEM;
+ }
+ }
+ bh = head = page_buffers(page);
+ BUG_ON(!bh);
+
+ /*
+ * We may be racing with truncate. To avoid some of the problems we
+ * now take a snapshot of the various sizes and use those for the whole
+ * of the function. In case of an extending truncate it just means we
+ * may leave some buffers unmapped which are now allocated. This is
+ * not a problem since these buffers will just get mapped when a write
+ * occurs. In case of a shrinking truncate, we will detect this later
+ * on due to the runlist being incomplete and if the page is being
+ * fully truncated, truncate will throw it away as soon as we unlock
+ * it so no need to worry what we do with it.
+ */
+ iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
+ read_lock_irqsave(&ni->size_lock, flags);
+ lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
+ init_size = ni->initialized_size;
+ i_size = i_size_read(vi);
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (unlikely(init_size > i_size)) {
+ /* Race with shrinking truncate. */
+ init_size = i_size;
+ }
+ zblock = (init_size + blocksize - 1) >> blocksize_bits;
+
+ /* Loop through all the buffers in the page. */
+ rl = NULL;
+ nr = i = 0;
+ do {
+ int err = 0;
+
+ if (unlikely(buffer_uptodate(bh)))
+ continue;
+ if (unlikely(buffer_mapped(bh))) {
+ arr[nr++] = bh;
+ continue;
+ }
+ bh->b_bdev = vol->sb->s_bdev;
+ /* Is the block within the allowed limits? */
+ if (iblock < lblock) {
+ bool is_retry = false;
+
+ /* Convert iblock into corresponding vcn and offset. */
+ vcn = (VCN)iblock << blocksize_bits >>
+ vol->cluster_size_bits;
+ vcn_ofs = ((VCN)iblock << blocksize_bits) &
+ vol->cluster_size_mask;
+ if (!rl) {
+lock_retry_remap:
+ down_read(&ni->runlist.lock);
+ rl = ni->runlist.rl;
+ }
+ if (likely(rl != NULL)) {
+ /* Seek to element containing target vcn. */
+ while (rl->length && rl[1].vcn <= vcn)
+ rl++;
+ lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+ } else
+ lcn = LCN_RL_NOT_MAPPED;
+ /* Successful remap. */
+ if (lcn >= 0) {
+ /* Setup buffer head to correct block. */
+ bh->b_blocknr = ((lcn << vol->cluster_size_bits)
+ + vcn_ofs) >> blocksize_bits;
+ set_buffer_mapped(bh);
+ /* Only read initialized data blocks. */
+ if (iblock < zblock) {
+ arr[nr++] = bh;
+ continue;
+ }
+ /* Fully non-initialized data block, zero it. */
+ goto handle_zblock;
+ }
+ /* It is a hole, need to zero it. */
+ if (lcn == LCN_HOLE)
+ goto handle_hole;
+ /* If first try and runlist unmapped, map and retry. */
+ if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
+ is_retry = true;
+ /*
+ * Attempt to map runlist, dropping lock for
+ * the duration.
+ */
+ up_read(&ni->runlist.lock);
+ err = ntfs_map_runlist(ni, vcn);
+ if (likely(!err))
+ goto lock_retry_remap;
+ rl = NULL;
+ } else if (!rl)
+ up_read(&ni->runlist.lock);
+ /*
+ * If buffer is outside the runlist, treat it as a
+ * hole. This can happen due to concurrent truncate
+ * for example.
+ */
+ if (err == -ENOENT || lcn == LCN_ENOENT) {
+ err = 0;
+ goto handle_hole;
+ }
+ /* Hard error, zero out region. */
+ if (!err)
+ err = -EIO;
+ bh->b_blocknr = -1;
+ SetPageError(page);
+ ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
+ "attribute type 0x%x, vcn 0x%llx, "
+ "offset 0x%x because its location on "
+ "disk could not be determined%s "
+ "(error code %i).", ni->mft_no,
+ ni->type, (unsigned long long)vcn,
+ vcn_ofs, is_retry ? " even after "
+ "retrying" : "", err);
+ }
+ /*
+ * Either iblock was outside lblock limits or
+ * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion
+ * of the page and set the buffer uptodate.
+ */
+handle_hole:
+ bh->b_blocknr = -1UL;
+ clear_buffer_mapped(bh);
+handle_zblock:
+ zero_user(page, i * blocksize, blocksize);
+ if (likely(!err))
+ set_buffer_uptodate(bh);
+ } while (i++, iblock++, (bh = bh->b_this_page) != head);
+
+ /* Release the lock if we took it. */
+ if (rl)
+ up_read(&ni->runlist.lock);
+
+ /* Check we have at least one buffer ready for i/o. */
+ if (nr) {
+ struct buffer_head *tbh;
+
+ /* Lock the buffers. */
+ for (i = 0; i < nr; i++) {
+ tbh = arr[i];
+ lock_buffer(tbh);
+ tbh->b_end_io = ntfs_end_buffer_async_read;
+ set_buffer_async_read(tbh);
+ }
+ /* Finally, start i/o on the buffers. */
+ for (i = 0; i < nr; i++) {
+ tbh = arr[i];
+ if (likely(!buffer_uptodate(tbh)))
+ submit_bh(REQ_OP_READ, tbh);
+ else
+ ntfs_end_buffer_async_read(tbh, 1);
+ }
+ return 0;
+ }
+ /* No i/o was scheduled on any of the buffers. */
+ if (likely(!PageError(page)))
+ SetPageUptodate(page);
+ else /* Signal synchronous i/o error. */
+ nr = -EIO;
+ unlock_page(page);
+ return nr;
+}
+
+/**
+ * ntfs_read_folio - fill a @folio of a @file with data from the device
+ * @file: open file to which the folio @folio belongs or NULL
+ * @folio: page cache folio to fill with data
+ *
+ * For non-resident attributes, ntfs_read_folio() fills the @folio of the open
+ * file @file by calling the ntfs version of the generic block_read_full_folio()
+ * function, ntfs_read_block(), which in turn creates and reads in the buffers
+ * associated with the folio asynchronously.
+ *
+ * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
+ * data from the mft record (which at this stage is most likely in memory) and
+ * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
+ * even if the mft record is not cached at this point in time, we need to wait
+ * for it to be read in before we can do the copy.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_read_folio(struct file *file, struct folio *folio)
+{
+ struct page *page = &folio->page;
+ loff_t i_size;
+ struct inode *vi;
+ ntfs_inode *ni, *base_ni;
+ u8 *addr;
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *mrec;
+ unsigned long flags;
+ u32 attr_len;
+ int err = 0;
+
+retry_readpage:
+ BUG_ON(!PageLocked(page));
+ vi = page->mapping->host;
+ i_size = i_size_read(vi);
+ /* Is the page fully outside i_size? (truncate in progress) */
+ if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+ PAGE_SHIFT)) {
+ zero_user(page, 0, PAGE_SIZE);
+ ntfs_debug("Read outside i_size - truncated?");
+ goto done;
+ }
+ /*
+ * This can potentially happen because we clear PageUptodate() during
+ * ntfs_writepage() of MstProtected() attributes.
+ */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ return 0;
+ }
+ ni = NTFS_I(vi);
+ /*
+ * Only $DATA attributes can be encrypted and only unnamed $DATA
+ * attributes can be compressed. Index root can have the flags set but
+ * this means to create compressed/encrypted files, not that the
+ * attribute is compressed/encrypted. Note we need to check for
+ * AT_INDEX_ALLOCATION since this is the type of both directory and
+ * index inodes.
+ */
+ if (ni->type != AT_INDEX_ALLOCATION) {
+ /* If attribute is encrypted, deny access, just like NT4. */
+ if (NInoEncrypted(ni)) {
+ BUG_ON(ni->type != AT_DATA);
+ err = -EACCES;
+ goto err_out;
+ }
+ /* Compressed data streams are handled in compress.c. */
+ if (NInoNonResident(ni) && NInoCompressed(ni)) {
+ BUG_ON(ni->type != AT_DATA);
+ BUG_ON(ni->name_len);
+ return ntfs_read_compressed_block(page);
+ }
+ }
+ /* NInoNonResident() == NInoIndexAllocPresent() */
+ if (NInoNonResident(ni)) {
+ /* Normal, non-resident data stream. */
+ return ntfs_read_block(page);
+ }
+ /*
+ * Attribute is resident, implying it is not compressed or encrypted.
+ * This also means the attribute is smaller than an mft record and
+ * hence smaller than a page, so can simply zero out any pages with
+ * index above 0. Note the attribute can actually be marked compressed
+ * but if it is resident the actual data is not compressed so we are
+ * ok to ignore the compressed flag here.
+ */
+ if (unlikely(page->index > 0)) {
+ zero_user(page, 0, PAGE_SIZE);
+ goto done;
+ }
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ /* Map, pin, and lock the mft record. */
+ mrec = map_mft_record(base_ni);
+ if (IS_ERR(mrec)) {
+ err = PTR_ERR(mrec);
+ goto err_out;
+ }
+ /*
+ * If a parallel write made the attribute non-resident, drop the mft
+ * record and retry the read_folio.
+ */
+ if (unlikely(NInoNonResident(ni))) {
+ unmap_mft_record(base_ni);
+ goto retry_readpage;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto unm_err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err))
+ goto put_unm_err_out;
+ attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
+ read_lock_irqsave(&ni->size_lock, flags);
+ if (unlikely(attr_len > ni->initialized_size))
+ attr_len = ni->initialized_size;
+ i_size = i_size_read(vi);
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (unlikely(attr_len > i_size)) {
+ /* Race with shrinking truncate. */
+ attr_len = i_size;
+ }
+ addr = kmap_atomic(page);
+ /* Copy the data to the page. */
+ memcpy(addr, (u8*)ctx->attr +
+ le16_to_cpu(ctx->attr->data.resident.value_offset),
+ attr_len);
+ /* Zero the remainder of the page. */
+ memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
+ flush_dcache_page(page);
+ kunmap_atomic(addr);
+put_unm_err_out:
+ ntfs_attr_put_search_ctx(ctx);
+unm_err_out:
+ unmap_mft_record(base_ni);
+done:
+ SetPageUptodate(page);
+err_out:
+ unlock_page(page);
+ return err;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_write_block - write a @page to the backing store
+ * @page: page cache page to write out
+ * @wbc: writeback control structure
+ *
+ * This function is for writing pages belonging to non-resident, non-mst
+ * protected attributes to their backing store.
+ *
+ * For a page with buffers, map and write the dirty buffers asynchronously
+ * under page writeback. For a page without buffers, create buffers for the
+ * page, then proceed as above.
+ *
+ * If a page doesn't have buffers the page dirty state is definitive. If a page
+ * does have buffers, the page dirty state is just a hint, and the buffer dirty
+ * state is definitive. (A hint which has rules: dirty buffers against a clean
+ * page is illegal. Other combinations are legal and need to be handled. In
+ * particular a dirty page containing clean buffers for example.)
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Based on ntfs_read_block() and __block_write_full_page().
+ */
+static int ntfs_write_block(struct page *page, struct writeback_control *wbc)
+{
+ VCN vcn;
+ LCN lcn;
+ s64 initialized_size;
+ loff_t i_size;
+ sector_t block, dblock, iblock;
+ struct inode *vi;
+ ntfs_inode *ni;
+ ntfs_volume *vol;
+ runlist_element *rl;
+ struct buffer_head *bh, *head;
+ unsigned long flags;
+ unsigned int blocksize, vcn_ofs;
+ int err;
+ bool need_end_writeback;
+ unsigned char blocksize_bits;
+
+ vi = page->mapping->host;
+ ni = NTFS_I(vi);
+ vol = ni->vol;
+
+ ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+ "0x%lx.", ni->mft_no, ni->type, page->index);
+
+ BUG_ON(!NInoNonResident(ni));
+ BUG_ON(NInoMstProtected(ni));
+ blocksize = vol->sb->s_blocksize;
+ blocksize_bits = vol->sb->s_blocksize_bits;
+ if (!page_has_buffers(page)) {
+ BUG_ON(!PageUptodate(page));
+ create_empty_buffers(page, blocksize,
+ (1 << BH_Uptodate) | (1 << BH_Dirty));
+ if (unlikely(!page_has_buffers(page))) {
+ ntfs_warning(vol->sb, "Error allocating page "
+ "buffers. Redirtying page so we try "
+ "again later.");
+ /*
+ * Put the page back on mapping->dirty_pages, but leave
+ * its buffers' dirty state as-is.
+ */
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+ }
+ bh = head = page_buffers(page);
+ BUG_ON(!bh);
+
+ /* NOTE: Different naming scheme to ntfs_read_block()! */
+
+ /* The first block in the page. */
+ block = (s64)page->index << (PAGE_SHIFT - blocksize_bits);
+
+ read_lock_irqsave(&ni->size_lock, flags);
+ i_size = i_size_read(vi);
+ initialized_size = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+
+ /* The first out of bounds block for the data size. */
+ dblock = (i_size + blocksize - 1) >> blocksize_bits;
+
+ /* The last (fully or partially) initialized block. */
+ iblock = initialized_size >> blocksize_bits;
+
+ /*
+ * Be very careful. We have no exclusion from block_dirty_folio
+ * here, and the (potentially unmapped) buffers may become dirty at
+ * any time. If a buffer becomes dirty here after we've inspected it
+ * then we just miss that fact, and the page stays dirty.
+ *
+ * Buffers outside i_size may be dirtied by block_dirty_folio;
+ * handle that here by just cleaning them.
+ */
+
+ /*
+ * Loop through all the buffers in the page, mapping all the dirty
+ * buffers to disk addresses and handling any aliases from the
+ * underlying block device's mapping.
+ */
+ rl = NULL;
+ err = 0;
+ do {
+ bool is_retry = false;
+
+ if (unlikely(block >= dblock)) {
+ /*
+ * Mapped buffers outside i_size will occur, because
+ * this page can be outside i_size when there is a
+ * truncate in progress. The contents of such buffers
+ * were zeroed by ntfs_writepage().
+ *
+ * FIXME: What about the small race window where
+ * ntfs_writepage() has not done any clearing because
+ * the page was within i_size but before we get here,
+ * vmtruncate() modifies i_size?
+ */
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ continue;
+ }
+
+ /* Clean buffers are not written out, so no need to map them. */
+ if (!buffer_dirty(bh))
+ continue;
+
+ /* Make sure we have enough initialized size. */
+ if (unlikely((block >= iblock) &&
+ (initialized_size < i_size))) {
+ /*
+ * If this page is fully outside initialized
+ * size, zero out all pages between the current
+ * initialized size and the current page. Just
+ * use ntfs_read_folio() to do the zeroing
+ * transparently.
+ */
+ if (block > iblock) {
+ // TODO:
+ // For each page do:
+ // - read_cache_page()
+ // Again for each page do:
+ // - wait_on_page_locked()
+ // - Check (PageUptodate(page) &&
+ // !PageError(page))
+ // Update initialized size in the attribute and
+ // in the inode.
+ // Again, for each page do:
+ // block_dirty_folio();
+ // put_page()
+ // We don't need to wait on the writes.
+ // Update iblock.
+ }
+ /*
+ * The current page straddles initialized size. Zero
+ * all non-uptodate buffers and set them uptodate (and
+ * dirty?). Note, there aren't any non-uptodate buffers
+ * if the page is uptodate.
+ * FIXME: For an uptodate page, the buffers may need to
+ * be written out because they were not initialized on
+ * disk before.
+ */
+ if (!PageUptodate(page)) {
+ // TODO:
+ // Zero any non-uptodate buffers up to i_size.
+ // Set them uptodate and dirty.
+ }
+ // TODO:
+ // Update initialized size in the attribute and in the
+ // inode (up to i_size).
+ // Update iblock.
+ // FIXME: This is inefficient. Try to batch the two
+ // size changes to happen in one go.
+ ntfs_error(vol->sb, "Writing beyond initialized size "
+ "is not supported yet. Sorry.");
+ err = -EOPNOTSUPP;
+ break;
+ // Do NOT set_buffer_new() BUT DO clear buffer range
+ // outside write request range.
+ // set_buffer_uptodate() on complete buffers as well as
+ // set_buffer_dirty().
+ }
+
+ /* No need to map buffers that are already mapped. */
+ if (buffer_mapped(bh))
+ continue;
+
+ /* Unmapped, dirty buffer. Need to map it. */
+ bh->b_bdev = vol->sb->s_bdev;
+
+ /* Convert block into corresponding vcn and offset. */
+ vcn = (VCN)block << blocksize_bits;
+ vcn_ofs = vcn & vol->cluster_size_mask;
+ vcn >>= vol->cluster_size_bits;
+ if (!rl) {
+lock_retry_remap:
+ down_read(&ni->runlist.lock);
+ rl = ni->runlist.rl;
+ }
+ if (likely(rl != NULL)) {
+ /* Seek to element containing target vcn. */
+ while (rl->length && rl[1].vcn <= vcn)
+ rl++;
+ lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+ } else
+ lcn = LCN_RL_NOT_MAPPED;
+ /* Successful remap. */
+ if (lcn >= 0) {
+ /* Setup buffer head to point to correct block. */
+ bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
+ vcn_ofs) >> blocksize_bits;
+ set_buffer_mapped(bh);
+ continue;
+ }
+ /* It is a hole, need to instantiate it. */
+ if (lcn == LCN_HOLE) {
+ u8 *kaddr;
+ unsigned long *bpos, *bend;
+
+ /* Check if the buffer is zero. */
+ kaddr = kmap_atomic(page);
+ bpos = (unsigned long *)(kaddr + bh_offset(bh));
+ bend = (unsigned long *)((u8*)bpos + blocksize);
+ do {
+ if (unlikely(*bpos))
+ break;
+ } while (likely(++bpos < bend));
+ kunmap_atomic(kaddr);
+ if (bpos == bend) {
+ /*
+ * Buffer is zero and sparse, no need to write
+ * it.
+ */
+ bh->b_blocknr = -1;
+ clear_buffer_dirty(bh);
+ continue;
+ }
+ // TODO: Instantiate the hole.
+ // clear_buffer_new(bh);
+ // clean_bdev_bh_alias(bh);
+ ntfs_error(vol->sb, "Writing into sparse regions is "
+ "not supported yet. Sorry.");
+ err = -EOPNOTSUPP;
+ break;
+ }
+ /* If first try and runlist unmapped, map and retry. */
+ if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
+ is_retry = true;
+ /*
+ * Attempt to map runlist, dropping lock for
+ * the duration.
+ */
+ up_read(&ni->runlist.lock);
+ err = ntfs_map_runlist(ni, vcn);
+ if (likely(!err))
+ goto lock_retry_remap;
+ rl = NULL;
+ } else if (!rl)
+ up_read(&ni->runlist.lock);
+ /*
+ * If buffer is outside the runlist, truncate has cut it out
+ * of the runlist. Just clean and clear the buffer and set it
+ * uptodate so it can get discarded by the VM.
+ */
+ if (err == -ENOENT || lcn == LCN_ENOENT) {
+ bh->b_blocknr = -1;
+ clear_buffer_dirty(bh);
+ zero_user(page, bh_offset(bh), blocksize);
+ set_buffer_uptodate(bh);
+ err = 0;
+ continue;
+ }
+ /* Failed to map the buffer, even after retrying. */
+ if (!err)
+ err = -EIO;
+ bh->b_blocknr = -1;
+ ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
+ "attribute type 0x%x, vcn 0x%llx, offset 0x%x "
+ "because its location on disk could not be "
+ "determined%s (error code %i).", ni->mft_no,
+ ni->type, (unsigned long long)vcn,
+ vcn_ofs, is_retry ? " even after "
+ "retrying" : "", err);
+ break;
+ } while (block++, (bh = bh->b_this_page) != head);
+
+ /* Release the lock if we took it. */
+ if (rl)
+ up_read(&ni->runlist.lock);
+
+ /* For the error case, need to reset bh to the beginning. */
+ bh = head;
+
+ /* Just an optimization, so ->read_folio() is not called later. */
+ if (unlikely(!PageUptodate(page))) {
+ int uptodate = 1;
+ do {
+ if (!buffer_uptodate(bh)) {
+ uptodate = 0;
+ bh = head;
+ break;
+ }
+ } while ((bh = bh->b_this_page) != head);
+ if (uptodate)
+ SetPageUptodate(page);
+ }
+
+ /* Setup all mapped, dirty buffers for async write i/o. */
+ do {
+ if (buffer_mapped(bh) && buffer_dirty(bh)) {
+ lock_buffer(bh);
+ if (test_clear_buffer_dirty(bh)) {
+ BUG_ON(!buffer_uptodate(bh));
+ mark_buffer_async_write(bh);
+ } else
+ unlock_buffer(bh);
+ } else if (unlikely(err)) {
+ /*
+ * For the error case. The buffer may have been set
+ * dirty during attachment to a dirty page.
+ */
+ if (err != -ENOMEM)
+ clear_buffer_dirty(bh);
+ }
+ } while ((bh = bh->b_this_page) != head);
+
+ if (unlikely(err)) {
+ // TODO: Remove the -EOPNOTSUPP check later on...
+ if (unlikely(err == -EOPNOTSUPP))
+ err = 0;
+ else if (err == -ENOMEM) {
+ ntfs_warning(vol->sb, "Error allocating memory. "
+ "Redirtying page so we try again "
+ "later.");
+ /*
+ * Put the page back on mapping->dirty_pages, but
+ * leave its buffer's dirty state as-is.
+ */
+ redirty_page_for_writepage(wbc, page);
+ err = 0;
+ } else
+ SetPageError(page);
+ }
+
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page); /* Keeps try_to_free_buffers() away. */
+
+ /* Submit the prepared buffers for i/o. */
+ need_end_writeback = true;
+ do {
+ struct buffer_head *next = bh->b_this_page;
+ if (buffer_async_write(bh)) {
+ submit_bh(REQ_OP_WRITE, bh);
+ need_end_writeback = false;
+ }
+ bh = next;
+ } while (bh != head);
+ unlock_page(page);
+
+ /* If no i/o was started, need to end_page_writeback(). */
+ if (unlikely(need_end_writeback))
+ end_page_writeback(page);
+
+ ntfs_debug("Done.");
+ return err;
+}
+
+/**
+ * ntfs_write_mst_block - write a @page to the backing store
+ * @page: page cache page to write out
+ * @wbc: writeback control structure
+ *
+ * This function is for writing pages belonging to non-resident, mst protected
+ * attributes to their backing store. The only supported attributes are index
+ * allocation and $MFT/$DATA. Both directory inodes and index inodes are
+ * supported for the index allocation case.
+ *
+ * The page must remain locked for the duration of the write because we apply
+ * the mst fixups, write, and then undo the fixups, so if we were to unlock the
+ * page before undoing the fixups, any other user of the page will see the
+ * page contents as corrupt.
+ *
+ * We clear the page uptodate flag for the duration of the function to ensure
+ * exclusion for the $MFT/$DATA case against someone mapping an mft record we
+ * are about to apply the mst fixups to.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Based on ntfs_write_block(), ntfs_mft_writepage(), and
+ * write_mft_record_nolock().
+ */
+static int ntfs_write_mst_block(struct page *page,
+ struct writeback_control *wbc)
+{
+ sector_t block, dblock, rec_block;
+ struct inode *vi = page->mapping->host;
+ ntfs_inode *ni = NTFS_I(vi);
+ ntfs_volume *vol = ni->vol;
+ u8 *kaddr;
+ unsigned int rec_size = ni->itype.index.block_size;
+ ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
+ struct buffer_head *bh, *head, *tbh, *rec_start_bh;
+ struct buffer_head *bhs[MAX_BUF_PER_PAGE];
+ runlist_element *rl;
+ int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
+ unsigned bh_size, rec_size_bits;
+ bool sync, is_mft, page_is_dirty, rec_is_dirty;
+ unsigned char bh_size_bits;
+
+ if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
+ return -EINVAL;
+
+ ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
+ "0x%lx.", vi->i_ino, ni->type, page->index);
+ BUG_ON(!NInoNonResident(ni));
+ BUG_ON(!NInoMstProtected(ni));
+ is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
+ /*
+ * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
+ * in its page cache were to be marked dirty. However this should
+ * never happen with the current driver and considering we do not
+ * handle this case here we do want to BUG(), at least for now.
+ */
+ BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
+ (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
+ bh_size = vol->sb->s_blocksize;
+ bh_size_bits = vol->sb->s_blocksize_bits;
+ max_bhs = PAGE_SIZE / bh_size;
+ BUG_ON(!max_bhs);
+ BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
+
+ /* Were we called for sync purposes? */
+ sync = (wbc->sync_mode == WB_SYNC_ALL);
+
+ /* Make sure we have mapped buffers. */
+ bh = head = page_buffers(page);
+ BUG_ON(!bh);
+
+ rec_size_bits = ni->itype.index.block_size_bits;
+ BUG_ON(!(PAGE_SIZE >> rec_size_bits));
+ bhs_per_rec = rec_size >> bh_size_bits;
+ BUG_ON(!bhs_per_rec);
+
+ /* The first block in the page. */
+ rec_block = block = (sector_t)page->index <<
+ (PAGE_SHIFT - bh_size_bits);
+
+ /* The first out of bounds block for the data size. */
+ dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
+
+ rl = NULL;
+ err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
+ page_is_dirty = rec_is_dirty = false;
+ rec_start_bh = NULL;
+ do {
+ bool is_retry = false;
+
+ if (likely(block < rec_block)) {
+ if (unlikely(block >= dblock)) {
+ clear_buffer_dirty(bh);
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ /*
+ * This block is not the first one in the record. We
+ * ignore the buffer's dirty state because we could
+ * have raced with a parallel mark_ntfs_record_dirty().
+ */
+ if (!rec_is_dirty)
+ continue;
+ if (unlikely(err2)) {
+ if (err2 != -ENOMEM)
+ clear_buffer_dirty(bh);
+ continue;
+ }
+ } else /* if (block == rec_block) */ {
+ BUG_ON(block > rec_block);
+ /* This block is the first one in the record. */
+ rec_block += bhs_per_rec;
+ err2 = 0;
+ if (unlikely(block >= dblock)) {
+ clear_buffer_dirty(bh);
+ continue;
+ }
+ if (!buffer_dirty(bh)) {
+ /* Clean records are not written out. */
+ rec_is_dirty = false;
+ continue;
+ }
+ rec_is_dirty = true;
+ rec_start_bh = bh;
+ }
+ /* Need to map the buffer if it is not mapped already. */
+ if (unlikely(!buffer_mapped(bh))) {
+ VCN vcn;
+ LCN lcn;
+ unsigned int vcn_ofs;
+
+ bh->b_bdev = vol->sb->s_bdev;
+ /* Obtain the vcn and offset of the current block. */
+ vcn = (VCN)block << bh_size_bits;
+ vcn_ofs = vcn & vol->cluster_size_mask;
+ vcn >>= vol->cluster_size_bits;
+ if (!rl) {
+lock_retry_remap:
+ down_read(&ni->runlist.lock);
+ rl = ni->runlist.rl;
+ }
+ if (likely(rl != NULL)) {
+ /* Seek to element containing target vcn. */
+ while (rl->length && rl[1].vcn <= vcn)
+ rl++;
+ lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+ } else
+ lcn = LCN_RL_NOT_MAPPED;
+ /* Successful remap. */
+ if (likely(lcn >= 0)) {
+ /* Setup buffer head to correct block. */
+ bh->b_blocknr = ((lcn <<
+ vol->cluster_size_bits) +
+ vcn_ofs) >> bh_size_bits;
+ set_buffer_mapped(bh);
+ } else {
+ /*
+ * Remap failed. Retry to map the runlist once
+ * unless we are working on $MFT which always
+ * has the whole of its runlist in memory.
+ */
+ if (!is_mft && !is_retry &&
+ lcn == LCN_RL_NOT_MAPPED) {
+ is_retry = true;
+ /*
+ * Attempt to map runlist, dropping
+ * lock for the duration.
+ */
+ up_read(&ni->runlist.lock);
+ err2 = ntfs_map_runlist(ni, vcn);
+ if (likely(!err2))
+ goto lock_retry_remap;
+ if (err2 == -ENOMEM)
+ page_is_dirty = true;
+ lcn = err2;
+ } else {
+ err2 = -EIO;
+ if (!rl)
+ up_read(&ni->runlist.lock);
+ }
+ /* Hard error. Abort writing this record. */
+ if (!err || err == -ENOMEM)
+ err = err2;
+ bh->b_blocknr = -1;
+ ntfs_error(vol->sb, "Cannot write ntfs record "
+ "0x%llx (inode 0x%lx, "
+ "attribute type 0x%x) because "
+ "its location on disk could "
+ "not be determined (error "
+ "code %lli).",
+ (long long)block <<
+ bh_size_bits >>
+ vol->mft_record_size_bits,
+ ni->mft_no, ni->type,
+ (long long)lcn);
+ /*
+ * If this is not the first buffer, remove the
+ * buffers in this record from the list of
+ * buffers to write and clear their dirty bit
+ * if not error -ENOMEM.
+ */
+ if (rec_start_bh != bh) {
+ while (bhs[--nr_bhs] != rec_start_bh)
+ ;
+ if (err2 != -ENOMEM) {
+ do {
+ clear_buffer_dirty(
+ rec_start_bh);
+ } while ((rec_start_bh =
+ rec_start_bh->
+ b_this_page) !=
+ bh);
+ }
+ }
+ continue;
+ }
+ }
+ BUG_ON(!buffer_uptodate(bh));
+ BUG_ON(nr_bhs >= max_bhs);
+ bhs[nr_bhs++] = bh;
+ } while (block++, (bh = bh->b_this_page) != head);
+ if (unlikely(rl))
+ up_read(&ni->runlist.lock);
+ /* If there were no dirty buffers, we are done. */
+ if (!nr_bhs)
+ goto done;
+ /* Map the page so we can access its contents. */
+ kaddr = kmap(page);
+ /* Clear the page uptodate flag whilst the mst fixups are applied. */
+ BUG_ON(!PageUptodate(page));
+ ClearPageUptodate(page);
+ for (i = 0; i < nr_bhs; i++) {
+ unsigned int ofs;
+
+ /* Skip buffers which are not at the beginning of records. */
+ if (i % bhs_per_rec)
+ continue;
+ tbh = bhs[i];
+ ofs = bh_offset(tbh);
+ if (is_mft) {
+ ntfs_inode *tni;
+ unsigned long mft_no;
+
+ /* Get the mft record number. */
+ mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
+ >> rec_size_bits;
+ /* Check whether to write this mft record. */
+ tni = NULL;
+ if (!ntfs_may_write_mft_record(vol, mft_no,
+ (MFT_RECORD*)(kaddr + ofs), &tni)) {
+ /*
+ * The record should not be written. This
+ * means we need to redirty the page before
+ * returning.
+ */
+ page_is_dirty = true;
+ /*
+ * Remove the buffers in this mft record from
+ * the list of buffers to write.
+ */
+ do {
+ bhs[i] = NULL;
+ } while (++i % bhs_per_rec);
+ continue;
+ }
+ /*
+ * The record should be written. If a locked ntfs
+ * inode was returned, add it to the array of locked
+ * ntfs inodes.
+ */
+ if (tni)
+ locked_nis[nr_locked_nis++] = tni;
+ }
+ /* Apply the mst protection fixups. */
+ err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
+ rec_size);
+ if (unlikely(err2)) {
+ if (!err || err == -ENOMEM)
+ err = -EIO;
+ ntfs_error(vol->sb, "Failed to apply mst fixups "
+ "(inode 0x%lx, attribute type 0x%x, "
+ "page index 0x%lx, page offset 0x%x)!"
+ " Unmount and run chkdsk.", vi->i_ino,
+ ni->type, page->index, ofs);
+ /*
+ * Mark all the buffers in this record clean as we do
+ * not want to write corrupt data to disk.
+ */
+ do {
+ clear_buffer_dirty(bhs[i]);
+ bhs[i] = NULL;
+ } while (++i % bhs_per_rec);
+ continue;
+ }
+ nr_recs++;
+ }
+ /* If no records are to be written out, we are done. */
+ if (!nr_recs)
+ goto unm_done;
+ flush_dcache_page(page);
+ /* Lock buffers and start synchronous write i/o on them. */
+ for (i = 0; i < nr_bhs; i++) {
+ tbh = bhs[i];
+ if (!tbh)
+ continue;
+ if (!trylock_buffer(tbh))
+ BUG();
+ /* The buffer dirty state is now irrelevant, just clean it. */
+ clear_buffer_dirty(tbh);
+ BUG_ON(!buffer_uptodate(tbh));
+ BUG_ON(!buffer_mapped(tbh));
+ get_bh(tbh);
+ tbh->b_end_io = end_buffer_write_sync;
+ submit_bh(REQ_OP_WRITE, tbh);
+ }
+ /* Synchronize the mft mirror now if not @sync. */
+ if (is_mft && !sync)
+ goto do_mirror;
+do_wait:
+ /* Wait on i/o completion of buffers. */
+ for (i = 0; i < nr_bhs; i++) {
+ tbh = bhs[i];
+ if (!tbh)
+ continue;
+ wait_on_buffer(tbh);
+ if (unlikely(!buffer_uptodate(tbh))) {
+ ntfs_error(vol->sb, "I/O error while writing ntfs "
+ "record buffer (inode 0x%lx, "
+ "attribute type 0x%x, page index "
+ "0x%lx, page offset 0x%lx)! Unmount "
+ "and run chkdsk.", vi->i_ino, ni->type,
+ page->index, bh_offset(tbh));
+ if (!err || err == -ENOMEM)
+ err = -EIO;
+ /*
+ * Set the buffer uptodate so the page and buffer
+ * states do not become out of sync.
+ */
+ set_buffer_uptodate(tbh);
+ }
+ }
+ /* If @sync, now synchronize the mft mirror. */
+ if (is_mft && sync) {
+do_mirror:
+ for (i = 0; i < nr_bhs; i++) {
+ unsigned long mft_no;
+ unsigned int ofs;
+
+ /*
+ * Skip buffers which are not at the beginning of
+ * records.
+ */
+ if (i % bhs_per_rec)
+ continue;
+ tbh = bhs[i];
+ /* Skip removed buffers (and hence records). */
+ if (!tbh)
+ continue;
+ ofs = bh_offset(tbh);
+ /* Get the mft record number. */
+ mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
+ >> rec_size_bits;
+ if (mft_no < vol->mftmirr_size)
+ ntfs_sync_mft_mirror(vol, mft_no,
+ (MFT_RECORD*)(kaddr + ofs),
+ sync);
+ }
+ if (!sync)
+ goto do_wait;
+ }
+ /* Remove the mst protection fixups again. */
+ for (i = 0; i < nr_bhs; i++) {
+ if (!(i % bhs_per_rec)) {
+ tbh = bhs[i];
+ if (!tbh)
+ continue;
+ post_write_mst_fixup((NTFS_RECORD*)(kaddr +
+ bh_offset(tbh)));
+ }
+ }
+ flush_dcache_page(page);
+unm_done:
+ /* Unlock any locked inodes. */
+ while (nr_locked_nis-- > 0) {
+ ntfs_inode *tni, *base_tni;
+
+ tni = locked_nis[nr_locked_nis];
+ /* Get the base inode. */
+ mutex_lock(&tni->extent_lock);
+ if (tni->nr_extents >= 0)
+ base_tni = tni;
+ else {
+ base_tni = tni->ext.base_ntfs_ino;
+ BUG_ON(!base_tni);
+ }
+ mutex_unlock(&tni->extent_lock);
+ ntfs_debug("Unlocking %s inode 0x%lx.",
+ tni == base_tni ? "base" : "extent",
+ tni->mft_no);
+ mutex_unlock(&tni->mrec_lock);
+ atomic_dec(&tni->count);
+ iput(VFS_I(base_tni));
+ }
+ SetPageUptodate(page);
+ kunmap(page);
+done:
+ if (unlikely(err && err != -ENOMEM)) {
+ /*
+ * Set page error if there is only one ntfs record in the page.
+ * Otherwise we would loose per-record granularity.
+ */
+ if (ni->itype.index.block_size == PAGE_SIZE)
+ SetPageError(page);
+ NVolSetErrors(vol);
+ }
+ if (page_is_dirty) {
+ ntfs_debug("Page still contains one or more dirty ntfs "
+ "records. Redirtying the page starting at "
+ "record 0x%lx.", page->index <<
+ (PAGE_SHIFT - rec_size_bits));
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ } else {
+ /*
+ * Keep the VM happy. This must be done otherwise the
+ * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
+ * the page is clean.
+ */
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ }
+ if (likely(!err))
+ ntfs_debug("Done.");
+ return err;
+}
+
+/**
+ * ntfs_writepage - write a @page to the backing store
+ * @page: page cache page to write out
+ * @wbc: writeback control structure
+ *
+ * This is called from the VM when it wants to have a dirty ntfs page cache
+ * page cleaned. The VM has already locked the page and marked it clean.
+ *
+ * For non-resident attributes, ntfs_writepage() writes the @page by calling
+ * the ntfs version of the generic block_write_full_page() function,
+ * ntfs_write_block(), which in turn if necessary creates and writes the
+ * buffers associated with the page asynchronously.
+ *
+ * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
+ * the data to the mft record (which at this stage is most likely in memory).
+ * The mft record is then marked dirty and written out asynchronously via the
+ * vfs inode dirty code path for the inode the mft record belongs to or via the
+ * vm page dirty code path for the page the mft record is in.
+ *
+ * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page().
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ loff_t i_size;
+ struct inode *vi = page->mapping->host;
+ ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
+ char *addr;
+ ntfs_attr_search_ctx *ctx = NULL;
+ MFT_RECORD *m = NULL;
+ u32 attr_len;
+ int err;
+
+retry_writepage:
+ BUG_ON(!PageLocked(page));
+ i_size = i_size_read(vi);
+ /* Is the page fully outside i_size? (truncate in progress) */
+ if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
+ PAGE_SHIFT)) {
+ struct folio *folio = page_folio(page);
+ /*
+ * The page may have dirty, unmapped buffers. Make them
+ * freeable here, so the page does not leak.
+ */
+ block_invalidate_folio(folio, 0, folio_size(folio));
+ folio_unlock(folio);
+ ntfs_debug("Write outside i_size - truncated?");
+ return 0;
+ }
+ /*
+ * Only $DATA attributes can be encrypted and only unnamed $DATA
+ * attributes can be compressed. Index root can have the flags set but
+ * this means to create compressed/encrypted files, not that the
+ * attribute is compressed/encrypted. Note we need to check for
+ * AT_INDEX_ALLOCATION since this is the type of both directory and
+ * index inodes.
+ */
+ if (ni->type != AT_INDEX_ALLOCATION) {
+ /* If file is encrypted, deny access, just like NT4. */
+ if (NInoEncrypted(ni)) {
+ unlock_page(page);
+ BUG_ON(ni->type != AT_DATA);
+ ntfs_debug("Denying write access to encrypted file.");
+ return -EACCES;
+ }
+ /* Compressed data streams are handled in compress.c. */
+ if (NInoNonResident(ni) && NInoCompressed(ni)) {
+ BUG_ON(ni->type != AT_DATA);
+ BUG_ON(ni->name_len);
+ // TODO: Implement and replace this with
+ // return ntfs_write_compressed_block(page);
+ unlock_page(page);
+ ntfs_error(vi->i_sb, "Writing to compressed files is "
+ "not supported yet. Sorry.");
+ return -EOPNOTSUPP;
+ }
+ // TODO: Implement and remove this check.
+ if (NInoNonResident(ni) && NInoSparse(ni)) {
+ unlock_page(page);
+ ntfs_error(vi->i_sb, "Writing to sparse files is not "
+ "supported yet. Sorry.");
+ return -EOPNOTSUPP;
+ }
+ }
+ /* NInoNonResident() == NInoIndexAllocPresent() */
+ if (NInoNonResident(ni)) {
+ /* We have to zero every time due to mmap-at-end-of-file. */
+ if (page->index >= (i_size >> PAGE_SHIFT)) {
+ /* The page straddles i_size. */
+ unsigned int ofs = i_size & ~PAGE_MASK;
+ zero_user_segment(page, ofs, PAGE_SIZE);
+ }
+ /* Handle mst protected attributes. */
+ if (NInoMstProtected(ni))
+ return ntfs_write_mst_block(page, wbc);
+ /* Normal, non-resident data stream. */
+ return ntfs_write_block(page, wbc);
+ }
+ /*
+ * Attribute is resident, implying it is not compressed, encrypted, or
+ * mst protected. This also means the attribute is smaller than an mft
+ * record and hence smaller than a page, so can simply return error on
+ * any pages with index above 0. Note the attribute can actually be
+ * marked compressed but if it is resident the actual data is not
+ * compressed so we are ok to ignore the compressed flag here.
+ */
+ BUG_ON(page_has_buffers(page));
+ BUG_ON(!PageUptodate(page));
+ if (unlikely(page->index > 0)) {
+ ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. "
+ "Aborting write.", page->index);
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+ end_page_writeback(page);
+ return -EIO;
+ }
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ /* Map, pin, and lock the mft record. */
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ ctx = NULL;
+ goto err_out;
+ }
+ /*
+ * If a parallel write made the attribute non-resident, drop the mft
+ * record and retry the writepage.
+ */
+ if (unlikely(NInoNonResident(ni))) {
+ unmap_mft_record(base_ni);
+ goto retry_writepage;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err))
+ goto err_out;
+ /*
+ * Keep the VM happy. This must be done otherwise the radix-tree tag
+ * PAGECACHE_TAG_DIRTY remains set even though the page is clean.
+ */
+ BUG_ON(PageWriteback(page));
+ set_page_writeback(page);
+ unlock_page(page);
+ attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
+ i_size = i_size_read(vi);
+ if (unlikely(attr_len > i_size)) {
+ /* Race with shrinking truncate or a failed truncate. */
+ attr_len = i_size;
+ /*
+ * If the truncate failed, fix it up now. If a concurrent
+ * truncate, we do its job, so it does not have to do anything.
+ */
+ err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
+ attr_len);
+ /* Shrinking cannot fail. */
+ BUG_ON(err);
+ }
+ addr = kmap_atomic(page);
+ /* Copy the data from the page to the mft record. */
+ memcpy((u8*)ctx->attr +
+ le16_to_cpu(ctx->attr->data.resident.value_offset),
+ addr, attr_len);
+ /* Zero out of bounds area in the page cache page. */
+ memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
+ kunmap_atomic(addr);
+ flush_dcache_page(page);
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ /* We are done with the page. */
+ end_page_writeback(page);
+ /* Finally, mark the mft record dirty, so it gets written back. */
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ return 0;
+err_out:
+ if (err == -ENOMEM) {
+ ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
+ "page so we try again later.");
+ /*
+ * Put the page back on mapping->dirty_pages, but leave its
+ * buffers' dirty state as-is.
+ */
+ redirty_page_for_writepage(wbc, page);
+ err = 0;
+ } else {
+ ntfs_error(vi->i_sb, "Resident attribute write failed with "
+ "error %i.", err);
+ SetPageError(page);
+ NVolSetErrors(ni->vol);
+ }
+ unlock_page(page);
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ return err;
+}
+
+#endif /* NTFS_RW */
+
+/**
+ * ntfs_bmap - map logical file block to physical device block
+ * @mapping: address space mapping to which the block to be mapped belongs
+ * @block: logical block to map to its physical device block
+ *
+ * For regular, non-resident files (i.e. not compressed and not encrypted), map
+ * the logical @block belonging to the file described by the address space
+ * mapping @mapping to its physical device block.
+ *
+ * The size of the block is equal to the @s_blocksize field of the super block
+ * of the mounted file system which is guaranteed to be smaller than or equal
+ * to the cluster size thus the block is guaranteed to fit entirely inside the
+ * cluster which means we do not need to care how many contiguous bytes are
+ * available after the beginning of the block.
+ *
+ * Return the physical device block if the mapping succeeded or 0 if the block
+ * is sparse or there was an error.
+ *
+ * Note: This is a problem if someone tries to run bmap() on $Boot system file
+ * as that really is in block zero but there is nothing we can do. bmap() is
+ * just broken in that respect (just like it cannot distinguish sparse from
+ * not available or error).
+ */
+static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
+{
+ s64 ofs, size;
+ loff_t i_size;
+ LCN lcn;
+ unsigned long blocksize, flags;
+ ntfs_inode *ni = NTFS_I(mapping->host);
+ ntfs_volume *vol = ni->vol;
+ unsigned delta;
+ unsigned char blocksize_bits, cluster_size_shift;
+
+ ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
+ ni->mft_no, (unsigned long long)block);
+ if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
+ ntfs_error(vol->sb, "BMAP does not make sense for %s "
+ "attributes, returning 0.",
+ (ni->type != AT_DATA) ? "non-data" :
+ (!NInoNonResident(ni) ? "resident" :
+ "encrypted"));
+ return 0;
+ }
+ /* None of these can happen. */
+ BUG_ON(NInoCompressed(ni));
+ BUG_ON(NInoMstProtected(ni));
+ blocksize = vol->sb->s_blocksize;
+ blocksize_bits = vol->sb->s_blocksize_bits;
+ ofs = (s64)block << blocksize_bits;
+ read_lock_irqsave(&ni->size_lock, flags);
+ size = ni->initialized_size;
+ i_size = i_size_read(VFS_I(ni));
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /*
+ * If the offset is outside the initialized size or the block straddles
+ * the initialized size then pretend it is a hole unless the
+ * initialized size equals the file size.
+ */
+ if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
+ goto hole;
+ cluster_size_shift = vol->cluster_size_bits;
+ down_read(&ni->runlist.lock);
+ lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
+ up_read(&ni->runlist.lock);
+ if (unlikely(lcn < LCN_HOLE)) {
+ /*
+ * Step down to an integer to avoid gcc doing a long long
+ * comparision in the switch when we know @lcn is between
+ * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
+ *
+ * Otherwise older gcc (at least on some architectures) will
+ * try to use __cmpdi2() which is of course not available in
+ * the kernel.
+ */
+ switch ((int)lcn) {
+ case LCN_ENOENT:
+ /*
+ * If the offset is out of bounds then pretend it is a
+ * hole.
+ */
+ goto hole;
+ case LCN_ENOMEM:
+ ntfs_error(vol->sb, "Not enough memory to complete "
+ "mapping for inode 0x%lx. "
+ "Returning 0.", ni->mft_no);
+ break;
+ default:
+ ntfs_error(vol->sb, "Failed to complete mapping for "
+ "inode 0x%lx. Run chkdsk. "
+ "Returning 0.", ni->mft_no);
+ break;
+ }
+ return 0;
+ }
+ if (lcn < 0) {
+ /* It is a hole. */
+hole:
+ ntfs_debug("Done (returning hole).");
+ return 0;
+ }
+ /*
+ * The block is really allocated and fullfils all our criteria.
+ * Convert the cluster to units of block size and return the result.
+ */
+ delta = ofs & vol->cluster_size_mask;
+ if (unlikely(sizeof(block) < sizeof(lcn))) {
+ block = lcn = ((lcn << cluster_size_shift) + delta) >>
+ blocksize_bits;
+ /* If the block number was truncated return 0. */
+ if (unlikely(block != lcn)) {
+ ntfs_error(vol->sb, "Physical block 0x%llx is too "
+ "large to be returned, returning 0.",
+ (long long)lcn);
+ return 0;
+ }
+ } else
+ block = ((lcn << cluster_size_shift) + delta) >>
+ blocksize_bits;
+ ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn);
+ return block;
+}
+
+/**
+ * ntfs_normal_aops - address space operations for normal inodes and attributes
+ *
+ * Note these are not used for compressed or mst protected inodes and
+ * attributes.
+ */
+const struct address_space_operations ntfs_normal_aops = {
+ .read_folio = ntfs_read_folio,
+#ifdef NTFS_RW
+ .writepage = ntfs_writepage,
+ .dirty_folio = block_dirty_folio,
+#endif /* NTFS_RW */
+ .bmap = ntfs_bmap,
+ .migrate_folio = buffer_migrate_folio,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+};
+
+/**
+ * ntfs_compressed_aops - address space operations for compressed inodes
+ */
+const struct address_space_operations ntfs_compressed_aops = {
+ .read_folio = ntfs_read_folio,
+#ifdef NTFS_RW
+ .writepage = ntfs_writepage,
+ .dirty_folio = block_dirty_folio,
+#endif /* NTFS_RW */
+ .migrate_folio = buffer_migrate_folio,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+};
+
+/**
+ * ntfs_mst_aops - general address space operations for mst protecteed inodes
+ * and attributes
+ */
+const struct address_space_operations ntfs_mst_aops = {
+ .read_folio = ntfs_read_folio, /* Fill page with data. */
+#ifdef NTFS_RW
+ .writepage = ntfs_writepage, /* Write dirty page to disk. */
+ .dirty_folio = filemap_dirty_folio,
+#endif /* NTFS_RW */
+ .migrate_folio = buffer_migrate_folio,
+ .is_partially_uptodate = block_is_partially_uptodate,
+ .error_remove_page = generic_error_remove_page,
+};
+
+#ifdef NTFS_RW
+
+/**
+ * mark_ntfs_record_dirty - mark an ntfs record dirty
+ * @page: page containing the ntfs record to mark dirty
+ * @ofs: byte offset within @page at which the ntfs record begins
+ *
+ * Set the buffers and the page in which the ntfs record is located dirty.
+ *
+ * The latter also marks the vfs inode the ntfs record belongs to dirty
+ * (I_DIRTY_PAGES only).
+ *
+ * If the page does not have buffers, we create them and set them uptodate.
+ * The page may not be locked which is why we need to handle the buffers under
+ * the mapping->private_lock. Once the buffers are marked dirty we no longer
+ * need the lock since try_to_free_buffers() does not free dirty buffers.
+ */
+void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
+ struct address_space *mapping = page->mapping;
+ ntfs_inode *ni = NTFS_I(mapping->host);
+ struct buffer_head *bh, *head, *buffers_to_free = NULL;
+ unsigned int end, bh_size, bh_ofs;
+
+ BUG_ON(!PageUptodate(page));
+ end = ofs + ni->itype.index.block_size;
+ bh_size = VFS_I(ni)->i_sb->s_blocksize;
+ spin_lock(&mapping->private_lock);
+ if (unlikely(!page_has_buffers(page))) {
+ spin_unlock(&mapping->private_lock);
+ bh = head = alloc_page_buffers(page, bh_size, true);
+ spin_lock(&mapping->private_lock);
+ if (likely(!page_has_buffers(page))) {
+ struct buffer_head *tail;
+
+ do {
+ set_buffer_uptodate(bh);
+ tail = bh;
+ bh = bh->b_this_page;
+ } while (bh);
+ tail->b_this_page = head;
+ attach_page_private(page, head);
+ } else
+ buffers_to_free = bh;
+ }
+ bh = head = page_buffers(page);
+ BUG_ON(!bh);
+ do {
+ bh_ofs = bh_offset(bh);
+ if (bh_ofs + bh_size <= ofs)
+ continue;
+ if (unlikely(bh_ofs >= end))
+ break;
+ set_buffer_dirty(bh);
+ } while ((bh = bh->b_this_page) != head);
+ spin_unlock(&mapping->private_lock);
+ filemap_dirty_folio(mapping, page_folio(page));
+ if (unlikely(buffers_to_free)) {
+ do {
+ bh = buffers_to_free->b_this_page;
+ free_buffer_head(buffers_to_free);
+ buffers_to_free = bh;
+ } while (buffers_to_free);
+ }
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
new file mode 100644
index 000000000..0cac5458c
--- /dev/null
+++ b/fs/ntfs/aops.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/**
+ * aops.h - Defines for NTFS kernel address space operations and page cache
+ * handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#ifndef _LINUX_NTFS_AOPS_H
+#define _LINUX_NTFS_AOPS_H
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/fs.h>
+
+#include "inode.h"
+
+/**
+ * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
+ * @page: the page to release
+ *
+ * Unpin, unmap and release a page that was obtained from ntfs_map_page().
+ */
+static inline void ntfs_unmap_page(struct page *page)
+{
+ kunmap(page);
+ put_page(page);
+}
+
+/**
+ * ntfs_map_page - map a page into accessible memory, reading it if necessary
+ * @mapping: address space for which to obtain the page
+ * @index: index into the page cache for @mapping of the page to map
+ *
+ * Read a page from the page cache of the address space @mapping at position
+ * @index, where @index is in units of PAGE_SIZE, and not in bytes.
+ *
+ * If the page is not in memory it is loaded from disk first using the
+ * read_folio method defined in the address space operations of @mapping
+ * and the page is added to the page cache of @mapping in the process.
+ *
+ * If the page belongs to an mst protected attribute and it is marked as such
+ * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no
+ * error checking is performed. This means the caller has to verify whether
+ * the ntfs record(s) contained in the page are valid or not using one of the
+ * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are
+ * expecting to see. (For details of the macros, see fs/ntfs/layout.h.)
+ *
+ * If the page is in high memory it is mapped into memory directly addressible
+ * by the kernel.
+ *
+ * Finally the page count is incremented, thus pinning the page into place.
+ *
+ * The above means that page_address(page) can be used on all pages obtained
+ * with ntfs_map_page() to get the kernel virtual address of the page.
+ *
+ * When finished with the page, the caller has to call ntfs_unmap_page() to
+ * unpin, unmap and release the page.
+ *
+ * Note this does not grant exclusive access. If such is desired, the caller
+ * must provide it independently of the ntfs_{un}map_page() calls by using
+ * a {rw_}semaphore or other means of serialization. A spin lock cannot be
+ * used as ntfs_map_page() can block.
+ *
+ * The unlocked and uptodate page is returned on success or an encoded error
+ * on failure. Caller has to test for error using the IS_ERR() macro on the
+ * return value. If that evaluates to 'true', the negative error code can be
+ * obtained using PTR_ERR() on the return value of ntfs_map_page().
+ */
+static inline struct page *ntfs_map_page(struct address_space *mapping,
+ unsigned long index)
+{
+ struct page *page = read_mapping_page(mapping, index, NULL);
+
+ if (!IS_ERR(page))
+ kmap(page);
+ return page;
+}
+
+#ifdef NTFS_RW
+
+extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_AOPS_H */
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
new file mode 100644
index 000000000..a3865bc4a
--- /dev/null
+++ b/fs/ntfs/attrib.c
@@ -0,0 +1,2624 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * attrib.c - NTFS attribute operations. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+
+#include "attrib.h"
+#include "debug.h"
+#include "layout.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
+#include "ntfs.h"
+#include "types.h"
+
+/**
+ * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
+ * @ni: ntfs inode for which to map (part of) a runlist
+ * @vcn: map runlist part containing this vcn
+ * @ctx: active attribute search context if present or NULL if not
+ *
+ * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
+ *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record. This is needed when ntfs_map_runlist_nolock() encounters unmapped
+ * runlist fragments and allows their mapping. If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
+ * will perform the necessary mapping and unmapping.
+ *
+ * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
+ * restores it before returning. Thus, @ctx will be left pointing to the same
+ * attribute on return as on entry. However, the actual pointers in @ctx may
+ * point to different memory locations on return, so you must remember to reset
+ * any cached pointers from the @ctx, i.e. after the call to
+ * ntfs_map_runlist_nolock(), you will probably want to do:
+ * m = ctx->mrec;
+ * a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
+ * Return 0 on success and -errno on error. There is one special error code
+ * which is not an error as such. This is -ENOENT. It means that @vcn is out
+ * of bounds of the runlist.
+ *
+ * Note the runlist can be NULL after this function returns if @vcn is zero and
+ * the attribute has zero allocated size, i.e. there simply is no runlist.
+ *
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ * is no longer valid, i.e. you need to either call
+ * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ * why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ * and is locked on return. Note the runlist will be modified.
+ * - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ * entry and it will be left unmapped on return.
+ * - If @ctx is not NULL, the base mft record must be mapped on entry
+ * and it will be left mapped on return.
+ */
+int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
+{
+ VCN end_vcn;
+ unsigned long flags;
+ ntfs_inode *base_ni;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ runlist_element *rl;
+ struct page *put_this_page = NULL;
+ int err = 0;
+ bool ctx_is_temporary, ctx_needs_reset;
+ ntfs_attr_search_ctx old_ctx = { NULL, };
+
+ ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
+ (unsigned long long)vcn);
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ if (!ctx) {
+ ctx_is_temporary = ctx_needs_reset = true;
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m))
+ return PTR_ERR(m);
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ } else {
+ VCN allocated_size_vcn;
+
+ BUG_ON(IS_ERR(ctx->mrec));
+ a = ctx->attr;
+ BUG_ON(!a->non_resident);
+ ctx_is_temporary = false;
+ end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size_vcn = ni->allocated_size >>
+ ni->vol->cluster_size_bits;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (!a->data.non_resident.lowest_vcn && end_vcn <= 0)
+ end_vcn = allocated_size_vcn - 1;
+ /*
+ * If we already have the attribute extent containing @vcn in
+ * @ctx, no need to look it up again. We slightly cheat in
+ * that if vcn exceeds the allocated size, we will refuse to
+ * map the runlist below, so there is definitely no need to get
+ * the right attribute extent.
+ */
+ if (vcn >= allocated_size_vcn || (a->type == ni->type &&
+ a->name_length == ni->name_len &&
+ !memcmp((u8*)a + le16_to_cpu(a->name_offset),
+ ni->name, ni->name_len) &&
+ sle64_to_cpu(a->data.non_resident.lowest_vcn)
+ <= vcn && end_vcn >= vcn))
+ ctx_needs_reset = false;
+ else {
+ /* Save the old search context. */
+ old_ctx = *ctx;
+ /*
+ * If the currently mapped (extent) inode is not the
+ * base inode we will unmap it when we reinitialize the
+ * search context which means we need to get a
+ * reference to the page containing the mapped mft
+ * record so we do not accidentally drop changes to the
+ * mft record when it has not been marked dirty yet.
+ */
+ if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
+ old_ctx.base_ntfs_ino) {
+ put_this_page = old_ctx.ntfs_ino->page;
+ get_page(put_this_page);
+ }
+ /*
+ * Reinitialize the search context so we can lookup the
+ * needed attribute extent.
+ */
+ ntfs_attr_reinit_search_ctx(ctx);
+ ctx_needs_reset = true;
+ }
+ }
+ if (ctx_needs_reset) {
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, vcn, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ BUG_ON(!ctx->attr->non_resident);
+ }
+ a = ctx->attr;
+ /*
+ * Only decompress the mapping pairs if @vcn is inside it. Otherwise
+ * we get into problems when we try to map an out of bounds vcn because
+ * we then try to map the already mapped runlist fragment and
+ * ntfs_mapping_pairs_decompress() fails.
+ */
+ end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
+ if (unlikely(vcn && vcn >= end_vcn)) {
+ err = -ENOENT;
+ goto err_out;
+ }
+ rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl);
+ if (IS_ERR(rl))
+ err = PTR_ERR(rl);
+ else
+ ni->runlist.rl = rl;
+err_out:
+ if (ctx_is_temporary) {
+ if (likely(ctx))
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ } else if (ctx_needs_reset) {
+ /*
+ * If there is no attribute list, restoring the search context
+ * is accomplished simply by copying the saved context back over
+ * the caller supplied context. If there is an attribute list,
+ * things are more complicated as we need to deal with mapping
+ * of mft records and resulting potential changes in pointers.
+ */
+ if (NInoAttrList(base_ni)) {
+ /*
+ * If the currently mapped (extent) inode is not the
+ * one we had before, we need to unmap it and map the
+ * old one.
+ */
+ if (ctx->ntfs_ino != old_ctx.ntfs_ino) {
+ /*
+ * If the currently mapped inode is not the
+ * base inode, unmap it.
+ */
+ if (ctx->base_ntfs_ino && ctx->ntfs_ino !=
+ ctx->base_ntfs_ino) {
+ unmap_extent_mft_record(ctx->ntfs_ino);
+ ctx->mrec = ctx->base_mrec;
+ BUG_ON(!ctx->mrec);
+ }
+ /*
+ * If the old mapped inode is not the base
+ * inode, map it.
+ */
+ if (old_ctx.base_ntfs_ino &&
+ old_ctx.ntfs_ino !=
+ old_ctx.base_ntfs_ino) {
+retry_map:
+ ctx->mrec = map_mft_record(
+ old_ctx.ntfs_ino);
+ /*
+ * Something bad has happened. If out
+ * of memory retry till it succeeds.
+ * Any other errors are fatal and we
+ * return the error code in ctx->mrec.
+ * Let the caller deal with it... We
+ * just need to fudge things so the
+ * caller can reinit and/or put the
+ * search context safely.
+ */
+ if (IS_ERR(ctx->mrec)) {
+ if (PTR_ERR(ctx->mrec) ==
+ -ENOMEM) {
+ schedule();
+ goto retry_map;
+ } else
+ old_ctx.ntfs_ino =
+ old_ctx.
+ base_ntfs_ino;
+ }
+ }
+ }
+ /* Update the changed pointers in the saved context. */
+ if (ctx->mrec != old_ctx.mrec) {
+ if (!IS_ERR(ctx->mrec))
+ old_ctx.attr = (ATTR_RECORD*)(
+ (u8*)ctx->mrec +
+ ((u8*)old_ctx.attr -
+ (u8*)old_ctx.mrec));
+ old_ctx.mrec = ctx->mrec;
+ }
+ }
+ /* Restore the search context to the saved one. */
+ *ctx = old_ctx;
+ /*
+ * We drop the reference on the page we took earlier. In the
+ * case that IS_ERR(ctx->mrec) is true this means we might lose
+ * some changes to the mft record that had been made between
+ * the last time it was marked dirty/written out and now. This
+ * at this stage is not a problem as the mapping error is fatal
+ * enough that the mft record cannot be written out anyway and
+ * the caller is very likely to shutdown the whole inode
+ * immediately and mark the volume dirty for chkdsk to pick up
+ * the pieces anyway.
+ */
+ if (put_this_page)
+ put_page(put_this_page);
+ }
+ return err;
+}
+
+/**
+ * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
+ * @ni: ntfs inode for which to map (part of) a runlist
+ * @vcn: map runlist part containing this vcn
+ *
+ * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
+ *
+ * Return 0 on success and -errno on error. There is one special error code
+ * which is not an error as such. This is -ENOENT. It means that @vcn is out
+ * of bounds of the runlist.
+ *
+ * Locking: - The runlist must be unlocked on entry and is unlocked on return.
+ * - This function takes the runlist lock for writing and may modify
+ * the runlist.
+ */
+int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
+{
+ int err = 0;
+
+ down_write(&ni->runlist.lock);
+ /* Make sure someone else didn't do the work while we were sleeping. */
+ if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
+ LCN_RL_NOT_MAPPED))
+ err = ntfs_map_runlist_nolock(ni, vcn, NULL);
+ up_write(&ni->runlist.lock);
+ return err;
+}
+
+/**
+ * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode
+ * @ni: ntfs inode of the attribute whose runlist to search
+ * @vcn: vcn to convert
+ * @write_locked: true if the runlist is locked for writing
+ *
+ * Find the virtual cluster number @vcn in the runlist of the ntfs attribute
+ * described by the ntfs inode @ni and return the corresponding logical cluster
+ * number (lcn).
+ *
+ * If the @vcn is not mapped yet, the attempt is made to map the attribute
+ * extent containing the @vcn and the vcn to lcn conversion is retried.
+ *
+ * If @write_locked is true the caller has locked the runlist for writing and
+ * if false for reading.
+ *
+ * Since lcns must be >= 0, we use negative return codes with special meaning:
+ *
+ * Return code Meaning / Description
+ * ==========================================
+ * LCN_HOLE Hole / not allocated on disk.
+ * LCN_ENOENT There is no such vcn in the runlist, i.e. @vcn is out of bounds.
+ * LCN_ENOMEM Not enough memory to map runlist.
+ * LCN_EIO Critical error (runlist/file is corrupt, i/o error, etc).
+ *
+ * Locking: - The runlist must be locked on entry and is left locked on return.
+ * - If @write_locked is 'false', i.e. the runlist is locked for reading,
+ * the lock may be dropped inside the function so you cannot rely on
+ * the runlist still being the same when this function returns.
+ */
+LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
+ const bool write_locked)
+{
+ LCN lcn;
+ unsigned long flags;
+ bool is_retry = false;
+
+ BUG_ON(!ni);
+ ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
+ ni->mft_no, (unsigned long long)vcn,
+ write_locked ? "write" : "read");
+ BUG_ON(!NInoNonResident(ni));
+ BUG_ON(vcn < 0);
+ if (!ni->runlist.rl) {
+ read_lock_irqsave(&ni->size_lock, flags);
+ if (!ni->allocated_size) {
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ return LCN_ENOENT;
+ }
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ }
+retry_remap:
+ /* Convert vcn to lcn. If that fails map the runlist and retry once. */
+ lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
+ if (likely(lcn >= LCN_HOLE)) {
+ ntfs_debug("Done, lcn 0x%llx.", (long long)lcn);
+ return lcn;
+ }
+ if (lcn != LCN_RL_NOT_MAPPED) {
+ if (lcn != LCN_ENOENT)
+ lcn = LCN_EIO;
+ } else if (!is_retry) {
+ int err;
+
+ if (!write_locked) {
+ up_read(&ni->runlist.lock);
+ down_write(&ni->runlist.lock);
+ if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) !=
+ LCN_RL_NOT_MAPPED)) {
+ up_write(&ni->runlist.lock);
+ down_read(&ni->runlist.lock);
+ goto retry_remap;
+ }
+ }
+ err = ntfs_map_runlist_nolock(ni, vcn, NULL);
+ if (!write_locked) {
+ up_write(&ni->runlist.lock);
+ down_read(&ni->runlist.lock);
+ }
+ if (likely(!err)) {
+ is_retry = true;
+ goto retry_remap;
+ }
+ if (err == -ENOENT)
+ lcn = LCN_ENOENT;
+ else if (err == -ENOMEM)
+ lcn = LCN_ENOMEM;
+ else
+ lcn = LCN_EIO;
+ }
+ if (lcn != LCN_ENOENT)
+ ntfs_error(ni->vol->sb, "Failed with error code %lli.",
+ (long long)lcn);
+ return lcn;
+}
+
+/**
+ * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
+ * @ni: ntfs inode describing the runlist to search
+ * @vcn: vcn to find
+ * @ctx: active attribute search context if present or NULL if not
+ *
+ * Find the virtual cluster number @vcn in the runlist described by the ntfs
+ * inode @ni and return the address of the runlist element containing the @vcn.
+ *
+ * If the @vcn is not mapped yet, the attempt is made to map the attribute
+ * extent containing the @vcn and the vcn to lcn conversion is retried.
+ *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record. This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
+ * runlist fragments and allows their mapping. If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
+ * will perform the necessary mapping and unmapping.
+ *
+ * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
+ * restores it before returning. Thus, @ctx will be left pointing to the same
+ * attribute on return as on entry. However, the actual pointers in @ctx may
+ * point to different memory locations on return, so you must remember to reset
+ * any cached pointers from the @ctx, i.e. after the call to
+ * ntfs_attr_find_vcn_nolock(), you will probably want to do:
+ * m = ctx->mrec;
+ * a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ * Note you need to distinguish between the lcn of the returned runlist element
+ * being >= 0 and LCN_HOLE. In the later case you have to return zeroes on
+ * read and allocate clusters on write.
+ *
+ * Return the runlist element containing the @vcn on success and
+ * ERR_PTR(-errno) on error. You need to test the return value with IS_ERR()
+ * to decide if the return is success or failure and PTR_ERR() to get to the
+ * error code if IS_ERR() is true.
+ *
+ * The possible error return codes are:
+ * -ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds.
+ * -ENOMEM - Not enough memory to map runlist.
+ * -EIO - Critical error (runlist/file is corrupt, i/o error, etc).
+ *
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ * is no longer valid, i.e. you need to either call
+ * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ * why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ * and is locked on return. Note the runlist may be modified when
+ * needed runlist fragments need to be mapped.
+ * - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ * entry and it will be left unmapped on return.
+ * - If @ctx is not NULL, the base mft record must be mapped on entry
+ * and it will be left mapped on return.
+ */
+runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
+ ntfs_attr_search_ctx *ctx)
+{
+ unsigned long flags;
+ runlist_element *rl;
+ int err = 0;
+ bool is_retry = false;
+
+ BUG_ON(!ni);
+ ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
+ ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
+ BUG_ON(!NInoNonResident(ni));
+ BUG_ON(vcn < 0);
+ if (!ni->runlist.rl) {
+ read_lock_irqsave(&ni->size_lock, flags);
+ if (!ni->allocated_size) {
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ return ERR_PTR(-ENOENT);
+ }
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ }
+retry_remap:
+ rl = ni->runlist.rl;
+ if (likely(rl && vcn >= rl[0].vcn)) {
+ while (likely(rl->length)) {
+ if (unlikely(vcn < rl[1].vcn)) {
+ if (likely(rl->lcn >= LCN_HOLE)) {
+ ntfs_debug("Done.");
+ return rl;
+ }
+ break;
+ }
+ rl++;
+ }
+ if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) {
+ if (likely(rl->lcn == LCN_ENOENT))
+ err = -ENOENT;
+ else
+ err = -EIO;
+ }
+ }
+ if (!err && !is_retry) {
+ /*
+ * If the search context is invalid we cannot map the unmapped
+ * region.
+ */
+ if (IS_ERR(ctx->mrec))
+ err = PTR_ERR(ctx->mrec);
+ else {
+ /*
+ * The @vcn is in an unmapped region, map the runlist
+ * and retry.
+ */
+ err = ntfs_map_runlist_nolock(ni, vcn, ctx);
+ if (likely(!err)) {
+ is_retry = true;
+ goto retry_remap;
+ }
+ }
+ if (err == -EINVAL)
+ err = -EIO;
+ } else if (!err)
+ err = -EIO;
+ if (err != -ENOENT)
+ ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
+ return ERR_PTR(err);
+}
+
+/**
+ * ntfs_attr_find - find (next) attribute in mft record
+ * @type: attribute type to find
+ * @name: attribute name to find (optional, i.e. NULL means don't care)
+ * @name_len: attribute name length (only needed if @name present)
+ * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
+ * @val: attribute value to find (optional, resident attributes only)
+ * @val_len: attribute value length
+ * @ctx: search context with mft record and attribute to search from
+ *
+ * You should not need to call this function directly. Use ntfs_attr_lookup()
+ * instead.
+ *
+ * ntfs_attr_find() takes a search context @ctx as parameter and searches the
+ * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an
+ * attribute of @type, optionally @name and @val.
+ *
+ * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will
+ * point to the found attribute.
+ *
+ * If the attribute is not found, ntfs_attr_find() returns -ENOENT and
+ * @ctx->attr will point to the attribute before which the attribute being
+ * searched for would need to be inserted if such an action were to be desired.
+ *
+ * On actual error, ntfs_attr_find() returns -EIO. In this case @ctx->attr is
+ * undefined and in particular do not rely on it not changing.
+ *
+ * If @ctx->is_first is 'true', the search begins with @ctx->attr itself. If it
+ * is 'false', the search begins after @ctx->attr.
+ *
+ * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and
+ * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record
+ * @ctx->mrec belongs. This is so we can get at the ntfs volume and hence at
+ * the upcase table. If @ic is CASE_SENSITIVE, the comparison is case
+ * sensitive. When @name is present, @name_len is the @name length in Unicode
+ * characters.
+ *
+ * If @name is not present (NULL), we assume that the unnamed attribute is
+ * being searched for.
+ *
+ * Finally, the resident attribute value @val is looked for, if present. If
+ * @val is not present (NULL), @val_len is ignored.
+ *
+ * ntfs_attr_find() only searches the specified mft record and it ignores the
+ * presence of an attribute list attribute (unless it is the one being searched
+ * for, obviously). If you need to take attribute lists into consideration,
+ * use ntfs_attr_lookup() instead (see below). This also means that you cannot
+ * use ntfs_attr_find() to search for extent records of non-resident
+ * attributes, as extents with lowest_vcn != 0 are usually described by the
+ * attribute list attribute only. - Note that it is possible that the first
+ * extent is only in the attribute list while the last extent is in the base
+ * mft record, so do not rely on being able to find the first extent in the
+ * base mft record.
+ *
+ * Warning: Never use @val when looking for attribute types which can be
+ * non-resident as this most likely will result in a crash!
+ */
+static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
+ const u32 name_len, const IGNORE_CASE_BOOL ic,
+ const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
+{
+ ATTR_RECORD *a;
+ ntfs_volume *vol = ctx->ntfs_ino->vol;
+ ntfschar *upcase = vol->upcase;
+ u32 upcase_len = vol->upcase_len;
+
+ /*
+ * Iterate over attributes in mft record starting at @ctx->attr, or the
+ * attribute following that, if @ctx->is_first is 'true'.
+ */
+ if (ctx->is_first) {
+ a = ctx->attr;
+ ctx->is_first = false;
+ } else
+ a = (ATTR_RECORD*)((u8*)ctx->attr +
+ le32_to_cpu(ctx->attr->length));
+ for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
+ u8 *mrec_end = (u8 *)ctx->mrec +
+ le32_to_cpu(ctx->mrec->bytes_allocated);
+ u8 *name_end;
+
+ /* check whether ATTR_RECORD wrap */
+ if ((u8 *)a < (u8 *)ctx->mrec)
+ break;
+
+ /* check whether Attribute Record Header is within bounds */
+ if ((u8 *)a > mrec_end ||
+ (u8 *)a + sizeof(ATTR_RECORD) > mrec_end)
+ break;
+
+ /* check whether ATTR_RECORD's name is within bounds */
+ name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
+ a->name_length * sizeof(ntfschar);
+ if (name_end > mrec_end)
+ break;
+
+ ctx->attr = a;
+ if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
+ a->type == AT_END))
+ return -ENOENT;
+ if (unlikely(!a->length))
+ break;
+
+ /* check whether ATTR_RECORD's length wrap */
+ if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a)
+ break;
+ /* check whether ATTR_RECORD's length is within bounds */
+ if ((u8 *)a + le32_to_cpu(a->length) > mrec_end)
+ break;
+
+ if (a->type != type)
+ continue;
+ /*
+ * If @name is present, compare the two names. If @name is
+ * missing, assume we want an unnamed attribute.
+ */
+ if (!name) {
+ /* The search failed if the found attribute is named. */
+ if (a->name_length)
+ return -ENOENT;
+ } else if (!ntfs_are_names_equal(name, name_len,
+ (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)),
+ a->name_length, ic, upcase, upcase_len)) {
+ register int rc;
+
+ rc = ntfs_collate_names(name, name_len,
+ (ntfschar*)((u8*)a +
+ le16_to_cpu(a->name_offset)),
+ a->name_length, 1, IGNORE_CASE,
+ upcase, upcase_len);
+ /*
+ * If @name collates before a->name, there is no
+ * matching attribute.
+ */
+ if (rc == -1)
+ return -ENOENT;
+ /* If the strings are not equal, continue search. */
+ if (rc)
+ continue;
+ rc = ntfs_collate_names(name, name_len,
+ (ntfschar*)((u8*)a +
+ le16_to_cpu(a->name_offset)),
+ a->name_length, 1, CASE_SENSITIVE,
+ upcase, upcase_len);
+ if (rc == -1)
+ return -ENOENT;
+ if (rc)
+ continue;
+ }
+ /*
+ * The names match or @name not present and attribute is
+ * unnamed. If no @val specified, we have found the attribute
+ * and are done.
+ */
+ if (!val)
+ return 0;
+ /* @val is present; compare values. */
+ else {
+ register int rc;
+
+ rc = memcmp(val, (u8*)a + le16_to_cpu(
+ a->data.resident.value_offset),
+ min_t(u32, val_len, le32_to_cpu(
+ a->data.resident.value_length)));
+ /*
+ * If @val collates before the current attribute's
+ * value, there is no matching attribute.
+ */
+ if (!rc) {
+ register u32 avl;
+
+ avl = le32_to_cpu(
+ a->data.resident.value_length);
+ if (val_len == avl)
+ return 0;
+ if (val_len < avl)
+ return -ENOENT;
+ } else if (rc < 0)
+ return -ENOENT;
+ }
+ }
+ ntfs_error(vol->sb, "Inode is corrupt. Run chkdsk.");
+ NVolSetErrors(vol);
+ return -EIO;
+}
+
+/**
+ * load_attribute_list - load an attribute list into memory
+ * @vol: ntfs volume from which to read
+ * @runlist: runlist of the attribute list
+ * @al_start: destination buffer
+ * @size: size of the destination buffer in bytes
+ * @initialized_size: initialized size of the attribute list
+ *
+ * Walk the runlist @runlist and load all clusters from it copying them into
+ * the linear buffer @al. The maximum number of bytes copied to @al is @size
+ * bytes. Note, @size does not need to be a multiple of the cluster size. If
+ * @initialized_size is less than @size, the region in @al between
+ * @initialized_size and @size will be zeroed and not read from disk.
+ *
+ * Return 0 on success or -errno on error.
+ */
+int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
+ const s64 size, const s64 initialized_size)
+{
+ LCN lcn;
+ u8 *al = al_start;
+ u8 *al_end = al + initialized_size;
+ runlist_element *rl;
+ struct buffer_head *bh;
+ struct super_block *sb;
+ unsigned long block_size;
+ unsigned long block, max_block;
+ int err = 0;
+ unsigned char block_size_bits;
+
+ ntfs_debug("Entering.");
+ if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 ||
+ initialized_size > size)
+ return -EINVAL;
+ if (!initialized_size) {
+ memset(al, 0, size);
+ return 0;
+ }
+ sb = vol->sb;
+ block_size = sb->s_blocksize;
+ block_size_bits = sb->s_blocksize_bits;
+ down_read(&runlist->lock);
+ rl = runlist->rl;
+ if (!rl) {
+ ntfs_error(sb, "Cannot read attribute list since runlist is "
+ "missing.");
+ goto err_out;
+ }
+ /* Read all clusters specified by the runlist one run at a time. */
+ while (rl->length) {
+ lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
+ ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
+ (unsigned long long)rl->vcn,
+ (unsigned long long)lcn);
+ /* The attribute list cannot be sparse. */
+ if (lcn < 0) {
+ ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed. Cannot "
+ "read attribute list.");
+ goto err_out;
+ }
+ block = lcn << vol->cluster_size_bits >> block_size_bits;
+ /* Read the run from device in chunks of block_size bytes. */
+ max_block = block + (rl->length << vol->cluster_size_bits >>
+ block_size_bits);
+ ntfs_debug("max_block = 0x%lx.", max_block);
+ do {
+ ntfs_debug("Reading block = 0x%lx.", block);
+ bh = sb_bread(sb, block);
+ if (!bh) {
+ ntfs_error(sb, "sb_bread() failed. Cannot "
+ "read attribute list.");
+ goto err_out;
+ }
+ if (al + block_size >= al_end)
+ goto do_final;
+ memcpy(al, bh->b_data, block_size);
+ brelse(bh);
+ al += block_size;
+ } while (++block < max_block);
+ rl++;
+ }
+ if (initialized_size < size) {
+initialize:
+ memset(al_start + initialized_size, 0, size - initialized_size);
+ }
+done:
+ up_read(&runlist->lock);
+ return err;
+do_final:
+ if (al < al_end) {
+ /*
+ * Partial block.
+ *
+ * Note: The attribute list can be smaller than its allocation
+ * by multiple clusters. This has been encountered by at least
+ * two people running Windows XP, thus we cannot do any
+ * truncation sanity checking here. (AIA)
+ */
+ memcpy(al, bh->b_data, al_end - al);
+ brelse(bh);
+ if (initialized_size < size)
+ goto initialize;
+ goto done;
+ }
+ brelse(bh);
+ /* Real overflow! */
+ ntfs_error(sb, "Attribute list buffer overflow. Read attribute list "
+ "is truncated.");
+err_out:
+ err = -EIO;
+ goto done;
+}
+
+/**
+ * ntfs_external_attr_find - find an attribute in the attribute list of an inode
+ * @type: attribute type to find
+ * @name: attribute name to find (optional, i.e. NULL means don't care)
+ * @name_len: attribute name length (only needed if @name present)
+ * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
+ * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only)
+ * @val: attribute value to find (optional, resident attributes only)
+ * @val_len: attribute value length
+ * @ctx: search context with mft record and attribute to search from
+ *
+ * You should not need to call this function directly. Use ntfs_attr_lookup()
+ * instead.
+ *
+ * Find an attribute by searching the attribute list for the corresponding
+ * attribute list entry. Having found the entry, map the mft record if the
+ * attribute is in a different mft record/inode, ntfs_attr_find() the attribute
+ * in there and return it.
+ *
+ * On first search @ctx->ntfs_ino must be the base mft record and @ctx must
+ * have been obtained from a call to ntfs_attr_get_search_ctx(). On subsequent
+ * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is
+ * then the base inode).
+ *
+ * After finishing with the attribute/mft record you need to call
+ * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
+ * mapped inodes, etc).
+ *
+ * If the attribute is found, ntfs_external_attr_find() returns 0 and
+ * @ctx->attr will point to the found attribute. @ctx->mrec will point to the
+ * mft record in which @ctx->attr is located and @ctx->al_entry will point to
+ * the attribute list entry for the attribute.
+ *
+ * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and
+ * @ctx->attr will point to the attribute in the base mft record before which
+ * the attribute being searched for would need to be inserted if such an action
+ * were to be desired. @ctx->mrec will point to the mft record in which
+ * @ctx->attr is located and @ctx->al_entry will point to the attribute list
+ * entry of the attribute before which the attribute being searched for would
+ * need to be inserted if such an action were to be desired.
+ *
+ * Thus to insert the not found attribute, one wants to add the attribute to
+ * @ctx->mrec (the base mft record) and if there is not enough space, the
+ * attribute should be placed in a newly allocated extent mft record. The
+ * attribute list entry for the inserted attribute should be inserted in the
+ * attribute list attribute at @ctx->al_entry.
+ *
+ * On actual error, ntfs_external_attr_find() returns -EIO. In this case
+ * @ctx->attr is undefined and in particular do not rely on it not changing.
+ */
+static int ntfs_external_attr_find(const ATTR_TYPE type,
+ const ntfschar *name, const u32 name_len,
+ const IGNORE_CASE_BOOL ic, const VCN lowest_vcn,
+ const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
+{
+ ntfs_inode *base_ni, *ni;
+ ntfs_volume *vol;
+ ATTR_LIST_ENTRY *al_entry, *next_al_entry;
+ u8 *al_start, *al_end;
+ ATTR_RECORD *a;
+ ntfschar *al_name;
+ u32 al_name_len;
+ int err = 0;
+ static const char *es = " Unmount and run chkdsk.";
+
+ ni = ctx->ntfs_ino;
+ base_ni = ctx->base_ntfs_ino;
+ ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type);
+ if (!base_ni) {
+ /* First call happens with the base mft record. */
+ base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino;
+ ctx->base_mrec = ctx->mrec;
+ }
+ if (ni == base_ni)
+ ctx->base_attr = ctx->attr;
+ if (type == AT_END)
+ goto not_found;
+ vol = base_ni->vol;
+ al_start = base_ni->attr_list;
+ al_end = al_start + base_ni->attr_list_size;
+ if (!ctx->al_entry)
+ ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
+ /*
+ * Iterate over entries in attribute list starting at @ctx->al_entry,
+ * or the entry following that, if @ctx->is_first is 'true'.
+ */
+ if (ctx->is_first) {
+ al_entry = ctx->al_entry;
+ ctx->is_first = false;
+ } else
+ al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
+ le16_to_cpu(ctx->al_entry->length));
+ for (;; al_entry = next_al_entry) {
+ /* Out of bounds check. */
+ if ((u8*)al_entry < base_ni->attr_list ||
+ (u8*)al_entry > al_end)
+ break; /* Inode is corrupt. */
+ ctx->al_entry = al_entry;
+ /* Catch the end of the attribute list. */
+ if ((u8*)al_entry == al_end)
+ goto not_found;
+ if (!al_entry->length)
+ break;
+ if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
+ le16_to_cpu(al_entry->length) > al_end)
+ break;
+ next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
+ le16_to_cpu(al_entry->length));
+ if (le32_to_cpu(al_entry->type) > le32_to_cpu(type))
+ goto not_found;
+ if (type != al_entry->type)
+ continue;
+ /*
+ * If @name is present, compare the two names. If @name is
+ * missing, assume we want an unnamed attribute.
+ */
+ al_name_len = al_entry->name_length;
+ al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset);
+ if (!name) {
+ if (al_name_len)
+ goto not_found;
+ } else if (!ntfs_are_names_equal(al_name, al_name_len, name,
+ name_len, ic, vol->upcase, vol->upcase_len)) {
+ register int rc;
+
+ rc = ntfs_collate_names(name, name_len, al_name,
+ al_name_len, 1, IGNORE_CASE,
+ vol->upcase, vol->upcase_len);
+ /*
+ * If @name collates before al_name, there is no
+ * matching attribute.
+ */
+ if (rc == -1)
+ goto not_found;
+ /* If the strings are not equal, continue search. */
+ if (rc)
+ continue;
+ /*
+ * FIXME: Reverse engineering showed 0, IGNORE_CASE but
+ * that is inconsistent with ntfs_attr_find(). The
+ * subsequent rc checks were also different. Perhaps I
+ * made a mistake in one of the two. Need to recheck
+ * which is correct or at least see what is going on...
+ * (AIA)
+ */
+ rc = ntfs_collate_names(name, name_len, al_name,
+ al_name_len, 1, CASE_SENSITIVE,
+ vol->upcase, vol->upcase_len);
+ if (rc == -1)
+ goto not_found;
+ if (rc)
+ continue;
+ }
+ /*
+ * The names match or @name not present and attribute is
+ * unnamed. Now check @lowest_vcn. Continue search if the
+ * next attribute list entry still fits @lowest_vcn. Otherwise
+ * we have reached the right one or the search has failed.
+ */
+ if (lowest_vcn && (u8*)next_al_entry >= al_start &&
+ (u8*)next_al_entry + 6 < al_end &&
+ (u8*)next_al_entry + le16_to_cpu(
+ next_al_entry->length) <= al_end &&
+ sle64_to_cpu(next_al_entry->lowest_vcn) <=
+ lowest_vcn &&
+ next_al_entry->type == al_entry->type &&
+ next_al_entry->name_length == al_name_len &&
+ ntfs_are_names_equal((ntfschar*)((u8*)
+ next_al_entry +
+ next_al_entry->name_offset),
+ next_al_entry->name_length,
+ al_name, al_name_len, CASE_SENSITIVE,
+ vol->upcase, vol->upcase_len))
+ continue;
+ if (MREF_LE(al_entry->mft_reference) == ni->mft_no) {
+ if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) {
+ ntfs_error(vol->sb, "Found stale mft "
+ "reference in attribute list "
+ "of base inode 0x%lx.%s",
+ base_ni->mft_no, es);
+ err = -EIO;
+ break;
+ }
+ } else { /* Mft references do not match. */
+ /* If there is a mapped record unmap it first. */
+ if (ni != base_ni)
+ unmap_extent_mft_record(ni);
+ /* Do we want the base record back? */
+ if (MREF_LE(al_entry->mft_reference) ==
+ base_ni->mft_no) {
+ ni = ctx->ntfs_ino = base_ni;
+ ctx->mrec = ctx->base_mrec;
+ } else {
+ /* We want an extent record. */
+ ctx->mrec = map_extent_mft_record(base_ni,
+ le64_to_cpu(
+ al_entry->mft_reference), &ni);
+ if (IS_ERR(ctx->mrec)) {
+ ntfs_error(vol->sb, "Failed to map "
+ "extent mft record "
+ "0x%lx of base inode "
+ "0x%lx.%s",
+ MREF_LE(al_entry->
+ mft_reference),
+ base_ni->mft_no, es);
+ err = PTR_ERR(ctx->mrec);
+ if (err == -ENOENT)
+ err = -EIO;
+ /* Cause @ctx to be sanitized below. */
+ ni = NULL;
+ break;
+ }
+ ctx->ntfs_ino = ni;
+ }
+ ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
+ le16_to_cpu(ctx->mrec->attrs_offset));
+ }
+ /*
+ * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the
+ * mft record containing the attribute represented by the
+ * current al_entry.
+ */
+ /*
+ * We could call into ntfs_attr_find() to find the right
+ * attribute in this mft record but this would be less
+ * efficient and not quite accurate as ntfs_attr_find() ignores
+ * the attribute instance numbers for example which become
+ * important when one plays with attribute lists. Also,
+ * because a proper match has been found in the attribute list
+ * entry above, the comparison can now be optimized. So it is
+ * worth re-implementing a simplified ntfs_attr_find() here.
+ */
+ a = ctx->attr;
+ /*
+ * Use a manual loop so we can still use break and continue
+ * with the same meanings as above.
+ */
+do_next_attr_loop:
+ if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
+ le32_to_cpu(ctx->mrec->bytes_allocated))
+ break;
+ if (a->type == AT_END)
+ break;
+ if (!a->length)
+ break;
+ if (al_entry->instance != a->instance)
+ goto do_next_attr;
+ /*
+ * If the type and/or the name are mismatched between the
+ * attribute list entry and the attribute record, there is
+ * corruption so we break and return error EIO.
+ */
+ if (al_entry->type != a->type)
+ break;
+ if (!ntfs_are_names_equal((ntfschar*)((u8*)a +
+ le16_to_cpu(a->name_offset)), a->name_length,
+ al_name, al_name_len, CASE_SENSITIVE,
+ vol->upcase, vol->upcase_len))
+ break;
+ ctx->attr = a;
+ /*
+ * If no @val specified or @val specified and it matches, we
+ * have found it!
+ */
+ if (!val || (!a->non_resident && le32_to_cpu(
+ a->data.resident.value_length) == val_len &&
+ !memcmp((u8*)a +
+ le16_to_cpu(a->data.resident.value_offset),
+ val, val_len))) {
+ ntfs_debug("Done, found.");
+ return 0;
+ }
+do_next_attr:
+ /* Proceed to the next attribute in the current mft record. */
+ a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
+ goto do_next_attr_loop;
+ }
+ if (!err) {
+ ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt "
+ "attribute list attribute.%s", base_ni->mft_no,
+ es);
+ err = -EIO;
+ }
+ if (ni != base_ni) {
+ if (ni)
+ unmap_extent_mft_record(ni);
+ ctx->ntfs_ino = base_ni;
+ ctx->mrec = ctx->base_mrec;
+ ctx->attr = ctx->base_attr;
+ }
+ if (err != -ENOMEM)
+ NVolSetErrors(vol);
+ return err;
+not_found:
+ /*
+ * If we were looking for AT_END, we reset the search context @ctx and
+ * use ntfs_attr_find() to seek to the end of the base mft record.
+ */
+ if (type == AT_END) {
+ ntfs_attr_reinit_search_ctx(ctx);
+ return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len,
+ ctx);
+ }
+ /*
+ * The attribute was not found. Before we return, we want to ensure
+ * @ctx->mrec and @ctx->attr indicate the position at which the
+ * attribute should be inserted in the base mft record. Since we also
+ * want to preserve @ctx->al_entry we cannot reinitialize the search
+ * context using ntfs_attr_reinit_search_ctx() as this would set
+ * @ctx->al_entry to NULL. Thus we do the necessary bits manually (see
+ * ntfs_attr_init_search_ctx() below). Note, we _only_ preserve
+ * @ctx->al_entry as the remaining fields (base_*) are identical to
+ * their non base_ counterparts and we cannot set @ctx->base_attr
+ * correctly yet as we do not know what @ctx->attr will be set to by
+ * the call to ntfs_attr_find() below.
+ */
+ if (ni != base_ni)
+ unmap_extent_mft_record(ni);
+ ctx->mrec = ctx->base_mrec;
+ ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
+ le16_to_cpu(ctx->mrec->attrs_offset));
+ ctx->is_first = true;
+ ctx->ntfs_ino = base_ni;
+ ctx->base_ntfs_ino = NULL;
+ ctx->base_mrec = NULL;
+ ctx->base_attr = NULL;
+ /*
+ * In case there are multiple matches in the base mft record, need to
+ * keep enumerating until we get an attribute not found response (or
+ * another error), otherwise we would keep returning the same attribute
+ * over and over again and all programs using us for enumeration would
+ * lock up in a tight loop.
+ */
+ do {
+ err = ntfs_attr_find(type, name, name_len, ic, val, val_len,
+ ctx);
+ } while (!err);
+ ntfs_debug("Done, not found.");
+ return err;
+}
+
+/**
+ * ntfs_attr_lookup - find an attribute in an ntfs inode
+ * @type: attribute type to find
+ * @name: attribute name to find (optional, i.e. NULL means don't care)
+ * @name_len: attribute name length (only needed if @name present)
+ * @ic: IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
+ * @lowest_vcn: lowest vcn to find (optional, non-resident attributes only)
+ * @val: attribute value to find (optional, resident attributes only)
+ * @val_len: attribute value length
+ * @ctx: search context with mft record and attribute to search from
+ *
+ * Find an attribute in an ntfs inode. On first search @ctx->ntfs_ino must
+ * be the base mft record and @ctx must have been obtained from a call to
+ * ntfs_attr_get_search_ctx().
+ *
+ * This function transparently handles attribute lists and @ctx is used to
+ * continue searches where they were left off at.
+ *
+ * After finishing with the attribute/mft record you need to call
+ * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
+ * mapped inodes, etc).
+ *
+ * Return 0 if the search was successful and -errno if not.
+ *
+ * When 0, @ctx->attr is the found attribute and it is in mft record
+ * @ctx->mrec. If an attribute list attribute is present, @ctx->al_entry is
+ * the attribute list entry of the found attribute.
+ *
+ * When -ENOENT, @ctx->attr is the attribute which collates just after the
+ * attribute being searched for, i.e. if one wants to add the attribute to the
+ * mft record this is the correct place to insert it into. If an attribute
+ * list attribute is present, @ctx->al_entry is the attribute list entry which
+ * collates just after the attribute list entry of the attribute being searched
+ * for, i.e. if one wants to add the attribute to the mft record this is the
+ * correct place to insert its attribute list entry into.
+ *
+ * When -errno != -ENOENT, an error occurred during the lookup. @ctx->attr is
+ * then undefined and in particular you should not rely on it not changing.
+ */
+int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
+ const u32 name_len, const IGNORE_CASE_BOOL ic,
+ const VCN lowest_vcn, const u8 *val, const u32 val_len,
+ ntfs_attr_search_ctx *ctx)
+{
+ ntfs_inode *base_ni;
+
+ ntfs_debug("Entering.");
+ BUG_ON(IS_ERR(ctx->mrec));
+ if (ctx->base_ntfs_ino)
+ base_ni = ctx->base_ntfs_ino;
+ else
+ base_ni = ctx->ntfs_ino;
+ /* Sanity check, just for debugging really. */
+ BUG_ON(!base_ni);
+ if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST)
+ return ntfs_attr_find(type, name, name_len, ic, val, val_len,
+ ctx);
+ return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn,
+ val, val_len, ctx);
+}
+
+/**
+ * ntfs_attr_init_search_ctx - initialize an attribute search context
+ * @ctx: attribute search context to initialize
+ * @ni: ntfs inode with which to initialize the search context
+ * @mrec: mft record with which to initialize the search context
+ *
+ * Initialize the attribute search context @ctx with @ni and @mrec.
+ */
+static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx,
+ ntfs_inode *ni, MFT_RECORD *mrec)
+{
+ *ctx = (ntfs_attr_search_ctx) {
+ .mrec = mrec,
+ /* Sanity checks are performed elsewhere. */
+ .attr = (ATTR_RECORD*)((u8*)mrec +
+ le16_to_cpu(mrec->attrs_offset)),
+ .is_first = true,
+ .ntfs_ino = ni,
+ };
+}
+
+/**
+ * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context
+ * @ctx: attribute search context to reinitialize
+ *
+ * Reinitialize the attribute search context @ctx, unmapping an associated
+ * extent mft record if present, and initialize the search context again.
+ *
+ * This is used when a search for a new attribute is being started to reset
+ * the search context to the beginning.
+ */
+void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx)
+{
+ if (likely(!ctx->base_ntfs_ino)) {
+ /* No attribute list. */
+ ctx->is_first = true;
+ /* Sanity checks are performed elsewhere. */
+ ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
+ le16_to_cpu(ctx->mrec->attrs_offset));
+ /*
+ * This needs resetting due to ntfs_external_attr_find() which
+ * can leave it set despite having zeroed ctx->base_ntfs_ino.
+ */
+ ctx->al_entry = NULL;
+ return;
+ } /* Attribute list. */
+ if (ctx->ntfs_ino != ctx->base_ntfs_ino)
+ unmap_extent_mft_record(ctx->ntfs_ino);
+ ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec);
+ return;
+}
+
+/**
+ * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context
+ * @ni: ntfs inode with which to initialize the search context
+ * @mrec: mft record with which to initialize the search context
+ *
+ * Allocate a new attribute search context, initialize it with @ni and @mrec,
+ * and return it. Return NULL if allocation failed.
+ */
+ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec)
+{
+ ntfs_attr_search_ctx *ctx;
+
+ ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS);
+ if (ctx)
+ ntfs_attr_init_search_ctx(ctx, ni, mrec);
+ return ctx;
+}
+
+/**
+ * ntfs_attr_put_search_ctx - release an attribute search context
+ * @ctx: attribute search context to free
+ *
+ * Release the attribute search context @ctx, unmapping an associated extent
+ * mft record if present.
+ */
+void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx)
+{
+ if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino)
+ unmap_extent_mft_record(ctx->ntfs_ino);
+ kmem_cache_free(ntfs_attr_ctx_cache, ctx);
+ return;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file
+ * @vol: ntfs volume to which the attribute belongs
+ * @type: attribute type which to find
+ *
+ * Search for the attribute definition record corresponding to the attribute
+ * @type in the $AttrDef system file.
+ *
+ * Return the attribute type definition record if found and NULL if not found.
+ */
+static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol,
+ const ATTR_TYPE type)
+{
+ ATTR_DEF *ad;
+
+ BUG_ON(!vol->attrdef);
+ BUG_ON(!type);
+ for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef <
+ vol->attrdef_size && ad->type; ++ad) {
+ /* We have not found it yet, carry on searching. */
+ if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type)))
+ continue;
+ /* We found the attribute; return it. */
+ if (likely(ad->type == type))
+ return ad;
+ /* We have gone too far already. No point in continuing. */
+ break;
+ }
+ /* Attribute not found. */
+ ntfs_debug("Attribute type 0x%x not found in $AttrDef.",
+ le32_to_cpu(type));
+ return NULL;
+}
+
+/**
+ * ntfs_attr_size_bounds_check - check a size of an attribute type for validity
+ * @vol: ntfs volume to which the attribute belongs
+ * @type: attribute type which to check
+ * @size: size which to check
+ *
+ * Check whether the @size in bytes is valid for an attribute of @type on the
+ * ntfs volume @vol. This information is obtained from $AttrDef system file.
+ *
+ * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not
+ * listed in $AttrDef.
+ */
+int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type,
+ const s64 size)
+{
+ ATTR_DEF *ad;
+
+ BUG_ON(size < 0);
+ /*
+ * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not
+ * listed in $AttrDef.
+ */
+ if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024))
+ return -ERANGE;
+ /* Get the $AttrDef entry for the attribute @type. */
+ ad = ntfs_attr_find_in_attrdef(vol, type);
+ if (unlikely(!ad))
+ return -ENOENT;
+ /* Do the bounds check. */
+ if (((sle64_to_cpu(ad->min_size) > 0) &&
+ size < sle64_to_cpu(ad->min_size)) ||
+ ((sle64_to_cpu(ad->max_size) > 0) && size >
+ sle64_to_cpu(ad->max_size)))
+ return -ERANGE;
+ return 0;
+}
+
+/**
+ * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident
+ * @vol: ntfs volume to which the attribute belongs
+ * @type: attribute type which to check
+ *
+ * Check whether the attribute of @type on the ntfs volume @vol is allowed to
+ * be non-resident. This information is obtained from $AttrDef system file.
+ *
+ * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and
+ * -ENOENT if the attribute is not listed in $AttrDef.
+ */
+int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
+{
+ ATTR_DEF *ad;
+
+ /* Find the attribute definition record in $AttrDef. */
+ ad = ntfs_attr_find_in_attrdef(vol, type);
+ if (unlikely(!ad))
+ return -ENOENT;
+ /* Check the flags and return the result. */
+ if (ad->flags & ATTR_DEF_RESIDENT)
+ return -EPERM;
+ return 0;
+}
+
+/**
+ * ntfs_attr_can_be_resident - check if an attribute can be resident
+ * @vol: ntfs volume to which the attribute belongs
+ * @type: attribute type which to check
+ *
+ * Check whether the attribute of @type on the ntfs volume @vol is allowed to
+ * be resident. This information is derived from our ntfs knowledge and may
+ * not be completely accurate, especially when user defined attributes are
+ * present. Basically we allow everything to be resident except for index
+ * allocation and $EA attributes.
+ *
+ * Return 0 if the attribute is allowed to be non-resident and -EPERM if not.
+ *
+ * Warning: In the system file $MFT the attribute $Bitmap must be non-resident
+ * otherwise windows will not boot (blue screen of death)! We cannot
+ * check for this here as we do not know which inode's $Bitmap is
+ * being asked about so the caller needs to special case this.
+ */
+int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
+{
+ if (type == AT_INDEX_ALLOCATION)
+ return -EPERM;
+ return 0;
+}
+
+/**
+ * ntfs_attr_record_resize - resize an attribute record
+ * @m: mft record containing attribute record
+ * @a: attribute record to resize
+ * @new_size: new size in bytes to which to resize the attribute record @a
+ *
+ * Resize the attribute record @a, i.e. the resident part of the attribute, in
+ * the mft record @m to @new_size bytes.
+ *
+ * Return 0 on success and -errno on error. The following error codes are
+ * defined:
+ * -ENOSPC - Not enough space in the mft record @m to perform the resize.
+ *
+ * Note: On error, no modifications have been performed whatsoever.
+ *
+ * Warning: If you make a record smaller without having copied all the data you
+ * are interested in the data may be overwritten.
+ */
+int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
+{
+ ntfs_debug("Entering for new_size %u.", new_size);
+ /* Align to 8 bytes if it is not already done. */
+ if (new_size & 7)
+ new_size = (new_size + 7) & ~7;
+ /* If the actual attribute length has changed, move things around. */
+ if (new_size != le32_to_cpu(a->length)) {
+ u32 new_muse = le32_to_cpu(m->bytes_in_use) -
+ le32_to_cpu(a->length) + new_size;
+ /* Not enough space in this mft record. */
+ if (new_muse > le32_to_cpu(m->bytes_allocated))
+ return -ENOSPC;
+ /* Move attributes following @a to their new location. */
+ memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length),
+ le32_to_cpu(m->bytes_in_use) - ((u8*)a -
+ (u8*)m) - le32_to_cpu(a->length));
+ /* Adjust @m to reflect the change in used space. */
+ m->bytes_in_use = cpu_to_le32(new_muse);
+ /* Adjust @a to reflect the new size. */
+ if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
+ a->length = cpu_to_le32(new_size);
+ }
+ return 0;
+}
+
+/**
+ * ntfs_resident_attr_value_resize - resize the value of a resident attribute
+ * @m: mft record containing attribute record
+ * @a: attribute record whose value to resize
+ * @new_size: new size in bytes to which to resize the attribute value of @a
+ *
+ * Resize the value of the attribute @a in the mft record @m to @new_size bytes.
+ * If the value is made bigger, the newly allocated space is cleared.
+ *
+ * Return 0 on success and -errno on error. The following error codes are
+ * defined:
+ * -ENOSPC - Not enough space in the mft record @m to perform the resize.
+ *
+ * Note: On error, no modifications have been performed whatsoever.
+ *
+ * Warning: If you make a record smaller without having copied all the data you
+ * are interested in the data may be overwritten.
+ */
+int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+ const u32 new_size)
+{
+ u32 old_size;
+
+ /* Resize the resident part of the attribute record. */
+ if (ntfs_attr_record_resize(m, a,
+ le16_to_cpu(a->data.resident.value_offset) + new_size))
+ return -ENOSPC;
+ /*
+ * The resize succeeded! If we made the attribute value bigger, clear
+ * the area between the old size and @new_size.
+ */
+ old_size = le32_to_cpu(a->data.resident.value_length);
+ if (new_size > old_size)
+ memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
+ old_size, 0, new_size - old_size);
+ /* Finally update the length of the attribute value. */
+ a->data.resident.value_length = cpu_to_le32(new_size);
+ return 0;
+}
+
+/**
+ * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
+ * @ni: ntfs inode describing the attribute to convert
+ * @data_size: size of the resident data to copy to the non-resident attribute
+ *
+ * Convert the resident ntfs attribute described by the ntfs inode @ni to a
+ * non-resident one.
+ *
+ * @data_size must be equal to the attribute value size. This is needed since
+ * we need to know the size before we can map the mft record and our callers
+ * always know it. The reason we cannot simply read the size from the vfs
+ * inode i_size is that this is not necessarily uptodate. This happens when
+ * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
+ *
+ * Return 0 on success and -errno on error. The following error return codes
+ * are defined:
+ * -EPERM - The attribute is not allowed to be non-resident.
+ * -ENOMEM - Not enough memory.
+ * -ENOSPC - Not enough disk space.
+ * -EINVAL - Attribute not defined on the volume.
+ * -EIO - I/o error or other error.
+ * Note that -ENOSPC is also returned in the case that there is not enough
+ * space in the mft record to do the conversion. This can happen when the mft
+ * record is already very full. The caller is responsible for trying to make
+ * space in the mft record and trying again. FIXME: Do we need a separate
+ * error return code for this kind of -ENOSPC or is it always worth trying
+ * again in case the attribute may then fit in a resident state so no need to
+ * make it non-resident at all? Ho-hum... (AIA)
+ *
+ * NOTE to self: No changes in the attribute list are required to move from
+ * a resident to a non-resident attribute.
+ *
+ * Locking: - The caller must hold i_mutex on the inode.
+ */
+int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
+{
+ s64 new_size;
+ struct inode *vi = VFS_I(ni);
+ ntfs_volume *vol = ni->vol;
+ ntfs_inode *base_ni;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ ntfs_attr_search_ctx *ctx;
+ struct page *page;
+ runlist_element *rl;
+ u8 *kaddr;
+ unsigned long flags;
+ int mp_size, mp_ofs, name_ofs, arec_size, err, err2;
+ u32 attr_size;
+ u8 old_res_attr_flags;
+
+ /* Check that the attribute is allowed to be non-resident. */
+ err = ntfs_attr_can_be_non_resident(vol, ni->type);
+ if (unlikely(err)) {
+ if (err == -EPERM)
+ ntfs_debug("Attribute is not allowed to be "
+ "non-resident.");
+ else
+ ntfs_debug("Attribute not defined on the NTFS "
+ "volume!");
+ return err;
+ }
+ /*
+ * FIXME: Compressed and encrypted attributes are not supported when
+ * writing and we should never have gotten here for them.
+ */
+ BUG_ON(NInoCompressed(ni));
+ BUG_ON(NInoEncrypted(ni));
+ /*
+ * The size needs to be aligned to a cluster boundary for allocation
+ * purposes.
+ */
+ new_size = (data_size + vol->cluster_size - 1) &
+ ~(vol->cluster_size - 1);
+ if (new_size > 0) {
+ /*
+ * Will need the page later and since the page lock nests
+ * outside all ntfs locks, we need to get the page now.
+ */
+ page = find_or_create_page(vi->i_mapping, 0,
+ mapping_gfp_mask(vi->i_mapping));
+ if (unlikely(!page))
+ return -ENOMEM;
+ /* Start by allocating clusters to hold the attribute value. */
+ rl = ntfs_cluster_alloc(vol, 0, new_size >>
+ vol->cluster_size_bits, -1, DATA_ZONE, true);
+ if (IS_ERR(rl)) {
+ err = PTR_ERR(rl);
+ ntfs_debug("Failed to allocate cluster%s, error code "
+ "%i.", (new_size >>
+ vol->cluster_size_bits) > 1 ? "s" : "",
+ err);
+ goto page_err_out;
+ }
+ } else {
+ rl = NULL;
+ page = NULL;
+ }
+ /* Determine the size of the mapping pairs array. */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1);
+ if (unlikely(mp_size < 0)) {
+ err = mp_size;
+ ntfs_debug("Failed to get size for mapping pairs array, error "
+ "code %i.", err);
+ goto rl_err_out;
+ }
+ down_write(&ni->runlist.lock);
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ ctx = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ BUG_ON(NInoNonResident(ni));
+ BUG_ON(a->non_resident);
+ /*
+ * Calculate new offsets for the name and the mapping pairs array.
+ */
+ if (NInoSparse(ni) || NInoCompressed(ni))
+ name_ofs = (offsetof(ATTR_REC,
+ data.non_resident.compressed_size) +
+ sizeof(a->data.non_resident.compressed_size) +
+ 7) & ~7;
+ else
+ name_ofs = (offsetof(ATTR_REC,
+ data.non_resident.compressed_size) + 7) & ~7;
+ mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
+ /*
+ * Determine the size of the resident part of the now non-resident
+ * attribute record.
+ */
+ arec_size = (mp_ofs + mp_size + 7) & ~7;
+ /*
+ * If the page is not uptodate bring it uptodate by copying from the
+ * attribute value.
+ */
+ attr_size = le32_to_cpu(a->data.resident.value_length);
+ BUG_ON(attr_size != data_size);
+ if (page && !PageUptodate(page)) {
+ kaddr = kmap_atomic(page);
+ memcpy(kaddr, (u8*)a +
+ le16_to_cpu(a->data.resident.value_offset),
+ attr_size);
+ memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+ /* Backup the attribute flag. */
+ old_res_attr_flags = a->data.resident.flags;
+ /* Resize the resident part of the attribute record. */
+ err = ntfs_attr_record_resize(m, a, arec_size);
+ if (unlikely(err))
+ goto err_out;
+ /*
+ * Convert the resident part of the attribute record to describe a
+ * non-resident attribute.
+ */
+ a->non_resident = 1;
+ /* Move the attribute name if it exists and update the offset. */
+ if (a->name_length)
+ memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
+ a->name_length * sizeof(ntfschar));
+ a->name_offset = cpu_to_le16(name_ofs);
+ /* Setup the fields specific to non-resident attributes. */
+ a->data.non_resident.lowest_vcn = 0;
+ a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >>
+ vol->cluster_size_bits);
+ a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs);
+ memset(&a->data.non_resident.reserved, 0,
+ sizeof(a->data.non_resident.reserved));
+ a->data.non_resident.allocated_size = cpu_to_sle64(new_size);
+ a->data.non_resident.data_size =
+ a->data.non_resident.initialized_size =
+ cpu_to_sle64(attr_size);
+ if (NInoSparse(ni) || NInoCompressed(ni)) {
+ a->data.non_resident.compression_unit = 0;
+ if (NInoCompressed(ni) || vol->major_ver < 3)
+ a->data.non_resident.compression_unit = 4;
+ a->data.non_resident.compressed_size =
+ a->data.non_resident.allocated_size;
+ } else
+ a->data.non_resident.compression_unit = 0;
+ /* Generate the mapping pairs array into the attribute record. */
+ err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
+ arec_size - mp_ofs, rl, 0, -1, NULL);
+ if (unlikely(err)) {
+ ntfs_debug("Failed to build mapping pairs, error code %i.",
+ err);
+ goto undo_err_out;
+ }
+ /* Setup the in-memory attribute structure to be non-resident. */
+ ni->runlist.rl = rl;
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = new_size;
+ if (NInoSparse(ni) || NInoCompressed(ni)) {
+ ni->itype.compressed.size = ni->allocated_size;
+ if (a->data.non_resident.compression_unit) {
+ ni->itype.compressed.block_size = 1U << (a->data.
+ non_resident.compression_unit +
+ vol->cluster_size_bits);
+ ni->itype.compressed.block_size_bits =
+ ffs(ni->itype.compressed.block_size) -
+ 1;
+ ni->itype.compressed.block_clusters = 1U <<
+ a->data.non_resident.compression_unit;
+ } else {
+ ni->itype.compressed.block_size = 0;
+ ni->itype.compressed.block_size_bits = 0;
+ ni->itype.compressed.block_clusters = 0;
+ }
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ } else
+ vi->i_blocks = ni->allocated_size >> 9;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ /*
+ * This needs to be last since the address space operations ->read_folio
+ * and ->writepage can run concurrently with us as they are not
+ * serialized on i_mutex. Note, we are not allowed to fail once we flip
+ * this switch, which is another reason to do this last.
+ */
+ NInoSetNonResident(ni);
+ /* Mark the mft record dirty, so it gets written back. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ if (page) {
+ set_page_dirty(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ ntfs_debug("Done.");
+ return 0;
+undo_err_out:
+ /* Convert the attribute back into a resident attribute. */
+ a->non_resident = 0;
+ /* Move the attribute name if it exists and update the offset. */
+ name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) +
+ sizeof(a->data.resident.reserved) + 7) & ~7;
+ if (a->name_length)
+ memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
+ a->name_length * sizeof(ntfschar));
+ mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
+ a->name_offset = cpu_to_le16(name_ofs);
+ arec_size = (mp_ofs + attr_size + 7) & ~7;
+ /* Resize the resident part of the attribute record. */
+ err2 = ntfs_attr_record_resize(m, a, arec_size);
+ if (unlikely(err2)) {
+ /*
+ * This cannot happen (well if memory corruption is at work it
+ * could happen in theory), but deal with it as well as we can.
+ * If the old size is too small, truncate the attribute,
+ * otherwise simply give it a larger allocated size.
+ * FIXME: Should check whether chkdsk complains when the
+ * allocated size is much bigger than the resident value size.
+ */
+ arec_size = le32_to_cpu(a->length);
+ if ((mp_ofs + attr_size) > arec_size) {
+ err2 = attr_size;
+ attr_size = arec_size - mp_ofs;
+ ntfs_error(vol->sb, "Failed to undo partial resident "
+ "to non-resident attribute "
+ "conversion. Truncating inode 0x%lx, "
+ "attribute type 0x%x from %i bytes to "
+ "%i bytes to maintain metadata "
+ "consistency. THIS MEANS YOU ARE "
+ "LOSING %i BYTES DATA FROM THIS %s.",
+ vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ err2, attr_size, err2 - attr_size,
+ ((ni->type == AT_DATA) &&
+ !ni->name_len) ? "FILE": "ATTRIBUTE");
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->initialized_size = attr_size;
+ i_size_write(vi, attr_size);
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ }
+ }
+ /* Setup the fields specific to resident attributes. */
+ a->data.resident.value_length = cpu_to_le32(attr_size);
+ a->data.resident.value_offset = cpu_to_le16(mp_ofs);
+ a->data.resident.flags = old_res_attr_flags;
+ memset(&a->data.resident.reserved, 0,
+ sizeof(a->data.resident.reserved));
+ /* Copy the data from the page back to the attribute value. */
+ if (page) {
+ kaddr = kmap_atomic(page);
+ memcpy((u8*)a + mp_ofs, kaddr, attr_size);
+ kunmap_atomic(kaddr);
+ }
+ /* Setup the allocated size in the ntfs inode in case it changed. */
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = arec_size - mp_ofs;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ /* Mark the mft record dirty, so it gets written back. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ ni->runlist.rl = NULL;
+ up_write(&ni->runlist.lock);
+rl_err_out:
+ if (rl) {
+ if (ntfs_cluster_free_from_rl(vol, rl) < 0) {
+ ntfs_error(vol->sb, "Failed to release allocated "
+ "cluster(s) in error code path. Run "
+ "chkdsk to recover the lost "
+ "cluster(s).");
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rl);
+page_err_out:
+ unlock_page(page);
+ put_page(page);
+ }
+ if (err == -EINVAL)
+ err = -EIO;
+ return err;
+}
+
+/**
+ * ntfs_attr_extend_allocation - extend the allocated space of an attribute
+ * @ni: ntfs inode of the attribute whose allocation to extend
+ * @new_alloc_size: new size in bytes to which to extend the allocation to
+ * @new_data_size: new size in bytes to which to extend the data to
+ * @data_start: beginning of region which is required to be non-sparse
+ *
+ * Extend the allocated space of an attribute described by the ntfs inode @ni
+ * to @new_alloc_size bytes. If @data_start is -1, the whole extension may be
+ * implemented as a hole in the file (as long as both the volume and the ntfs
+ * inode @ni have sparse support enabled). If @data_start is >= 0, then the
+ * region between the old allocated size and @data_start - 1 may be made sparse
+ * but the regions between @data_start and @new_alloc_size must be backed by
+ * actual clusters.
+ *
+ * If @new_data_size is -1, it is ignored. If it is >= 0, then the data size
+ * of the attribute is extended to @new_data_size. Note that the i_size of the
+ * vfs inode is not updated. Only the data size in the base attribute record
+ * is updated. The caller has to update i_size separately if this is required.
+ * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
+ * size as well as for @new_data_size to be greater than @new_alloc_size.
+ *
+ * For resident attributes this involves resizing the attribute record and if
+ * necessary moving it and/or other attributes into extent mft records and/or
+ * converting the attribute to a non-resident attribute which in turn involves
+ * extending the allocation of a non-resident attribute as described below.
+ *
+ * For non-resident attributes this involves allocating clusters in the data
+ * zone on the volume (except for regions that are being made sparse) and
+ * extending the run list to describe the allocated clusters as well as
+ * updating the mapping pairs array of the attribute. This in turn involves
+ * resizing the attribute record and if necessary moving it and/or other
+ * attributes into extent mft records and/or splitting the attribute record
+ * into multiple extent attribute records.
+ *
+ * Also, the attribute list attribute is updated if present and in some of the
+ * above cases (the ones where extent mft records/attributes come into play),
+ * an attribute list attribute is created if not already present.
+ *
+ * Return the new allocated size on success and -errno on error. In the case
+ * that an error is encountered but a partial extension at least up to
+ * @data_start (if present) is possible, the allocation is partially extended
+ * and this is returned. This means the caller must check the returned size to
+ * determine if the extension was partial. If @data_start is -1 then partial
+ * allocations are not performed.
+ *
+ * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA.
+ *
+ * Locking: This function takes the runlist lock of @ni for writing as well as
+ * locking the mft record of the base ntfs inode. These locks are maintained
+ * throughout execution of the function. These locks are required so that the
+ * attribute can be resized safely and so that it can for example be converted
+ * from resident to non-resident safely.
+ *
+ * TODO: At present attribute list attribute handling is not implemented.
+ *
+ * TODO: At present it is not safe to call this function for anything other
+ * than the $DATA attribute(s) of an uncompressed and unencrypted file.
+ */
+s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
+ const s64 new_data_size, const s64 data_start)
+{
+ VCN vcn;
+ s64 ll, allocated_size, start = data_start;
+ struct inode *vi = VFS_I(ni);
+ ntfs_volume *vol = ni->vol;
+ ntfs_inode *base_ni;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ ntfs_attr_search_ctx *ctx;
+ runlist_element *rl, *rl2;
+ unsigned long flags;
+ int err, mp_size;
+ u32 attr_len = 0; /* Silence stupid gcc warning. */
+ bool mp_rebuilt;
+
+#ifdef DEBUG
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+ "old_allocated_size 0x%llx, "
+ "new_allocated_size 0x%llx, new_data_size 0x%llx, "
+ "data_start 0x%llx.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)allocated_size,
+ (unsigned long long)new_alloc_size,
+ (unsigned long long)new_data_size,
+ (unsigned long long)start);
+#endif
+retry_extend:
+ /*
+ * For non-resident attributes, @start and @new_size need to be aligned
+ * to cluster boundaries for allocation purposes.
+ */
+ if (NInoNonResident(ni)) {
+ if (start > 0)
+ start &= ~(s64)vol->cluster_size_mask;
+ new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
+ ~(s64)vol->cluster_size_mask;
+ }
+ BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size);
+ /* Check if new size is allowed in $AttrDef. */
+ err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
+ if (unlikely(err)) {
+ /* Only emit errors when the write will fail completely. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (start < 0 || start >= allocated_size) {
+ if (err == -ERANGE) {
+ ntfs_error(vol->sb, "Cannot extend allocation "
+ "of inode 0x%lx, attribute "
+ "type 0x%x, because the new "
+ "allocation would exceed the "
+ "maximum allowed size for "
+ "this attribute type.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ } else {
+ ntfs_error(vol->sb, "Cannot extend allocation "
+ "of inode 0x%lx, attribute "
+ "type 0x%x, because this "
+ "attribute type is not "
+ "defined on the NTFS volume. "
+ "Possible corruption! You "
+ "should run chkdsk!",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ }
+ }
+ /* Translate error code to be POSIX conformant for write(2). */
+ if (err == -ERANGE)
+ err = -EFBIG;
+ else
+ err = -EIO;
+ return err;
+ }
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ /*
+ * We will be modifying both the runlist (if non-resident) and the mft
+ * record so lock them both down.
+ */
+ down_write(&ni->runlist.lock);
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ ctx = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /*
+ * If non-resident, seek to the last extent. If resident, there is
+ * only one extent, so seek to that.
+ */
+ vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits :
+ 0;
+ /*
+ * Abort if someone did the work whilst we waited for the locks. If we
+ * just converted the attribute from resident to non-resident it is
+ * likely that exactly this has happened already. We cannot quite
+ * abort if we need to update the data size.
+ */
+ if (unlikely(new_alloc_size <= allocated_size)) {
+ ntfs_debug("Allocated size already exceeds requested size.");
+ new_alloc_size = allocated_size;
+ if (new_data_size < 0)
+ goto done;
+ /*
+ * We want the first attribute extent so that we can update the
+ * data size.
+ */
+ vcn = 0;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, vcn, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ /* Use goto to reduce indentation. */
+ if (a->non_resident)
+ goto do_non_resident_extend;
+ BUG_ON(NInoNonResident(ni));
+ /* The total length of the attribute value. */
+ attr_len = le32_to_cpu(a->data.resident.value_length);
+ /*
+ * Extend the attribute record to be able to store the new attribute
+ * size. ntfs_attr_record_resize() will not do anything if the size is
+ * not changing.
+ */
+ if (new_alloc_size < vol->mft_record_size &&
+ !ntfs_attr_record_resize(m, a,
+ le16_to_cpu(a->data.resident.value_offset) +
+ new_alloc_size)) {
+ /* The resize succeeded! */
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = le32_to_cpu(a->length) -
+ le16_to_cpu(a->data.resident.value_offset);
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ if (new_data_size >= 0) {
+ BUG_ON(new_data_size < attr_len);
+ a->data.resident.value_length =
+ cpu_to_le32((u32)new_data_size);
+ }
+ goto flush_done;
+ }
+ /*
+ * We have to drop all the locks so we can call
+ * ntfs_attr_make_non_resident(). This could be optimised by try-
+ * locking the first page cache page and only if that fails dropping
+ * the locks, locking the page, and redoing all the locking and
+ * lookups. While this would be a huge optimisation, it is not worth
+ * it as this is definitely a slow code path.
+ */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ /*
+ * Not enough space in the mft record, try to make the attribute
+ * non-resident and if successful restart the extension process.
+ */
+ err = ntfs_attr_make_non_resident(ni, attr_len);
+ if (likely(!err))
+ goto retry_extend;
+ /*
+ * Could not make non-resident. If this is due to this not being
+ * permitted for this attribute type or there not being enough space,
+ * try to make other attributes non-resident. Otherwise fail.
+ */
+ if (unlikely(err != -EPERM && err != -ENOSPC)) {
+ /* Only emit errors when the write will fail completely. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because the conversion from resident "
+ "to non-resident attribute failed "
+ "with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err != -ENOMEM)
+ err = -EIO;
+ goto conv_err_out;
+ }
+ /* TODO: Not implemented from here, abort. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ allocated_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (start < 0 || start >= allocated_size) {
+ if (err == -ENOSPC)
+ ntfs_error(vol->sb, "Not enough space in the mft "
+ "record/on disk for the non-resident "
+ "attribute value. This case is not "
+ "implemented yet.");
+ else /* if (err == -EPERM) */
+ ntfs_error(vol->sb, "This attribute type may not be "
+ "non-resident. This case is not "
+ "implemented yet.");
+ }
+ err = -EOPNOTSUPP;
+ goto conv_err_out;
+#if 0
+ // TODO: Attempt to make other attributes non-resident.
+ if (!err)
+ goto do_resident_extend;
+ /*
+ * Both the attribute list attribute and the standard information
+ * attribute must remain in the base inode. Thus, if this is one of
+ * these attributes, we have to try to move other attributes out into
+ * extent mft records instead.
+ */
+ if (ni->type == AT_ATTRIBUTE_LIST ||
+ ni->type == AT_STANDARD_INFORMATION) {
+ // TODO: Attempt to move other attributes into extent mft
+ // records.
+ err = -EOPNOTSUPP;
+ if (!err)
+ goto do_resident_extend;
+ goto err_out;
+ }
+ // TODO: Attempt to move this attribute to an extent mft record, but
+ // only if it is not already the only attribute in an mft record in
+ // which case there would be nothing to gain.
+ err = -EOPNOTSUPP;
+ if (!err)
+ goto do_resident_extend;
+ /* There is nothing we can do to make enough space. )-: */
+ goto err_out;
+#endif
+do_non_resident_extend:
+ BUG_ON(!NInoNonResident(ni));
+ if (new_alloc_size == allocated_size) {
+ BUG_ON(vcn);
+ goto alloc_done;
+ }
+ /*
+ * If the data starts after the end of the old allocation, this is a
+ * $DATA attribute and sparse attributes are enabled on the volume and
+ * for this inode, then create a sparse region between the old
+ * allocated size and the start of the data. Otherwise simply proceed
+ * with filling the whole space between the old allocated size and the
+ * new allocated size with clusters.
+ */
+ if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
+ !NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
+ goto skip_sparse;
+ // TODO: This is not implemented yet. We just fill in with real
+ // clusters for now...
+ ntfs_debug("Inserting holes is not-implemented yet. Falling back to "
+ "allocating real clusters instead.");
+skip_sparse:
+ rl = ni->runlist.rl;
+ if (likely(rl)) {
+ /* Seek to the end of the runlist. */
+ while (rl->length)
+ rl++;
+ }
+ /* If this attribute extent is not mapped, map it now. */
+ if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
+ (rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
+ (rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
+ if (!rl && !allocated_size)
+ goto first_alloc;
+ rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
+ if (IS_ERR(rl)) {
+ err = PTR_ERR(rl);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation "
+ "of inode 0x%lx, attribute "
+ "type 0x%x, because the "
+ "mapping of a runlist "
+ "fragment failed with error "
+ "code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ err);
+ if (err != -ENOMEM)
+ err = -EIO;
+ goto err_out;
+ }
+ ni->runlist.rl = rl;
+ /* Seek to the end of the runlist. */
+ while (rl->length)
+ rl++;
+ }
+ /*
+ * We now know the runlist of the last extent is mapped and @rl is at
+ * the end of the runlist. We want to begin allocating clusters
+ * starting at the last allocated cluster to reduce fragmentation. If
+ * there are no valid LCNs in the attribute we let the cluster
+ * allocator choose the starting cluster.
+ */
+ /* If the last LCN is a hole or simillar seek back to last real LCN. */
+ while (rl->lcn < 0 && rl > ni->runlist.rl)
+ rl--;
+first_alloc:
+ // FIXME: Need to implement partial allocations so at least part of the
+ // write can be performed when start >= 0. (Needed for POSIX write(2)
+ // conformance.)
+ rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
+ (new_alloc_size - allocated_size) >>
+ vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
+ rl->lcn + rl->length : -1, DATA_ZONE, true);
+ if (IS_ERR(rl2)) {
+ err = PTR_ERR(rl2);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because the allocation of clusters "
+ "failed with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err != -ENOMEM && err != -ENOSPC)
+ err = -EIO;
+ goto err_out;
+ }
+ rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
+ if (IS_ERR(rl)) {
+ err = PTR_ERR(rl);
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because the runlist merge failed "
+ "with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err != -ENOMEM)
+ err = -EIO;
+ if (ntfs_cluster_free_from_rl(vol, rl2)) {
+ ntfs_error(vol->sb, "Failed to release allocated "
+ "cluster(s) in error code path. Run "
+ "chkdsk to recover the lost "
+ "cluster(s).");
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rl2);
+ goto err_out;
+ }
+ ni->runlist.rl = rl;
+ ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
+ allocated_size) >> vol->cluster_size_bits);
+ /* Find the runlist element with which the attribute extent starts. */
+ ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+ rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
+ BUG_ON(!rl2);
+ BUG_ON(!rl2->length);
+ BUG_ON(rl2->lcn < LCN_HOLE);
+ mp_rebuilt = false;
+ /* Get the size for the new mapping pairs array for this extent. */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
+ if (unlikely(mp_size <= 0)) {
+ err = mp_size;
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because determining the size for the "
+ "mapping pairs failed with error code "
+ "%i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ err = -EIO;
+ goto undo_alloc;
+ }
+ /* Extend the attribute record to fit the bigger mapping pairs array. */
+ attr_len = le32_to_cpu(a->length);
+ err = ntfs_attr_record_resize(m, a, mp_size +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+ if (unlikely(err)) {
+ BUG_ON(err != -ENOSPC);
+ // TODO: Deal with this by moving this extent to a new mft
+ // record or by starting a new extent in a new mft record,
+ // possibly by extending this extent partially and filling it
+ // and creating a new extent for the remainder, or by making
+ // other attributes non-resident and/or by moving other
+ // attributes out of this mft record.
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Not enough space in the mft "
+ "record for the extended attribute "
+ "record. This case is not "
+ "implemented yet.");
+ err = -EOPNOTSUPP;
+ goto undo_alloc;
+ }
+ mp_rebuilt = true;
+ /* Generate the mapping pairs array directly into the attr record. */
+ err = ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+ mp_size, rl2, ll, -1, NULL);
+ if (unlikely(err)) {
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot extend allocation of "
+ "inode 0x%lx, attribute type 0x%x, "
+ "because building the mapping pairs "
+ "failed with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ err = -EIO;
+ goto undo_alloc;
+ }
+ /* Update the highest_vcn. */
+ a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
+ vol->cluster_size_bits) - 1);
+ /*
+ * We now have extended the allocated size of the attribute. Reflect
+ * this in the ntfs_inode structure and the attribute record.
+ */
+ if (a->data.non_resident.lowest_vcn) {
+ /*
+ * We are not in the first attribute extent, switch to it, but
+ * first ensure the changes will make it to disk later.
+ */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_reinit_search_ctx(ctx);
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err))
+ goto restore_undo_alloc;
+ /* @m is not used any more so no need to set it. */
+ a = ctx->attr;
+ }
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = new_alloc_size;
+ a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
+ /*
+ * FIXME: This would fail if @ni is a directory, $MFT, or an index,
+ * since those can have sparse/compressed set. For example can be
+ * set compressed even though it is not compressed itself and in that
+ * case the bit means that files are to be created compressed in the
+ * directory... At present this is ok as this code is only called for
+ * regular files, and only for their $DATA attribute(s).
+ * FIXME: The calculation is wrong if we created a hole above. For now
+ * it does not matter as we never create holes.
+ */
+ if (NInoSparse(ni) || NInoCompressed(ni)) {
+ ni->itype.compressed.size += new_alloc_size - allocated_size;
+ a->data.non_resident.compressed_size =
+ cpu_to_sle64(ni->itype.compressed.size);
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ } else
+ vi->i_blocks = new_alloc_size >> 9;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+alloc_done:
+ if (new_data_size >= 0) {
+ BUG_ON(new_data_size <
+ sle64_to_cpu(a->data.non_resident.data_size));
+ a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
+ }
+flush_done:
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+done:
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ ntfs_debug("Done, new_allocated_size 0x%llx.",
+ (unsigned long long)new_alloc_size);
+ return new_alloc_size;
+restore_undo_alloc:
+ if (start < 0 || start >= allocated_size)
+ ntfs_error(vol->sb, "Cannot complete extension of allocation "
+ "of inode 0x%lx, attribute type 0x%x, because "
+ "lookup of first attribute extent failed with "
+ "error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err == -ENOENT)
+ err = -EIO;
+ ntfs_attr_reinit_search_ctx(ctx);
+ if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
+ allocated_size >> vol->cluster_size_bits, NULL, 0,
+ ctx)) {
+ ntfs_error(vol->sb, "Failed to find last attribute extent of "
+ "attribute in error code path. Run chkdsk to "
+ "recover.");
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = new_alloc_size;
+ /*
+ * FIXME: This would fail if @ni is a directory... See above.
+ * FIXME: The calculation is wrong if we created a hole above.
+ * For now it does not matter as we never create holes.
+ */
+ if (NInoSparse(ni) || NInoCompressed(ni)) {
+ ni->itype.compressed.size += new_alloc_size -
+ allocated_size;
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ } else
+ vi->i_blocks = new_alloc_size >> 9;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ /*
+ * The only thing that is now wrong is the allocated size of the
+ * base attribute extent which chkdsk should be able to fix.
+ */
+ NVolSetErrors(vol);
+ return err;
+ }
+ ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(
+ (allocated_size >> vol->cluster_size_bits) - 1);
+undo_alloc:
+ ll = allocated_size >> vol->cluster_size_bits;
+ if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) {
+ ntfs_error(vol->sb, "Failed to release allocated cluster(s) "
+ "in error code path. Run chkdsk to recover "
+ "the lost cluster(s).");
+ NVolSetErrors(vol);
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ /*
+ * If the runlist truncation fails and/or the search context is no
+ * longer valid, we cannot resize the attribute record or build the
+ * mapping pairs array thus we mark the inode bad so that no access to
+ * the freed clusters can happen.
+ */
+ if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) {
+ ntfs_error(vol->sb, "Failed to %s in error code path. Run "
+ "chkdsk to recover.", IS_ERR(m) ?
+ "restore attribute search context" :
+ "truncate attribute runlist");
+ NVolSetErrors(vol);
+ } else if (mp_rebuilt) {
+ if (ntfs_attr_record_resize(m, a, attr_len)) {
+ ntfs_error(vol->sb, "Failed to restore attribute "
+ "record in error code path. Run "
+ "chkdsk to recover.");
+ NVolSetErrors(vol);
+ } else /* if (success) */ {
+ if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+ a->data.non_resident.
+ mapping_pairs_offset), attr_len -
+ le16_to_cpu(a->data.non_resident.
+ mapping_pairs_offset), rl2, ll, -1,
+ NULL)) {
+ ntfs_error(vol->sb, "Failed to restore "
+ "mapping pairs array in error "
+ "code path. Run chkdsk to "
+ "recover.");
+ NVolSetErrors(vol);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ }
+ }
+err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+conv_err_out:
+ ntfs_debug("Failed. Returning error code %i.", err);
+ return err;
+}
+
+/**
+ * ntfs_attr_set - fill (a part of) an attribute with a byte
+ * @ni: ntfs inode describing the attribute to fill
+ * @ofs: offset inside the attribute at which to start to fill
+ * @cnt: number of bytes to fill
+ * @val: the unsigned 8-bit value with which to fill the attribute
+ *
+ * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at
+ * byte offset @ofs inside the attribute with the constant byte @val.
+ *
+ * This function is effectively like memset() applied to an ntfs attribute.
+ * Note thie function actually only operates on the page cache pages belonging
+ * to the ntfs attribute and it marks them dirty after doing the memset().
+ * Thus it relies on the vm dirty page write code paths to cause the modified
+ * pages to be written to the mft record/disk.
+ *
+ * Return 0 on success and -errno on error. An error code of -ESPIPE means
+ * that @ofs + @cnt were outside the end of the attribute and no write was
+ * performed.
+ */
+int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
+{
+ ntfs_volume *vol = ni->vol;
+ struct address_space *mapping;
+ struct page *page;
+ u8 *kaddr;
+ pgoff_t idx, end;
+ unsigned start_ofs, end_ofs, size;
+
+ ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
+ (long long)ofs, (long long)cnt, val);
+ BUG_ON(ofs < 0);
+ BUG_ON(cnt < 0);
+ if (!cnt)
+ goto done;
+ /*
+ * FIXME: Compressed and encrypted attributes are not supported when
+ * writing and we should never have gotten here for them.
+ */
+ BUG_ON(NInoCompressed(ni));
+ BUG_ON(NInoEncrypted(ni));
+ mapping = VFS_I(ni)->i_mapping;
+ /* Work out the starting index and page offset. */
+ idx = ofs >> PAGE_SHIFT;
+ start_ofs = ofs & ~PAGE_MASK;
+ /* Work out the ending index and page offset. */
+ end = ofs + cnt;
+ end_ofs = end & ~PAGE_MASK;
+ /* If the end is outside the inode size return -ESPIPE. */
+ if (unlikely(end > i_size_read(VFS_I(ni)))) {
+ ntfs_error(vol->sb, "Request exceeds end of attribute.");
+ return -ESPIPE;
+ }
+ end >>= PAGE_SHIFT;
+ /* If there is a first partial page, need to do it the slow way. */
+ if (start_ofs) {
+ page = read_mapping_page(mapping, idx, NULL);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to read first partial "
+ "page (error, index 0x%lx).", idx);
+ return PTR_ERR(page);
+ }
+ /*
+ * If the last page is the same as the first page, need to
+ * limit the write to the end offset.
+ */
+ size = PAGE_SIZE;
+ if (idx == end)
+ size = end_ofs;
+ kaddr = kmap_atomic(page);
+ memset(kaddr + start_ofs, val, size - start_ofs);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ set_page_dirty(page);
+ put_page(page);
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ if (idx == end)
+ goto done;
+ idx++;
+ }
+ /* Do the whole pages the fast way. */
+ for (; idx < end; idx++) {
+ /* Find or create the current page. (The page is locked.) */
+ page = grab_cache_page(mapping, idx);
+ if (unlikely(!page)) {
+ ntfs_error(vol->sb, "Insufficient memory to grab "
+ "page (index 0x%lx).", idx);
+ return -ENOMEM;
+ }
+ kaddr = kmap_atomic(page);
+ memset(kaddr, val, PAGE_SIZE);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ /*
+ * If the page has buffers, mark them uptodate since buffer
+ * state and not page state is definitive in 2.6 kernels.
+ */
+ if (page_has_buffers(page)) {
+ struct buffer_head *bh, *head;
+
+ bh = head = page_buffers(page);
+ do {
+ set_buffer_uptodate(bh);
+ } while ((bh = bh->b_this_page) != head);
+ }
+ /* Now that buffers are uptodate, set the page uptodate, too. */
+ SetPageUptodate(page);
+ /*
+ * Set the page and all its buffers dirty and mark the inode
+ * dirty, too. The VM will write the page later on.
+ */
+ set_page_dirty(page);
+ /* Finally unlock and release the page. */
+ unlock_page(page);
+ put_page(page);
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ }
+ /* If there is a last partial page, need to do it the slow way. */
+ if (end_ofs) {
+ page = read_mapping_page(mapping, idx, NULL);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to read last partial page "
+ "(error, index 0x%lx).", idx);
+ return PTR_ERR(page);
+ }
+ kaddr = kmap_atomic(page);
+ memset(kaddr, val, end_ofs);
+ flush_dcache_page(page);
+ kunmap_atomic(kaddr);
+ set_page_dirty(page);
+ put_page(page);
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ }
+done:
+ ntfs_debug("Done.");
+ return 0;
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
new file mode 100644
index 000000000..fe0890d3d
--- /dev/null
+++ b/fs/ntfs/attrib.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
+ * Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#ifndef _LINUX_NTFS_ATTRIB_H
+#define _LINUX_NTFS_ATTRIB_H
+
+#include "endian.h"
+#include "types.h"
+#include "layout.h"
+#include "inode.h"
+#include "runlist.h"
+#include "volume.h"
+
+/**
+ * ntfs_attr_search_ctx - used in attribute search functions
+ * @mrec: buffer containing mft record to search
+ * @attr: attribute record in @mrec where to begin/continue search
+ * @is_first: if true ntfs_attr_lookup() begins search with @attr, else after
+ *
+ * Structure must be initialized to zero before the first call to one of the
+ * attribute search functions. Initialize @mrec to point to the mft record to
+ * search, and @attr to point to the first attribute within @mrec (not necessary
+ * if calling the _first() functions), and set @is_first to 'true' (not necessary
+ * if calling the _first() functions).
+ *
+ * If @is_first is 'true', the search begins with @attr. If @is_first is 'false',
+ * the search begins after @attr. This is so that, after the first call to one
+ * of the search attribute functions, we can call the function again, without
+ * any modification of the search context, to automagically get the next
+ * matching attribute.
+ */
+typedef struct {
+ MFT_RECORD *mrec;
+ ATTR_RECORD *attr;
+ bool is_first;
+ ntfs_inode *ntfs_ino;
+ ATTR_LIST_ENTRY *al_entry;
+ ntfs_inode *base_ntfs_ino;
+ MFT_RECORD *base_mrec;
+ ATTR_RECORD *base_attr;
+} ntfs_attr_search_ctx;
+
+extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
+ ntfs_attr_search_ctx *ctx);
+extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
+
+extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
+ const bool write_locked);
+
+extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
+ const VCN vcn, ntfs_attr_search_ctx *ctx);
+
+int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
+ const u32 name_len, const IGNORE_CASE_BOOL ic,
+ const VCN lowest_vcn, const u8 *val, const u32 val_len,
+ ntfs_attr_search_ctx *ctx);
+
+extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start,
+ const s64 size, const s64 initialized_size);
+
+static inline s64 ntfs_attr_size(const ATTR_RECORD *a)
+{
+ if (!a->non_resident)
+ return (s64)le32_to_cpu(a->data.resident.value_length);
+ return sle64_to_cpu(a->data.non_resident.data_size);
+}
+
+extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx);
+extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
+ MFT_RECORD *mrec);
+extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
+
+#ifdef NTFS_RW
+
+extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol,
+ const ATTR_TYPE type, const s64 size);
+extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
+ const ATTR_TYPE type);
+extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
+ const ATTR_TYPE type);
+
+extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
+extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
+ const u32 new_size);
+
+extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
+
+extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
+ const s64 new_data_size, const s64 data_start);
+
+extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
+ const u8 val);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
new file mode 100644
index 000000000..0675b2400
--- /dev/null
+++ b/fs/ntfs/bitmap.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * bitmap.c - NTFS kernel bitmap handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004-2005 Anton Altaparmakov
+ */
+
+#ifdef NTFS_RW
+
+#include <linux/pagemap.h>
+
+#include "bitmap.h"
+#include "debug.h"
+#include "aops.h"
+#include "ntfs.h"
+
+/**
+ * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
+ * @vi: vfs inode describing the bitmap
+ * @start_bit: first bit to set
+ * @count: number of bits to set
+ * @value: value to set the bits to (i.e. 0 or 1)
+ * @is_rollback: if 'true' this is a rollback operation
+ *
+ * Set @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi to @value, where @value is either 0 or 1.
+ *
+ * @is_rollback should always be 'false', it is for internal use to rollback
+ * errors. You probably want to use ntfs_bitmap_set_bits_in_run() instead.
+ *
+ * Return 0 on success and -errno on error.
+ */
+int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
+ const s64 count, const u8 value, const bool is_rollback)
+{
+ s64 cnt = count;
+ pgoff_t index, end_index;
+ struct address_space *mapping;
+ struct page *page;
+ u8 *kaddr;
+ int pos, len;
+ u8 bit;
+
+ BUG_ON(!vi);
+ ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, "
+ "value %u.%s", vi->i_ino, (unsigned long long)start_bit,
+ (unsigned long long)cnt, (unsigned int)value,
+ is_rollback ? " (rollback)" : "");
+ BUG_ON(start_bit < 0);
+ BUG_ON(cnt < 0);
+ BUG_ON(value > 1);
+ /*
+ * Calculate the indices for the pages containing the first and last
+ * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
+ */
+ index = start_bit >> (3 + PAGE_SHIFT);
+ end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT);
+
+ /* Get the page containing the first bit (@start_bit). */
+ mapping = vi->i_mapping;
+ page = ntfs_map_page(mapping, index);
+ if (IS_ERR(page)) {
+ if (!is_rollback)
+ ntfs_error(vi->i_sb, "Failed to map first page (error "
+ "%li), aborting.", PTR_ERR(page));
+ return PTR_ERR(page);
+ }
+ kaddr = page_address(page);
+
+ /* Set @pos to the position of the byte containing @start_bit. */
+ pos = (start_bit >> 3) & ~PAGE_MASK;
+
+ /* Calculate the position of @start_bit in the first byte. */
+ bit = start_bit & 7;
+
+ /* If the first byte is partial, modify the appropriate bits in it. */
+ if (bit) {
+ u8 *byte = kaddr + pos;
+ while ((bit & 7) && cnt) {
+ cnt--;
+ if (value)
+ *byte |= 1 << bit++;
+ else
+ *byte &= ~(1 << bit++);
+ }
+ /* If we are done, unmap the page and return success. */
+ if (!cnt)
+ goto done;
+
+ /* Update @pos to the new position. */
+ pos++;
+ }
+ /*
+ * Depending on @value, modify all remaining whole bytes in the page up
+ * to @cnt.
+ */
+ len = min_t(s64, cnt >> 3, PAGE_SIZE - pos);
+ memset(kaddr + pos, value ? 0xff : 0, len);
+ cnt -= len << 3;
+
+ /* Update @len to point to the first not-done byte in the page. */
+ if (cnt < 8)
+ len += pos;
+
+ /* If we are not in the last page, deal with all subsequent pages. */
+ while (index < end_index) {
+ BUG_ON(cnt <= 0);
+
+ /* Update @index and get the next page. */
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ ntfs_unmap_page(page);
+ page = ntfs_map_page(mapping, ++index);
+ if (IS_ERR(page))
+ goto rollback;
+ kaddr = page_address(page);
+ /*
+ * Depending on @value, modify all remaining whole bytes in the
+ * page up to @cnt.
+ */
+ len = min_t(s64, cnt >> 3, PAGE_SIZE);
+ memset(kaddr, value ? 0xff : 0, len);
+ cnt -= len << 3;
+ }
+ /*
+ * The currently mapped page is the last one. If the last byte is
+ * partial, modify the appropriate bits in it. Note, @len is the
+ * position of the last byte inside the page.
+ */
+ if (cnt) {
+ u8 *byte;
+
+ BUG_ON(cnt > 7);
+
+ bit = cnt;
+ byte = kaddr + len;
+ while (bit--) {
+ if (value)
+ *byte |= 1 << bit;
+ else
+ *byte &= ~(1 << bit);
+ }
+ }
+done:
+ /* We are done. Unmap the page and return success. */
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ ntfs_unmap_page(page);
+ ntfs_debug("Done.");
+ return 0;
+rollback:
+ /*
+ * Current state:
+ * - no pages are mapped
+ * - @count - @cnt is the number of bits that have been modified
+ */
+ if (is_rollback)
+ return PTR_ERR(page);
+ if (count != cnt)
+ pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
+ value ? 0 : 1, true);
+ else
+ pos = 0;
+ if (!pos) {
+ /* Rollback was successful. */
+ ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
+ "%li), aborting.", PTR_ERR(page));
+ } else {
+ /* Rollback failed. */
+ ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
+ "%li) and rollback failed (error %i). "
+ "Aborting and leaving inconsistent metadata. "
+ "Unmount and run chkdsk.", PTR_ERR(page), pos);
+ NVolSetErrors(NTFS_SB(vi->i_sb));
+ }
+ return PTR_ERR(page);
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
new file mode 100644
index 000000000..9dd2224ca
--- /dev/null
+++ b/fs/ntfs/bitmap.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * bitmap.h - Defines for NTFS kernel bitmap handling. Part of the Linux-NTFS
+ * project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_BITMAP_H
+#define _LINUX_NTFS_BITMAP_H
+
+#ifdef NTFS_RW
+
+#include <linux/fs.h>
+
+#include "types.h"
+
+extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
+ const s64 count, const u8 value, const bool is_rollback);
+
+/**
+ * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
+ * @vi: vfs inode describing the bitmap
+ * @start_bit: first bit to set
+ * @count: number of bits to set
+ * @value: value to set the bits to (i.e. 0 or 1)
+ *
+ * Set @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi to @value, where @value is either 0 or 1.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
+ const s64 start_bit, const s64 count, const u8 value)
+{
+ return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
+ false);
+}
+
+/**
+ * ntfs_bitmap_set_run - set a run of bits in a bitmap
+ * @vi: vfs inode describing the bitmap
+ * @start_bit: first bit to set
+ * @count: number of bits to set
+ *
+ * Set @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit,
+ const s64 count)
+{
+ return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1);
+}
+
+/**
+ * ntfs_bitmap_clear_run - clear a run of bits in a bitmap
+ * @vi: vfs inode describing the bitmap
+ * @start_bit: first bit to clear
+ * @count: number of bits to clear
+ *
+ * Clear @count bits starting at bit @start_bit in the bitmap described by the
+ * vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
+ const s64 count)
+{
+ return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0);
+}
+
+/**
+ * ntfs_bitmap_set_bit - set a bit in a bitmap
+ * @vi: vfs inode describing the bitmap
+ * @bit: bit to set
+ *
+ * Set bit @bit in the bitmap described by the vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
+{
+ return ntfs_bitmap_set_run(vi, bit, 1);
+}
+
+/**
+ * ntfs_bitmap_clear_bit - clear a bit in a bitmap
+ * @vi: vfs inode describing the bitmap
+ * @bit: bit to clear
+ *
+ * Clear bit @bit in the bitmap described by the vfs inode @vi.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
+{
+ return ntfs_bitmap_clear_run(vi, bit, 1);
+}
+
+#endif /* NTFS_RW */
+
+#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
new file mode 100644
index 000000000..3ab6ec96a
--- /dev/null
+++ b/fs/ntfs/collate.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * collate.c - NTFS kernel collation handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ */
+
+#include "collate.h"
+#include "debug.h"
+#include "ntfs.h"
+
+static int ntfs_collate_binary(ntfs_volume *vol,
+ const void *data1, const int data1_len,
+ const void *data2, const int data2_len)
+{
+ int rc;
+
+ ntfs_debug("Entering.");
+ rc = memcmp(data1, data2, min(data1_len, data2_len));
+ if (!rc && (data1_len != data2_len)) {
+ if (data1_len < data2_len)
+ rc = -1;
+ else
+ rc = 1;
+ }
+ ntfs_debug("Done, returning %i", rc);
+ return rc;
+}
+
+static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
+ const void *data1, const int data1_len,
+ const void *data2, const int data2_len)
+{
+ int rc;
+ u32 d1, d2;
+
+ ntfs_debug("Entering.");
+ // FIXME: We don't really want to bug here.
+ BUG_ON(data1_len != data2_len);
+ BUG_ON(data1_len != 4);
+ d1 = le32_to_cpup(data1);
+ d2 = le32_to_cpup(data2);
+ if (d1 < d2)
+ rc = -1;
+ else {
+ if (d1 == d2)
+ rc = 0;
+ else
+ rc = 1;
+ }
+ ntfs_debug("Done, returning %i", rc);
+ return rc;
+}
+
+typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
+ const void *, const int);
+
+static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
+ ntfs_collate_binary,
+ NULL/*ntfs_collate_file_name*/,
+ NULL/*ntfs_collate_unicode_string*/,
+};
+
+static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
+ ntfs_collate_ntofs_ulong,
+ NULL/*ntfs_collate_ntofs_sid*/,
+ NULL/*ntfs_collate_ntofs_security_hash*/,
+ NULL/*ntfs_collate_ntofs_ulongs*/,
+};
+
+/**
+ * ntfs_collate - collate two data items using a specified collation rule
+ * @vol: ntfs volume to which the data items belong
+ * @cr: collation rule to use when comparing the items
+ * @data1: first data item to collate
+ * @data1_len: length in bytes of @data1
+ * @data2: second data item to collate
+ * @data2_len: length in bytes of @data2
+ *
+ * Collate the two data items @data1 and @data2 using the collation rule @cr
+ * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before,
+ * to match, or to collate after @data2.
+ *
+ * For speed we use the collation rule @cr as an index into two tables of
+ * function pointers to call the appropriate collation function.
+ */
+int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
+ const void *data1, const int data1_len,
+ const void *data2, const int data2_len) {
+ int i;
+
+ ntfs_debug("Entering.");
+ /*
+ * FIXME: At the moment we only support COLLATION_BINARY and
+ * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
+ */
+ BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG);
+ i = le32_to_cpu(cr);
+ BUG_ON(i < 0);
+ if (i <= 0x02)
+ return ntfs_do_collate0x0[i](vol, data1, data1_len,
+ data2, data2_len);
+ BUG_ON(i < 0x10);
+ i -= 0x10;
+ if (likely(i <= 3))
+ return ntfs_do_collate0x1[i](vol, data1, data1_len,
+ data2, data2_len);
+ BUG();
+ return 0;
+}
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
new file mode 100644
index 000000000..f2255619b
--- /dev/null
+++ b/fs/ntfs/collate.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * collate.h - Defines for NTFS kernel collation handling. Part of the
+ * Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_COLLATE_H
+#define _LINUX_NTFS_COLLATE_H
+
+#include "types.h"
+#include "volume.h"
+
+static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
+ int i;
+
+ /*
+ * FIXME: At the moment we only support COLLATION_BINARY and
+ * COLLATION_NTOFS_ULONG, so we return false for everything else for
+ * now.
+ */
+ if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
+ return false;
+ i = le32_to_cpu(cr);
+ if (likely(((i >= 0) && (i <= 0x02)) ||
+ ((i >= 0x10) && (i <= 0x13))))
+ return true;
+ return false;
+}
+
+extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
+ const void *data1, const int data1_len,
+ const void *data2, const int data2_len);
+
+#endif /* _LINUX_NTFS_COLLATE_H */
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
new file mode 100644
index 000000000..587e9b187
--- /dev/null
+++ b/fs/ntfs/compress.c
@@ -0,0 +1,950 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/**
+ * compress.c - NTFS kernel compressed attributes handling.
+ * Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include "attrib.h"
+#include "inode.h"
+#include "debug.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_compression_constants - enum of constants used in the compression code
+ */
+typedef enum {
+ /* Token types and access mask. */
+ NTFS_SYMBOL_TOKEN = 0,
+ NTFS_PHRASE_TOKEN = 1,
+ NTFS_TOKEN_MASK = 1,
+
+ /* Compression sub-block constants. */
+ NTFS_SB_SIZE_MASK = 0x0fff,
+ NTFS_SB_SIZE = 0x1000,
+ NTFS_SB_IS_COMPRESSED = 0x8000,
+
+ /*
+ * The maximum compression block size is by definition 16 * the cluster
+ * size, with the maximum supported cluster size being 4kiB. Thus the
+ * maximum compression buffer size is 64kiB, so we use this when
+ * initializing the compression buffer.
+ */
+ NTFS_MAX_CB_SIZE = 64 * 1024,
+} ntfs_compression_constants;
+
+/**
+ * ntfs_compression_buffer - one buffer for the decompression engine
+ */
+static u8 *ntfs_compression_buffer;
+
+/**
+ * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
+ */
+static DEFINE_SPINLOCK(ntfs_cb_lock);
+
+/**
+ * allocate_compression_buffers - allocate the decompression buffers
+ *
+ * Caller has to hold the ntfs_lock mutex.
+ *
+ * Return 0 on success or -ENOMEM if the allocations failed.
+ */
+int allocate_compression_buffers(void)
+{
+ BUG_ON(ntfs_compression_buffer);
+
+ ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE);
+ if (!ntfs_compression_buffer)
+ return -ENOMEM;
+ return 0;
+}
+
+/**
+ * free_compression_buffers - free the decompression buffers
+ *
+ * Caller has to hold the ntfs_lock mutex.
+ */
+void free_compression_buffers(void)
+{
+ BUG_ON(!ntfs_compression_buffer);
+ vfree(ntfs_compression_buffer);
+ ntfs_compression_buffer = NULL;
+}
+
+/**
+ * zero_partial_compressed_page - zero out of bounds compressed page region
+ */
+static void zero_partial_compressed_page(struct page *page,
+ const s64 initialized_size)
+{
+ u8 *kp = page_address(page);
+ unsigned int kp_ofs;
+
+ ntfs_debug("Zeroing page region outside initialized size.");
+ if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
+ clear_page(kp);
+ return;
+ }
+ kp_ofs = initialized_size & ~PAGE_MASK;
+ memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
+ return;
+}
+
+/**
+ * handle_bounds_compressed_page - test for&handle out of bounds compressed page
+ */
+static inline void handle_bounds_compressed_page(struct page *page,
+ const loff_t i_size, const s64 initialized_size)
+{
+ if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
+ (initialized_size < i_size))
+ zero_partial_compressed_page(page, initialized_size);
+ return;
+}
+
+/**
+ * ntfs_decompress - decompress a compression block into an array of pages
+ * @dest_pages: destination array of pages
+ * @completed_pages: scratch space to track completed pages
+ * @dest_index: current index into @dest_pages (IN/OUT)
+ * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT)
+ * @dest_max_index: maximum index into @dest_pages (IN)
+ * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN)
+ * @xpage: the target page (-1 if none) (IN)
+ * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT)
+ * @cb_start: compression block to decompress (IN)
+ * @cb_size: size of compression block @cb_start in bytes (IN)
+ * @i_size: file size when we started the read (IN)
+ * @initialized_size: initialized file size when we started the read (IN)
+ *
+ * The caller must have disabled preemption. ntfs_decompress() reenables it when
+ * the critical section is finished.
+ *
+ * This decompresses the compression block @cb_start into the array of
+ * destination pages @dest_pages starting at index @dest_index into @dest_pages
+ * and at offset @dest_pos into the page @dest_pages[@dest_index].
+ *
+ * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
+ * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
+ *
+ * @cb_start is a pointer to the compression block which needs decompressing
+ * and @cb_size is the size of @cb_start in bytes (8-64kiB).
+ *
+ * Return 0 if success or -EOVERFLOW on error in the compressed stream.
+ * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
+ * completed during the decompression of the compression block (@cb_start).
+ *
+ * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up
+ * unpredicatbly! You have been warned!
+ *
+ * Note to hackers: This function may not sleep until it has finished accessing
+ * the compression block @cb_start as it is a per-CPU buffer.
+ */
+static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
+ int *dest_index, int *dest_ofs, const int dest_max_index,
+ const int dest_max_ofs, const int xpage, char *xpage_done,
+ u8 *const cb_start, const u32 cb_size, const loff_t i_size,
+ const s64 initialized_size)
+{
+ /*
+ * Pointers into the compressed data, i.e. the compression block (cb),
+ * and the therein contained sub-blocks (sb).
+ */
+ u8 *cb_end = cb_start + cb_size; /* End of cb. */
+ u8 *cb = cb_start; /* Current position in cb. */
+ u8 *cb_sb_start = cb; /* Beginning of the current sb in the cb. */
+ u8 *cb_sb_end; /* End of current sb / beginning of next sb. */
+
+ /* Variables for uncompressed data / destination. */
+ struct page *dp; /* Current destination page being worked on. */
+ u8 *dp_addr; /* Current pointer into dp. */
+ u8 *dp_sb_start; /* Start of current sub-block in dp. */
+ u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start +
+ NTFS_SB_SIZE). */
+ u16 do_sb_start; /* @dest_ofs when starting this sub-block. */
+ u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start +
+ NTFS_SB_SIZE). */
+
+ /* Variables for tag and token parsing. */
+ u8 tag; /* Current tag. */
+ int token; /* Loop counter for the eight tokens in tag. */
+ int nr_completed_pages = 0;
+
+ /* Default error code. */
+ int err = -EOVERFLOW;
+
+ ntfs_debug("Entering, cb_size = 0x%x.", cb_size);
+do_next_sb:
+ ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.",
+ cb - cb_start);
+ /*
+ * Have we reached the end of the compression block or the end of the
+ * decompressed data? The latter can happen for example if the current
+ * position in the compression block is one byte before its end so the
+ * first two checks do not detect it.
+ */
+ if (cb == cb_end || !le16_to_cpup((le16*)cb) ||
+ (*dest_index == dest_max_index &&
+ *dest_ofs == dest_max_ofs)) {
+ int i;
+
+ ntfs_debug("Completed. Returning success (0).");
+ err = 0;
+return_error:
+ /* We can sleep from now on, so we drop lock. */
+ spin_unlock(&ntfs_cb_lock);
+ /* Second stage: finalize completed pages. */
+ if (nr_completed_pages > 0) {
+ for (i = 0; i < nr_completed_pages; i++) {
+ int di = completed_pages[i];
+
+ dp = dest_pages[di];
+ /*
+ * If we are outside the initialized size, zero
+ * the out of bounds page range.
+ */
+ handle_bounds_compressed_page(dp, i_size,
+ initialized_size);
+ flush_dcache_page(dp);
+ kunmap(dp);
+ SetPageUptodate(dp);
+ unlock_page(dp);
+ if (di == xpage)
+ *xpage_done = 1;
+ else
+ put_page(dp);
+ dest_pages[di] = NULL;
+ }
+ }
+ return err;
+ }
+
+ /* Setup offsets for the current sub-block destination. */
+ do_sb_start = *dest_ofs;
+ do_sb_end = do_sb_start + NTFS_SB_SIZE;
+
+ /* Check that we are still within allowed boundaries. */
+ if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs)
+ goto return_overflow;
+
+ /* Does the minimum size of a compressed sb overflow valid range? */
+ if (cb + 6 > cb_end)
+ goto return_overflow;
+
+ /* Setup the current sub-block source pointers and validate range. */
+ cb_sb_start = cb;
+ cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK)
+ + 3;
+ if (cb_sb_end > cb_end)
+ goto return_overflow;
+
+ /* Get the current destination page. */
+ dp = dest_pages[*dest_index];
+ if (!dp) {
+ /* No page present. Skip decompression of this sub-block. */
+ cb = cb_sb_end;
+
+ /* Advance destination position to next sub-block. */
+ *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK;
+ if (!*dest_ofs && (++*dest_index > dest_max_index))
+ goto return_overflow;
+ goto do_next_sb;
+ }
+
+ /* We have a valid destination page. Setup the destination pointers. */
+ dp_addr = (u8*)page_address(dp) + do_sb_start;
+
+ /* Now, we are ready to process the current sub-block (sb). */
+ if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) {
+ ntfs_debug("Found uncompressed sub-block.");
+ /* This sb is not compressed, just copy it into destination. */
+
+ /* Advance source position to first data byte. */
+ cb += 2;
+
+ /* An uncompressed sb must be full size. */
+ if (cb_sb_end - cb != NTFS_SB_SIZE)
+ goto return_overflow;
+
+ /* Copy the block and advance the source position. */
+ memcpy(dp_addr, cb, NTFS_SB_SIZE);
+ cb += NTFS_SB_SIZE;
+
+ /* Advance destination position to next sub-block. */
+ *dest_ofs += NTFS_SB_SIZE;
+ if (!(*dest_ofs &= ~PAGE_MASK)) {
+finalize_page:
+ /*
+ * First stage: add current page index to array of
+ * completed pages.
+ */
+ completed_pages[nr_completed_pages++] = *dest_index;
+ if (++*dest_index > dest_max_index)
+ goto return_overflow;
+ }
+ goto do_next_sb;
+ }
+ ntfs_debug("Found compressed sub-block.");
+ /* This sb is compressed, decompress it into destination. */
+
+ /* Setup destination pointers. */
+ dp_sb_start = dp_addr;
+ dp_sb_end = dp_sb_start + NTFS_SB_SIZE;
+
+ /* Forward to the first tag in the sub-block. */
+ cb += 2;
+do_next_tag:
+ if (cb == cb_sb_end) {
+ /* Check if the decompressed sub-block was not full-length. */
+ if (dp_addr < dp_sb_end) {
+ int nr_bytes = do_sb_end - *dest_ofs;
+
+ ntfs_debug("Filling incomplete sub-block with "
+ "zeroes.");
+ /* Zero remainder and update destination position. */
+ memset(dp_addr, 0, nr_bytes);
+ *dest_ofs += nr_bytes;
+ }
+ /* We have finished the current sub-block. */
+ if (!(*dest_ofs &= ~PAGE_MASK))
+ goto finalize_page;
+ goto do_next_sb;
+ }
+
+ /* Check we are still in range. */
+ if (cb > cb_sb_end || dp_addr > dp_sb_end)
+ goto return_overflow;
+
+ /* Get the next tag and advance to first token. */
+ tag = *cb++;
+
+ /* Parse the eight tokens described by the tag. */
+ for (token = 0; token < 8; token++, tag >>= 1) {
+ u16 lg, pt, length, max_non_overlap;
+ register u16 i;
+ u8 *dp_back_addr;
+
+ /* Check if we are done / still in range. */
+ if (cb >= cb_sb_end || dp_addr > dp_sb_end)
+ break;
+
+ /* Determine token type and parse appropriately.*/
+ if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) {
+ /*
+ * We have a symbol token, copy the symbol across, and
+ * advance the source and destination positions.
+ */
+ *dp_addr++ = *cb++;
+ ++*dest_ofs;
+
+ /* Continue with the next token. */
+ continue;
+ }
+
+ /*
+ * We have a phrase token. Make sure it is not the first tag in
+ * the sb as this is illegal and would confuse the code below.
+ */
+ if (dp_addr == dp_sb_start)
+ goto return_overflow;
+
+ /*
+ * Determine the number of bytes to go back (p) and the number
+ * of bytes to copy (l). We use an optimized algorithm in which
+ * we first calculate log2(current destination position in sb),
+ * which allows determination of l and p in O(1) rather than
+ * O(n). We just need an arch-optimized log2() function now.
+ */
+ lg = 0;
+ for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1)
+ lg++;
+
+ /* Get the phrase token into i. */
+ pt = le16_to_cpup((le16*)cb);
+
+ /*
+ * Calculate starting position of the byte sequence in
+ * the destination using the fact that p = (pt >> (12 - lg)) + 1
+ * and make sure we don't go too far back.
+ */
+ dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1;
+ if (dp_back_addr < dp_sb_start)
+ goto return_overflow;
+
+ /* Now calculate the length of the byte sequence. */
+ length = (pt & (0xfff >> lg)) + 3;
+
+ /* Advance destination position and verify it is in range. */
+ *dest_ofs += length;
+ if (*dest_ofs > do_sb_end)
+ goto return_overflow;
+
+ /* The number of non-overlapping bytes. */
+ max_non_overlap = dp_addr - dp_back_addr;
+
+ if (length <= max_non_overlap) {
+ /* The byte sequence doesn't overlap, just copy it. */
+ memcpy(dp_addr, dp_back_addr, length);
+
+ /* Advance destination pointer. */
+ dp_addr += length;
+ } else {
+ /*
+ * The byte sequence does overlap, copy non-overlapping
+ * part and then do a slow byte by byte copy for the
+ * overlapping part. Also, advance the destination
+ * pointer.
+ */
+ memcpy(dp_addr, dp_back_addr, max_non_overlap);
+ dp_addr += max_non_overlap;
+ dp_back_addr += max_non_overlap;
+ length -= max_non_overlap;
+ while (length--)
+ *dp_addr++ = *dp_back_addr++;
+ }
+
+ /* Advance source position and continue with the next token. */
+ cb += 2;
+ }
+
+ /* No tokens left in the current tag. Continue with the next tag. */
+ goto do_next_tag;
+
+return_overflow:
+ ntfs_error(NULL, "Failed. Returning -EOVERFLOW.");
+ goto return_error;
+}
+
+/**
+ * ntfs_read_compressed_block - read a compressed block into the page cache
+ * @page: locked page in the compression block(s) we need to read
+ *
+ * When we are called the page has already been verified to be locked and the
+ * attribute is known to be non-resident, not encrypted, but compressed.
+ *
+ * 1. Determine which compression block(s) @page is in.
+ * 2. Get hold of all pages corresponding to this/these compression block(s).
+ * 3. Read the (first) compression block.
+ * 4. Decompress it into the corresponding pages.
+ * 5. Throw the compressed data away and proceed to 3. for the next compression
+ * block or return success if no more compression blocks left.
+ *
+ * Warning: We have to be careful what we do about existing pages. They might
+ * have been written to so that we would lose data if we were to just overwrite
+ * them with the out-of-date uncompressed data.
+ *
+ * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at
+ * the end of the file I think. We need to detect this case and zero the out
+ * of bounds remainder of the page in question and mark it as handled. At the
+ * moment we would just return -EIO on such a page. This bug will only become
+ * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte
+ * clusters so is probably not going to be seen by anyone. Still this should
+ * be fixed. (AIA)
+ *
+ * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in
+ * handling sparse and compressed cbs. (AIA)
+ *
+ * FIXME: At the moment we don't do any zeroing out in the case that
+ * initialized_size is less than data_size. This should be safe because of the
+ * nature of the compression algorithm used. Just in case we check and output
+ * an error message in read inode if the two sizes are not equal for a
+ * compressed file. (AIA)
+ */
+int ntfs_read_compressed_block(struct page *page)
+{
+ loff_t i_size;
+ s64 initialized_size;
+ struct address_space *mapping = page->mapping;
+ ntfs_inode *ni = NTFS_I(mapping->host);
+ ntfs_volume *vol = ni->vol;
+ struct super_block *sb = vol->sb;
+ runlist_element *rl;
+ unsigned long flags, block_size = sb->s_blocksize;
+ unsigned char block_size_bits = sb->s_blocksize_bits;
+ u8 *cb, *cb_pos, *cb_end;
+ struct buffer_head **bhs;
+ unsigned long offset, index = page->index;
+ u32 cb_size = ni->itype.compressed.block_size;
+ u64 cb_size_mask = cb_size - 1UL;
+ VCN vcn;
+ LCN lcn;
+ /* The first wanted vcn (minimum alignment is PAGE_SIZE). */
+ VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >>
+ vol->cluster_size_bits;
+ /*
+ * The first vcn after the last wanted vcn (minimum alignment is again
+ * PAGE_SIZE.
+ */
+ VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1)
+ & ~cb_size_mask) >> vol->cluster_size_bits;
+ /* Number of compression blocks (cbs) in the wanted vcn range. */
+ unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
+ >> ni->itype.compressed.block_size_bits;
+ /*
+ * Number of pages required to store the uncompressed data from all
+ * compression blocks (cbs) overlapping @page. Due to alignment
+ * guarantees of start_vcn and end_vcn, no need to round up here.
+ */
+ unsigned int nr_pages = (end_vcn - start_vcn) <<
+ vol->cluster_size_bits >> PAGE_SHIFT;
+ unsigned int xpage, max_page, cur_page, cur_ofs, i;
+ unsigned int cb_clusters, cb_max_ofs;
+ int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
+ struct page **pages;
+ int *completed_pages;
+ unsigned char xpage_done = 0;
+
+ ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
+ "%i.", index, cb_size, nr_pages);
+ /*
+ * Bad things happen if we get here for anything that is not an
+ * unnamed $DATA attribute.
+ */
+ BUG_ON(ni->type != AT_DATA);
+ BUG_ON(ni->name_len);
+
+ pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
+ completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS);
+
+ /* Allocate memory to store the buffer heads we need. */
+ bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
+ bhs = kmalloc(bhs_size, GFP_NOFS);
+
+ if (unlikely(!pages || !bhs || !completed_pages)) {
+ kfree(bhs);
+ kfree(pages);
+ kfree(completed_pages);
+ unlock_page(page);
+ ntfs_error(vol->sb, "Failed to allocate internal buffers.");
+ return -ENOMEM;
+ }
+
+ /*
+ * We have already been given one page, this is the one we must do.
+ * Once again, the alignment guarantees keep it simple.
+ */
+ offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT;
+ xpage = index - offset;
+ pages[xpage] = page;
+ /*
+ * The remaining pages need to be allocated and inserted into the page
+ * cache, alignment guarantees keep all the below much simpler. (-8
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ i_size = i_size_read(VFS_I(ni));
+ initialized_size = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+ offset;
+ /* Is the page fully outside i_size? (truncate in progress) */
+ if (xpage >= max_page) {
+ kfree(bhs);
+ kfree(pages);
+ kfree(completed_pages);
+ zero_user(page, 0, PAGE_SIZE);
+ ntfs_debug("Compressed read outside i_size - truncated?");
+ SetPageUptodate(page);
+ unlock_page(page);
+ return 0;
+ }
+ if (nr_pages < max_page)
+ max_page = nr_pages;
+ for (i = 0; i < max_page; i++, offset++) {
+ if (i != xpage)
+ pages[i] = grab_cache_page_nowait(mapping, offset);
+ page = pages[i];
+ if (page) {
+ /*
+ * We only (re)read the page if it isn't already read
+ * in and/or dirty or we would be losing data or at
+ * least wasting our time.
+ */
+ if (!PageDirty(page) && (!PageUptodate(page) ||
+ PageError(page))) {
+ ClearPageError(page);
+ kmap(page);
+ continue;
+ }
+ unlock_page(page);
+ put_page(page);
+ pages[i] = NULL;
+ }
+ }
+
+ /*
+ * We have the runlist, and all the destination pages we need to fill.
+ * Now read the first compression block.
+ */
+ cur_page = 0;
+ cur_ofs = 0;
+ cb_clusters = ni->itype.compressed.block_clusters;
+do_next_cb:
+ nr_cbs--;
+ nr_bhs = 0;
+
+ /* Read all cb buffer heads one cluster at a time. */
+ rl = NULL;
+ for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn;
+ vcn++) {
+ bool is_retry = false;
+
+ if (!rl) {
+lock_retry_remap:
+ down_read(&ni->runlist.lock);
+ rl = ni->runlist.rl;
+ }
+ if (likely(rl != NULL)) {
+ /* Seek to element containing target vcn. */
+ while (rl->length && rl[1].vcn <= vcn)
+ rl++;
+ lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+ } else
+ lcn = LCN_RL_NOT_MAPPED;
+ ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
+ (unsigned long long)vcn,
+ (unsigned long long)lcn);
+ if (lcn < 0) {
+ /*
+ * When we reach the first sparse cluster we have
+ * finished with the cb.
+ */
+ if (lcn == LCN_HOLE)
+ break;
+ if (is_retry || lcn != LCN_RL_NOT_MAPPED)
+ goto rl_err;
+ is_retry = true;
+ /*
+ * Attempt to map runlist, dropping lock for the
+ * duration.
+ */
+ up_read(&ni->runlist.lock);
+ if (!ntfs_map_runlist(ni, vcn))
+ goto lock_retry_remap;
+ goto map_rl_err;
+ }
+ block = lcn << vol->cluster_size_bits >> block_size_bits;
+ /* Read the lcn from device in chunks of block_size bytes. */
+ max_block = block + (vol->cluster_size >> block_size_bits);
+ do {
+ ntfs_debug("block = 0x%x.", block);
+ if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block))))
+ goto getblk_err;
+ nr_bhs++;
+ } while (++block < max_block);
+ }
+
+ /* Release the lock if we took it. */
+ if (rl)
+ up_read(&ni->runlist.lock);
+
+ /* Setup and initiate io on all buffer heads. */
+ for (i = 0; i < nr_bhs; i++) {
+ struct buffer_head *tbh = bhs[i];
+
+ if (!trylock_buffer(tbh))
+ continue;
+ if (unlikely(buffer_uptodate(tbh))) {
+ unlock_buffer(tbh);
+ continue;
+ }
+ get_bh(tbh);
+ tbh->b_end_io = end_buffer_read_sync;
+ submit_bh(REQ_OP_READ, tbh);
+ }
+
+ /* Wait for io completion on all buffer heads. */
+ for (i = 0; i < nr_bhs; i++) {
+ struct buffer_head *tbh = bhs[i];
+
+ if (buffer_uptodate(tbh))
+ continue;
+ wait_on_buffer(tbh);
+ /*
+ * We need an optimization barrier here, otherwise we start
+ * hitting the below fixup code when accessing a loopback
+ * mounted ntfs partition. This indicates either there is a
+ * race condition in the loop driver or, more likely, gcc
+ * overoptimises the code without the barrier and it doesn't
+ * do the Right Thing(TM).
+ */
+ barrier();
+ if (unlikely(!buffer_uptodate(tbh))) {
+ ntfs_warning(vol->sb, "Buffer is unlocked but not "
+ "uptodate! Unplugging the disk queue "
+ "and rescheduling.");
+ get_bh(tbh);
+ io_schedule();
+ put_bh(tbh);
+ if (unlikely(!buffer_uptodate(tbh)))
+ goto read_err;
+ ntfs_warning(vol->sb, "Buffer is now uptodate. Good.");
+ }
+ }
+
+ /*
+ * Get the compression buffer. We must not sleep any more
+ * until we are finished with it.
+ */
+ spin_lock(&ntfs_cb_lock);
+ cb = ntfs_compression_buffer;
+
+ BUG_ON(!cb);
+
+ cb_pos = cb;
+ cb_end = cb + cb_size;
+
+ /* Copy the buffer heads into the contiguous buffer. */
+ for (i = 0; i < nr_bhs; i++) {
+ memcpy(cb_pos, bhs[i]->b_data, block_size);
+ cb_pos += block_size;
+ }
+
+ /* Just a precaution. */
+ if (cb_pos + 2 <= cb + cb_size)
+ *(u16*)cb_pos = 0;
+
+ /* Reset cb_pos back to the beginning. */
+ cb_pos = cb;
+
+ /* We now have both source (if present) and destination. */
+ ntfs_debug("Successfully read the compression block.");
+
+ /* The last page and maximum offset within it for the current cb. */
+ cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size;
+ cb_max_ofs = cb_max_page & ~PAGE_MASK;
+ cb_max_page >>= PAGE_SHIFT;
+
+ /* Catch end of file inside a compression block. */
+ if (cb_max_page > max_page)
+ cb_max_page = max_page;
+
+ if (vcn == start_vcn - cb_clusters) {
+ /* Sparse cb, zero out page range overlapping the cb. */
+ ntfs_debug("Found sparse compression block.");
+ /* We can sleep from now on, so we drop lock. */
+ spin_unlock(&ntfs_cb_lock);
+ if (cb_max_ofs)
+ cb_max_page--;
+ for (; cur_page < cb_max_page; cur_page++) {
+ page = pages[cur_page];
+ if (page) {
+ if (likely(!cur_ofs))
+ clear_page(page_address(page));
+ else
+ memset(page_address(page) + cur_ofs, 0,
+ PAGE_SIZE -
+ cur_ofs);
+ flush_dcache_page(page);
+ kunmap(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ if (cur_page == xpage)
+ xpage_done = 1;
+ else
+ put_page(page);
+ pages[cur_page] = NULL;
+ }
+ cb_pos += PAGE_SIZE - cur_ofs;
+ cur_ofs = 0;
+ if (cb_pos >= cb_end)
+ break;
+ }
+ /* If we have a partial final page, deal with it now. */
+ if (cb_max_ofs && cb_pos < cb_end) {
+ page = pages[cur_page];
+ if (page)
+ memset(page_address(page) + cur_ofs, 0,
+ cb_max_ofs - cur_ofs);
+ /*
+ * No need to update cb_pos at this stage:
+ * cb_pos += cb_max_ofs - cur_ofs;
+ */
+ cur_ofs = cb_max_ofs;
+ }
+ } else if (vcn == start_vcn) {
+ /* We can't sleep so we need two stages. */
+ unsigned int cur2_page = cur_page;
+ unsigned int cur_ofs2 = cur_ofs;
+ u8 *cb_pos2 = cb_pos;
+
+ ntfs_debug("Found uncompressed compression block.");
+ /* Uncompressed cb, copy it to the destination pages. */
+ /*
+ * TODO: As a big optimization, we could detect this case
+ * before we read all the pages and use block_read_full_folio()
+ * on all full pages instead (we still have to treat partial
+ * pages especially but at least we are getting rid of the
+ * synchronous io for the majority of pages.
+ * Or if we choose not to do the read-ahead/-behind stuff, we
+ * could just return block_read_full_folio(pages[xpage]) as long
+ * as PAGE_SIZE <= cb_size.
+ */
+ if (cb_max_ofs)
+ cb_max_page--;
+ /* First stage: copy data into destination pages. */
+ for (; cur_page < cb_max_page; cur_page++) {
+ page = pages[cur_page];
+ if (page)
+ memcpy(page_address(page) + cur_ofs, cb_pos,
+ PAGE_SIZE - cur_ofs);
+ cb_pos += PAGE_SIZE - cur_ofs;
+ cur_ofs = 0;
+ if (cb_pos >= cb_end)
+ break;
+ }
+ /* If we have a partial final page, deal with it now. */
+ if (cb_max_ofs && cb_pos < cb_end) {
+ page = pages[cur_page];
+ if (page)
+ memcpy(page_address(page) + cur_ofs, cb_pos,
+ cb_max_ofs - cur_ofs);
+ cb_pos += cb_max_ofs - cur_ofs;
+ cur_ofs = cb_max_ofs;
+ }
+ /* We can sleep from now on, so drop lock. */
+ spin_unlock(&ntfs_cb_lock);
+ /* Second stage: finalize pages. */
+ for (; cur2_page < cb_max_page; cur2_page++) {
+ page = pages[cur2_page];
+ if (page) {
+ /*
+ * If we are outside the initialized size, zero
+ * the out of bounds page range.
+ */
+ handle_bounds_compressed_page(page, i_size,
+ initialized_size);
+ flush_dcache_page(page);
+ kunmap(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ if (cur2_page == xpage)
+ xpage_done = 1;
+ else
+ put_page(page);
+ pages[cur2_page] = NULL;
+ }
+ cb_pos2 += PAGE_SIZE - cur_ofs2;
+ cur_ofs2 = 0;
+ if (cb_pos2 >= cb_end)
+ break;
+ }
+ } else {
+ /* Compressed cb, decompress it into the destination page(s). */
+ unsigned int prev_cur_page = cur_page;
+
+ ntfs_debug("Found compressed compression block.");
+ err = ntfs_decompress(pages, completed_pages, &cur_page,
+ &cur_ofs, cb_max_page, cb_max_ofs, xpage,
+ &xpage_done, cb_pos, cb_size - (cb_pos - cb),
+ i_size, initialized_size);
+ /*
+ * We can sleep from now on, lock already dropped by
+ * ntfs_decompress().
+ */
+ if (err) {
+ ntfs_error(vol->sb, "ntfs_decompress() failed in inode "
+ "0x%lx with error code %i. Skipping "
+ "this compression block.",
+ ni->mft_no, -err);
+ /* Release the unfinished pages. */
+ for (; prev_cur_page < cur_page; prev_cur_page++) {
+ page = pages[prev_cur_page];
+ if (page) {
+ flush_dcache_page(page);
+ kunmap(page);
+ unlock_page(page);
+ if (prev_cur_page != xpage)
+ put_page(page);
+ pages[prev_cur_page] = NULL;
+ }
+ }
+ }
+ }
+
+ /* Release the buffer heads. */
+ for (i = 0; i < nr_bhs; i++)
+ brelse(bhs[i]);
+
+ /* Do we have more work to do? */
+ if (nr_cbs)
+ goto do_next_cb;
+
+ /* We no longer need the list of buffer heads. */
+ kfree(bhs);
+
+ /* Clean up if we have any pages left. Should never happen. */
+ for (cur_page = 0; cur_page < max_page; cur_page++) {
+ page = pages[cur_page];
+ if (page) {
+ ntfs_error(vol->sb, "Still have pages left! "
+ "Terminating them with extreme "
+ "prejudice. Inode 0x%lx, page index "
+ "0x%lx.", ni->mft_no, page->index);
+ flush_dcache_page(page);
+ kunmap(page);
+ unlock_page(page);
+ if (cur_page != xpage)
+ put_page(page);
+ pages[cur_page] = NULL;
+ }
+ }
+
+ /* We no longer need the list of pages. */
+ kfree(pages);
+ kfree(completed_pages);
+
+ /* If we have completed the requested page, we return success. */
+ if (likely(xpage_done))
+ return 0;
+
+ ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
+ "EOVERFLOW" : (!err ? "EIO" : "unknown error"));
+ return err < 0 ? err : -EIO;
+
+read_err:
+ ntfs_error(vol->sb, "IO error while reading compressed data.");
+ /* Release the buffer heads. */
+ for (i = 0; i < nr_bhs; i++)
+ brelse(bhs[i]);
+ goto err_out;
+
+map_rl_err:
+ ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read "
+ "compression block.");
+ goto err_out;
+
+rl_err:
+ up_read(&ni->runlist.lock);
+ ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read "
+ "compression block.");
+ goto err_out;
+
+getblk_err:
+ up_read(&ni->runlist.lock);
+ ntfs_error(vol->sb, "getblk() failed. Cannot read compression block.");
+
+err_out:
+ kfree(bhs);
+ for (i = cur_page; i < max_page; i++) {
+ page = pages[i];
+ if (page) {
+ flush_dcache_page(page);
+ kunmap(page);
+ unlock_page(page);
+ if (i != xpage)
+ put_page(page);
+ }
+ }
+ kfree(pages);
+ kfree(completed_pages);
+ return -EIO;
+}
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
new file mode 100644
index 000000000..a3c1c5656
--- /dev/null
+++ b/fs/ntfs/debug.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include "debug.h"
+
+/**
+ * __ntfs_warning - output a warning to the syslog
+ * @function: name of function outputting the warning
+ * @sb: super block of mounted ntfs filesystem
+ * @fmt: warning string containing format specifications
+ * @...: a variable number of arguments specified in @fmt
+ *
+ * Outputs a warning to the syslog for the mounted ntfs filesystem described
+ * by @sb.
+ *
+ * @fmt and the corresponding @... is printf style format string containing
+ * the warning string and the corresponding format arguments, respectively.
+ *
+ * @function is the name of the function from which __ntfs_warning is being
+ * called.
+ *
+ * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead
+ * as this provides the @function parameter automatically.
+ */
+void __ntfs_warning(const char *function, const struct super_block *sb,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int flen = 0;
+
+#ifndef DEBUG
+ if (!printk_ratelimit())
+ return;
+#endif
+ if (function)
+ flen = strlen(function);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (sb)
+ pr_warn("(device %s): %s(): %pV\n",
+ sb->s_id, flen ? function : "", &vaf);
+ else
+ pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
+ va_end(args);
+}
+
+/**
+ * __ntfs_error - output an error to the syslog
+ * @function: name of function outputting the error
+ * @sb: super block of mounted ntfs filesystem
+ * @fmt: error string containing format specifications
+ * @...: a variable number of arguments specified in @fmt
+ *
+ * Outputs an error to the syslog for the mounted ntfs filesystem described
+ * by @sb.
+ *
+ * @fmt and the corresponding @... is printf style format string containing
+ * the error string and the corresponding format arguments, respectively.
+ *
+ * @function is the name of the function from which __ntfs_error is being
+ * called.
+ *
+ * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead
+ * as this provides the @function parameter automatically.
+ */
+void __ntfs_error(const char *function, const struct super_block *sb,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int flen = 0;
+
+#ifndef DEBUG
+ if (!printk_ratelimit())
+ return;
+#endif
+ if (function)
+ flen = strlen(function);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (sb)
+ pr_err("(device %s): %s(): %pV\n",
+ sb->s_id, flen ? function : "", &vaf);
+ else
+ pr_err("%s(): %pV\n", flen ? function : "", &vaf);
+ va_end(args);
+}
+
+#ifdef DEBUG
+
+/* If 1, output debug messages, and if 0, don't. */
+int debug_msgs = 0;
+
+void __ntfs_debug(const char *file, int line, const char *function,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int flen = 0;
+
+ if (!debug_msgs)
+ return;
+ if (function)
+ flen = strlen(function);
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
+ va_end(args);
+}
+
+/* Dump a runlist. Caller has to provide synchronisation for @rl. */
+void ntfs_debug_dump_runlist(const runlist_element *rl)
+{
+ int i;
+ const char *lcn_str[5] = { "LCN_HOLE ", "LCN_RL_NOT_MAPPED",
+ "LCN_ENOENT ", "LCN_unknown " };
+
+ if (!debug_msgs)
+ return;
+ pr_debug("Dumping runlist (values in hex):\n");
+ if (!rl) {
+ pr_debug("Run list not present.\n");
+ return;
+ }
+ pr_debug("VCN LCN Run length\n");
+ for (i = 0; ; i++) {
+ LCN lcn = (rl + i)->lcn;
+
+ if (lcn < (LCN)0) {
+ int index = -lcn - 1;
+
+ if (index > -LCN_ENOENT - 1)
+ index = 3;
+ pr_debug("%-16Lx %s %-16Lx%s\n",
+ (long long)(rl + i)->vcn, lcn_str[index],
+ (long long)(rl + i)->length,
+ (rl + i)->length ? "" :
+ " (runlist end)");
+ } else
+ pr_debug("%-16Lx %-16Lx %-16Lx%s\n",
+ (long long)(rl + i)->vcn,
+ (long long)(rl + i)->lcn,
+ (long long)(rl + i)->length,
+ (rl + i)->length ? "" :
+ " (runlist end)");
+ if (!(rl + i)->length)
+ break;
+ }
+}
+
+#endif
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
new file mode 100644
index 000000000..6fdef388f
--- /dev/null
+++ b/fs/ntfs/debug.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_DEBUG_H
+#define _LINUX_NTFS_DEBUG_H
+
+#include <linux/fs.h>
+
+#include "runlist.h"
+
+#ifdef DEBUG
+
+extern int debug_msgs;
+
+extern __printf(4, 5)
+void __ntfs_debug(const char *file, int line, const char *function,
+ const char *format, ...);
+/**
+ * ntfs_debug - write a debug level message to syslog
+ * @f: a printf format string containing the message
+ * @...: the variables to substitute into @f
+ *
+ * ntfs_debug() writes a DEBUG level message to the syslog but only if the
+ * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP.
+ */
+#define ntfs_debug(f, a...) \
+ __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a)
+
+extern void ntfs_debug_dump_runlist(const runlist_element *rl);
+
+#else /* !DEBUG */
+
+#define ntfs_debug(fmt, ...) \
+do { \
+ if (0) \
+ no_printk(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define ntfs_debug_dump_runlist(rl) do {} while (0)
+
+#endif /* !DEBUG */
+
+extern __printf(3, 4)
+void __ntfs_warning(const char *function, const struct super_block *sb,
+ const char *fmt, ...);
+#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a)
+
+extern __printf(3, 4)
+void __ntfs_error(const char *function, const struct super_block *sb,
+ const char *fmt, ...);
+#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a)
+
+#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
new file mode 100644
index 000000000..cd96083a1
--- /dev/null
+++ b/fs/ntfs/dir.c
@@ -0,0 +1,1538 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/**
+ * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+
+#include "dir.h"
+#include "aops.h"
+#include "attrib.h"
+#include "mft.h"
+#include "debug.h"
+#include "ntfs.h"
+
+/**
+ * The little endian Unicode string $I30 as a global constant.
+ */
+ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
+ cpu_to_le16('3'), cpu_to_le16('0'), 0 };
+
+/**
+ * ntfs_lookup_inode_by_name - find an inode in a directory given its name
+ * @dir_ni: ntfs inode of the directory in which to search for the name
+ * @uname: Unicode name for which to search in the directory
+ * @uname_len: length of the name @uname in Unicode characters
+ * @res: return the found file name if necessary (see below)
+ *
+ * Look for an inode with name @uname in the directory with inode @dir_ni.
+ * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
+ * the Unicode name. If the name is found in the directory, the corresponding
+ * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
+ * is a 64-bit number containing the sequence number.
+ *
+ * On error, a negative value is returned corresponding to the error code. In
+ * particular if the inode is not found -ENOENT is returned. Note that you
+ * can't just check the return value for being negative, you have to check the
+ * inode number for being negative which you can extract using MREC(return
+ * value).
+ *
+ * Note, @uname_len does not include the (optional) terminating NULL character.
+ *
+ * Note, we look for a case sensitive match first but we also look for a case
+ * insensitive match at the same time. If we find a case insensitive match, we
+ * save that for the case that we don't find an exact match, where we return
+ * the case insensitive match and setup @res (which we allocate!) with the mft
+ * reference, the file name type, length and with a copy of the little endian
+ * Unicode file name itself. If we match a file name which is in the DOS name
+ * space, we only return the mft reference and file name type in @res.
+ * ntfs_lookup() then uses this to find the long file name in the inode itself.
+ * This is to avoid polluting the dcache with short file names. We want them to
+ * work but we don't care for how quickly one can access them. This also fixes
+ * the dcache aliasing issues.
+ *
+ * Locking: - Caller must hold i_mutex on the directory.
+ * - Each page cache page in the index allocation mapping must be
+ * locked whilst being accessed otherwise we may find a corrupt
+ * page due to it being under ->writepage at the moment which
+ * applies the mst protection fixups before writing out and then
+ * removes them again after the write is complete after which it
+ * unlocks the page.
+ */
+MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
+ const int uname_len, ntfs_name **res)
+{
+ ntfs_volume *vol = dir_ni->vol;
+ struct super_block *sb = vol->sb;
+ MFT_RECORD *m;
+ INDEX_ROOT *ir;
+ INDEX_ENTRY *ie;
+ INDEX_ALLOCATION *ia;
+ u8 *index_end;
+ u64 mref;
+ ntfs_attr_search_ctx *ctx;
+ int err, rc;
+ VCN vcn, old_vcn;
+ struct address_space *ia_mapping;
+ struct page *page;
+ u8 *kaddr;
+ ntfs_name *name = NULL;
+
+ BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode));
+ BUG_ON(NInoAttr(dir_ni));
+ /* Get hold of the mft record for the directory. */
+ m = map_mft_record(dir_ni);
+ if (IS_ERR(m)) {
+ ntfs_error(sb, "map_mft_record() failed with error code %ld.",
+ -PTR_ERR(m));
+ return ERR_MREF(PTR_ERR(m));
+ }
+ ctx = ntfs_attr_get_search_ctx(dir_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ /* Find the index root attribute in the mft record. */
+ err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+ 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT) {
+ ntfs_error(sb, "Index root attribute missing in "
+ "directory inode 0x%lx.",
+ dir_ni->mft_no);
+ err = -EIO;
+ }
+ goto err_out;
+ }
+ /* Get to the index root value (it's been verified in read_inode). */
+ ir = (INDEX_ROOT*)((u8*)ctx->attr +
+ le16_to_cpu(ctx->attr->data.resident.value_offset));
+ index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+ /* The first index entry. */
+ ie = (INDEX_ENTRY*)((u8*)&ir->index +
+ le32_to_cpu(ir->index.entries_offset));
+ /*
+ * Loop until we exceed valid memory (corruption case) or until we
+ * reach the last entry.
+ */
+ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+ /* Bounds checks. */
+ if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
+ sizeof(INDEX_ENTRY_HEADER) > index_end ||
+ (u8*)ie + le16_to_cpu(ie->key_length) >
+ index_end)
+ goto dir_err_out;
+ /*
+ * The last entry cannot contain a name. It can however contain
+ * a pointer to a child node in the B+tree so we just break out.
+ */
+ if (ie->flags & INDEX_ENTRY_END)
+ break;
+ /*
+ * We perform a case sensitive comparison and if that matches
+ * we are done and return the mft reference of the inode (i.e.
+ * the inode number together with the sequence number for
+ * consistency checking). We convert it to cpu format before
+ * returning.
+ */
+ if (ntfs_are_names_equal(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length,
+ CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
+found_it:
+ /*
+ * We have a perfect match, so we don't need to care
+ * about having matched imperfectly before, so we can
+ * free name and set *res to NULL.
+ * However, if the perfect match is a short file name,
+ * we need to signal this through *res, so that
+ * ntfs_lookup() can fix dcache aliasing issues.
+ * As an optimization we just reuse an existing
+ * allocation of *res.
+ */
+ if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
+ if (!name) {
+ name = kmalloc(sizeof(ntfs_name),
+ GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ }
+ name->mref = le64_to_cpu(
+ ie->data.dir.indexed_file);
+ name->type = FILE_NAME_DOS;
+ name->len = 0;
+ *res = name;
+ } else {
+ kfree(name);
+ *res = NULL;
+ }
+ mref = le64_to_cpu(ie->data.dir.indexed_file);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(dir_ni);
+ return mref;
+ }
+ /*
+ * For a case insensitive mount, we also perform a case
+ * insensitive comparison (provided the file name is not in the
+ * POSIX namespace). If the comparison matches, and the name is
+ * in the WIN32 namespace, we cache the filename in *res so
+ * that the caller, ntfs_lookup(), can work on it. If the
+ * comparison matches, and the name is in the DOS namespace, we
+ * only cache the mft reference and the file name type (we set
+ * the name length to zero for simplicity).
+ */
+ if (!NVolCaseSensitive(vol) &&
+ ie->key.file_name.file_name_type &&
+ ntfs_are_names_equal(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length,
+ IGNORE_CASE, vol->upcase, vol->upcase_len)) {
+ int name_size = sizeof(ntfs_name);
+ u8 type = ie->key.file_name.file_name_type;
+ u8 len = ie->key.file_name.file_name_length;
+
+ /* Only one case insensitive matching name allowed. */
+ if (name) {
+ ntfs_error(sb, "Found already allocated name "
+ "in phase 1. Please run chkdsk "
+ "and if that doesn't find any "
+ "errors please report you saw "
+ "this message to "
+ "linux-ntfs-dev@lists."
+ "sourceforge.net.");
+ goto dir_err_out;
+ }
+
+ if (type != FILE_NAME_DOS)
+ name_size += len * sizeof(ntfschar);
+ name = kmalloc(name_size, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ name->mref = le64_to_cpu(ie->data.dir.indexed_file);
+ name->type = type;
+ if (type != FILE_NAME_DOS) {
+ name->len = len;
+ memcpy(name->name, ie->key.file_name.file_name,
+ len * sizeof(ntfschar));
+ } else
+ name->len = 0;
+ *res = name;
+ }
+ /*
+ * Not a perfect match, need to do full blown collation so we
+ * know which way in the B+tree we have to go.
+ */
+ rc = ntfs_collate_names(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, 1,
+ IGNORE_CASE, vol->upcase, vol->upcase_len);
+ /*
+ * If uname collates before the name of the current entry, there
+ * is definitely no such name in this index but we might need to
+ * descend into the B+tree so we just break out of the loop.
+ */
+ if (rc == -1)
+ break;
+ /* The names are not equal, continue the search. */
+ if (rc)
+ continue;
+ /*
+ * Names match with case insensitive comparison, now try the
+ * case sensitive comparison, which is required for proper
+ * collation.
+ */
+ rc = ntfs_collate_names(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, 1,
+ CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+ if (rc == -1)
+ break;
+ if (rc)
+ continue;
+ /*
+ * Perfect match, this will never happen as the
+ * ntfs_are_names_equal() call will have gotten a match but we
+ * still treat it correctly.
+ */
+ goto found_it;
+ }
+ /*
+ * We have finished with this index without success. Check for the
+ * presence of a child node and if not present return -ENOENT, unless
+ * we have got a matching name cached in name in which case return the
+ * mft reference associated with it.
+ */
+ if (!(ie->flags & INDEX_ENTRY_NODE)) {
+ if (name) {
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(dir_ni);
+ return name->mref;
+ }
+ ntfs_debug("Entry not found.");
+ err = -ENOENT;
+ goto err_out;
+ } /* Child node present, descend into it. */
+ /* Consistency check: Verify that an index allocation exists. */
+ if (!NInoIndexAllocPresent(dir_ni)) {
+ ntfs_error(sb, "No index allocation attribute but index entry "
+ "requires one. Directory inode 0x%lx is "
+ "corrupt or driver bug.", dir_ni->mft_no);
+ goto err_out;
+ }
+ /* Get the starting vcn of the index_block holding the child node. */
+ vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+ ia_mapping = VFS_I(dir_ni)->i_mapping;
+ /*
+ * We are done with the index root and the mft record. Release them,
+ * otherwise we deadlock with ntfs_map_page().
+ */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(dir_ni);
+ m = NULL;
+ ctx = NULL;
+descend_into_child_node:
+ /*
+ * Convert vcn to index into the index allocation attribute in units
+ * of PAGE_SIZE and map the page cache page, reading it from
+ * disk if necessary.
+ */
+ page = ntfs_map_page(ia_mapping, vcn <<
+ dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
+ if (IS_ERR(page)) {
+ ntfs_error(sb, "Failed to map directory index page, error %ld.",
+ -PTR_ERR(page));
+ err = PTR_ERR(page);
+ goto err_out;
+ }
+ lock_page(page);
+ kaddr = (u8*)page_address(page);
+fast_descend_into_child_node:
+ /* Get to the index allocation block. */
+ ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
+ dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
+ /* Bounds checks. */
+ if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
+ ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
+ "inode 0x%lx or driver bug.", dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* Catch multi sector transfer fixup errors. */
+ if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+ ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+ "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
+ (unsigned long long)vcn, dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
+ ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+ "different from expected VCN (0x%llx). "
+ "Directory inode 0x%lx is corrupt or driver "
+ "bug.", (unsigned long long)
+ sle64_to_cpu(ia->index_block_vcn),
+ (unsigned long long)vcn, dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+ dir_ni->itype.index.block_size) {
+ ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+ "0x%lx has a size (%u) differing from the "
+ "directory specified size (%u). Directory "
+ "inode is corrupt or driver bug.",
+ (unsigned long long)vcn, dir_ni->mft_no,
+ le32_to_cpu(ia->index.allocated_size) + 0x18,
+ dir_ni->itype.index.block_size);
+ goto unm_err_out;
+ }
+ index_end = (u8*)ia + dir_ni->itype.index.block_size;
+ if (index_end > kaddr + PAGE_SIZE) {
+ ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+ "0x%lx crosses page boundary. Impossible! "
+ "Cannot access! This is probably a bug in the "
+ "driver.", (unsigned long long)vcn,
+ dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+ if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
+ ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
+ "inode 0x%lx exceeds maximum size.",
+ (unsigned long long)vcn, dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* The first index entry. */
+ ie = (INDEX_ENTRY*)((u8*)&ia->index +
+ le32_to_cpu(ia->index.entries_offset));
+ /*
+ * Iterate similar to above big loop but applied to index buffer, thus
+ * loop until we exceed valid memory (corruption case) or until we
+ * reach the last entry.
+ */
+ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+ /* Bounds check. */
+ if ((u8*)ie < (u8*)ia || (u8*)ie +
+ sizeof(INDEX_ENTRY_HEADER) > index_end ||
+ (u8*)ie + le16_to_cpu(ie->key_length) >
+ index_end) {
+ ntfs_error(sb, "Index entry out of bounds in "
+ "directory inode 0x%lx.",
+ dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ /*
+ * The last entry cannot contain a name. It can however contain
+ * a pointer to a child node in the B+tree so we just break out.
+ */
+ if (ie->flags & INDEX_ENTRY_END)
+ break;
+ /*
+ * We perform a case sensitive comparison and if that matches
+ * we are done and return the mft reference of the inode (i.e.
+ * the inode number together with the sequence number for
+ * consistency checking). We convert it to cpu format before
+ * returning.
+ */
+ if (ntfs_are_names_equal(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length,
+ CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
+found_it2:
+ /*
+ * We have a perfect match, so we don't need to care
+ * about having matched imperfectly before, so we can
+ * free name and set *res to NULL.
+ * However, if the perfect match is a short file name,
+ * we need to signal this through *res, so that
+ * ntfs_lookup() can fix dcache aliasing issues.
+ * As an optimization we just reuse an existing
+ * allocation of *res.
+ */
+ if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
+ if (!name) {
+ name = kmalloc(sizeof(ntfs_name),
+ GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ goto unm_err_out;
+ }
+ }
+ name->mref = le64_to_cpu(
+ ie->data.dir.indexed_file);
+ name->type = FILE_NAME_DOS;
+ name->len = 0;
+ *res = name;
+ } else {
+ kfree(name);
+ *res = NULL;
+ }
+ mref = le64_to_cpu(ie->data.dir.indexed_file);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ return mref;
+ }
+ /*
+ * For a case insensitive mount, we also perform a case
+ * insensitive comparison (provided the file name is not in the
+ * POSIX namespace). If the comparison matches, and the name is
+ * in the WIN32 namespace, we cache the filename in *res so
+ * that the caller, ntfs_lookup(), can work on it. If the
+ * comparison matches, and the name is in the DOS namespace, we
+ * only cache the mft reference and the file name type (we set
+ * the name length to zero for simplicity).
+ */
+ if (!NVolCaseSensitive(vol) &&
+ ie->key.file_name.file_name_type &&
+ ntfs_are_names_equal(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length,
+ IGNORE_CASE, vol->upcase, vol->upcase_len)) {
+ int name_size = sizeof(ntfs_name);
+ u8 type = ie->key.file_name.file_name_type;
+ u8 len = ie->key.file_name.file_name_length;
+
+ /* Only one case insensitive matching name allowed. */
+ if (name) {
+ ntfs_error(sb, "Found already allocated name "
+ "in phase 2. Please run chkdsk "
+ "and if that doesn't find any "
+ "errors please report you saw "
+ "this message to "
+ "linux-ntfs-dev@lists."
+ "sourceforge.net.");
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ goto dir_err_out;
+ }
+
+ if (type != FILE_NAME_DOS)
+ name_size += len * sizeof(ntfschar);
+ name = kmalloc(name_size, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ goto unm_err_out;
+ }
+ name->mref = le64_to_cpu(ie->data.dir.indexed_file);
+ name->type = type;
+ if (type != FILE_NAME_DOS) {
+ name->len = len;
+ memcpy(name->name, ie->key.file_name.file_name,
+ len * sizeof(ntfschar));
+ } else
+ name->len = 0;
+ *res = name;
+ }
+ /*
+ * Not a perfect match, need to do full blown collation so we
+ * know which way in the B+tree we have to go.
+ */
+ rc = ntfs_collate_names(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, 1,
+ IGNORE_CASE, vol->upcase, vol->upcase_len);
+ /*
+ * If uname collates before the name of the current entry, there
+ * is definitely no such name in this index but we might need to
+ * descend into the B+tree so we just break out of the loop.
+ */
+ if (rc == -1)
+ break;
+ /* The names are not equal, continue the search. */
+ if (rc)
+ continue;
+ /*
+ * Names match with case insensitive comparison, now try the
+ * case sensitive comparison, which is required for proper
+ * collation.
+ */
+ rc = ntfs_collate_names(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, 1,
+ CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+ if (rc == -1)
+ break;
+ if (rc)
+ continue;
+ /*
+ * Perfect match, this will never happen as the
+ * ntfs_are_names_equal() call will have gotten a match but we
+ * still treat it correctly.
+ */
+ goto found_it2;
+ }
+ /*
+ * We have finished with this index buffer without success. Check for
+ * the presence of a child node.
+ */
+ if (ie->flags & INDEX_ENTRY_NODE) {
+ if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
+ ntfs_error(sb, "Index entry with child node found in "
+ "a leaf node in directory inode 0x%lx.",
+ dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* Child node present, descend into it. */
+ old_vcn = vcn;
+ vcn = sle64_to_cpup((sle64*)((u8*)ie +
+ le16_to_cpu(ie->length) - 8));
+ if (vcn >= 0) {
+ /* If vcn is in the same page cache page as old_vcn we
+ * recycle the mapped page. */
+ if (old_vcn << vol->cluster_size_bits >>
+ PAGE_SHIFT == vcn <<
+ vol->cluster_size_bits >>
+ PAGE_SHIFT)
+ goto fast_descend_into_child_node;
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ goto descend_into_child_node;
+ }
+ ntfs_error(sb, "Negative child node vcn in directory inode "
+ "0x%lx.", dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ /*
+ * No child node present, return -ENOENT, unless we have got a matching
+ * name cached in name in which case return the mft reference
+ * associated with it.
+ */
+ if (name) {
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ return name->mref;
+ }
+ ntfs_debug("Entry not found.");
+ err = -ENOENT;
+unm_err_out:
+ unlock_page(page);
+ ntfs_unmap_page(page);
+err_out:
+ if (!err)
+ err = -EIO;
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(dir_ni);
+ if (name) {
+ kfree(name);
+ *res = NULL;
+ }
+ return ERR_MREF(err);
+dir_err_out:
+ ntfs_error(sb, "Corrupt directory. Aborting lookup.");
+ goto err_out;
+}
+
+#if 0
+
+// TODO: (AIA)
+// The algorithm embedded in this code will be required for the time when we
+// want to support adding of entries to directories, where we require correct
+// collation of file names in order not to cause corruption of the filesystem.
+
+/**
+ * ntfs_lookup_inode_by_name - find an inode in a directory given its name
+ * @dir_ni: ntfs inode of the directory in which to search for the name
+ * @uname: Unicode name for which to search in the directory
+ * @uname_len: length of the name @uname in Unicode characters
+ *
+ * Look for an inode with name @uname in the directory with inode @dir_ni.
+ * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
+ * the Unicode name. If the name is found in the directory, the corresponding
+ * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
+ * is a 64-bit number containing the sequence number.
+ *
+ * On error, a negative value is returned corresponding to the error code. In
+ * particular if the inode is not found -ENOENT is returned. Note that you
+ * can't just check the return value for being negative, you have to check the
+ * inode number for being negative which you can extract using MREC(return
+ * value).
+ *
+ * Note, @uname_len does not include the (optional) terminating NULL character.
+ */
+u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
+ const int uname_len)
+{
+ ntfs_volume *vol = dir_ni->vol;
+ struct super_block *sb = vol->sb;
+ MFT_RECORD *m;
+ INDEX_ROOT *ir;
+ INDEX_ENTRY *ie;
+ INDEX_ALLOCATION *ia;
+ u8 *index_end;
+ u64 mref;
+ ntfs_attr_search_ctx *ctx;
+ int err, rc;
+ IGNORE_CASE_BOOL ic;
+ VCN vcn, old_vcn;
+ struct address_space *ia_mapping;
+ struct page *page;
+ u8 *kaddr;
+
+ /* Get hold of the mft record for the directory. */
+ m = map_mft_record(dir_ni);
+ if (IS_ERR(m)) {
+ ntfs_error(sb, "map_mft_record() failed with error code %ld.",
+ -PTR_ERR(m));
+ return ERR_MREF(PTR_ERR(m));
+ }
+ ctx = ntfs_attr_get_search_ctx(dir_ni, m);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ /* Find the index root attribute in the mft record. */
+ err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+ 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT) {
+ ntfs_error(sb, "Index root attribute missing in "
+ "directory inode 0x%lx.",
+ dir_ni->mft_no);
+ err = -EIO;
+ }
+ goto err_out;
+ }
+ /* Get to the index root value (it's been verified in read_inode). */
+ ir = (INDEX_ROOT*)((u8*)ctx->attr +
+ le16_to_cpu(ctx->attr->data.resident.value_offset));
+ index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+ /* The first index entry. */
+ ie = (INDEX_ENTRY*)((u8*)&ir->index +
+ le32_to_cpu(ir->index.entries_offset));
+ /*
+ * Loop until we exceed valid memory (corruption case) or until we
+ * reach the last entry.
+ */
+ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+ /* Bounds checks. */
+ if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
+ sizeof(INDEX_ENTRY_HEADER) > index_end ||
+ (u8*)ie + le16_to_cpu(ie->key_length) >
+ index_end)
+ goto dir_err_out;
+ /*
+ * The last entry cannot contain a name. It can however contain
+ * a pointer to a child node in the B+tree so we just break out.
+ */
+ if (ie->flags & INDEX_ENTRY_END)
+ break;
+ /*
+ * If the current entry has a name type of POSIX, the name is
+ * case sensitive and not otherwise. This has the effect of us
+ * not being able to access any POSIX file names which collate
+ * after the non-POSIX one when they only differ in case, but
+ * anyone doing screwy stuff like that deserves to burn in
+ * hell... Doing that kind of stuff on NT4 actually causes
+ * corruption on the partition even when using SP6a and Linux
+ * is not involved at all.
+ */
+ ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
+ CASE_SENSITIVE;
+ /*
+ * If the names match perfectly, we are done and return the
+ * mft reference of the inode (i.e. the inode number together
+ * with the sequence number for consistency checking. We
+ * convert it to cpu format before returning.
+ */
+ if (ntfs_are_names_equal(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, ic,
+ vol->upcase, vol->upcase_len)) {
+found_it:
+ mref = le64_to_cpu(ie->data.dir.indexed_file);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(dir_ni);
+ return mref;
+ }
+ /*
+ * Not a perfect match, need to do full blown collation so we
+ * know which way in the B+tree we have to go.
+ */
+ rc = ntfs_collate_names(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, 1,
+ IGNORE_CASE, vol->upcase, vol->upcase_len);
+ /*
+ * If uname collates before the name of the current entry, there
+ * is definitely no such name in this index but we might need to
+ * descend into the B+tree so we just break out of the loop.
+ */
+ if (rc == -1)
+ break;
+ /* The names are not equal, continue the search. */
+ if (rc)
+ continue;
+ /*
+ * Names match with case insensitive comparison, now try the
+ * case sensitive comparison, which is required for proper
+ * collation.
+ */
+ rc = ntfs_collate_names(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, 1,
+ CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+ if (rc == -1)
+ break;
+ if (rc)
+ continue;
+ /*
+ * Perfect match, this will never happen as the
+ * ntfs_are_names_equal() call will have gotten a match but we
+ * still treat it correctly.
+ */
+ goto found_it;
+ }
+ /*
+ * We have finished with this index without success. Check for the
+ * presence of a child node.
+ */
+ if (!(ie->flags & INDEX_ENTRY_NODE)) {
+ /* No child node, return -ENOENT. */
+ err = -ENOENT;
+ goto err_out;
+ } /* Child node present, descend into it. */
+ /* Consistency check: Verify that an index allocation exists. */
+ if (!NInoIndexAllocPresent(dir_ni)) {
+ ntfs_error(sb, "No index allocation attribute but index entry "
+ "requires one. Directory inode 0x%lx is "
+ "corrupt or driver bug.", dir_ni->mft_no);
+ goto err_out;
+ }
+ /* Get the starting vcn of the index_block holding the child node. */
+ vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
+ ia_mapping = VFS_I(dir_ni)->i_mapping;
+ /*
+ * We are done with the index root and the mft record. Release them,
+ * otherwise we deadlock with ntfs_map_page().
+ */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(dir_ni);
+ m = NULL;
+ ctx = NULL;
+descend_into_child_node:
+ /*
+ * Convert vcn to index into the index allocation attribute in units
+ * of PAGE_SIZE and map the page cache page, reading it from
+ * disk if necessary.
+ */
+ page = ntfs_map_page(ia_mapping, vcn <<
+ dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
+ if (IS_ERR(page)) {
+ ntfs_error(sb, "Failed to map directory index page, error %ld.",
+ -PTR_ERR(page));
+ err = PTR_ERR(page);
+ goto err_out;
+ }
+ lock_page(page);
+ kaddr = (u8*)page_address(page);
+fast_descend_into_child_node:
+ /* Get to the index allocation block. */
+ ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
+ dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
+ /* Bounds checks. */
+ if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
+ ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
+ "inode 0x%lx or driver bug.", dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* Catch multi sector transfer fixup errors. */
+ if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+ ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+ "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
+ (unsigned long long)vcn, dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
+ ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+ "different from expected VCN (0x%llx). "
+ "Directory inode 0x%lx is corrupt or driver "
+ "bug.", (unsigned long long)
+ sle64_to_cpu(ia->index_block_vcn),
+ (unsigned long long)vcn, dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+ dir_ni->itype.index.block_size) {
+ ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+ "0x%lx has a size (%u) differing from the "
+ "directory specified size (%u). Directory "
+ "inode is corrupt or driver bug.",
+ (unsigned long long)vcn, dir_ni->mft_no,
+ le32_to_cpu(ia->index.allocated_size) + 0x18,
+ dir_ni->itype.index.block_size);
+ goto unm_err_out;
+ }
+ index_end = (u8*)ia + dir_ni->itype.index.block_size;
+ if (index_end > kaddr + PAGE_SIZE) {
+ ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+ "0x%lx crosses page boundary. Impossible! "
+ "Cannot access! This is probably a bug in the "
+ "driver.", (unsigned long long)vcn,
+ dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+ if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
+ ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
+ "inode 0x%lx exceeds maximum size.",
+ (unsigned long long)vcn, dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* The first index entry. */
+ ie = (INDEX_ENTRY*)((u8*)&ia->index +
+ le32_to_cpu(ia->index.entries_offset));
+ /*
+ * Iterate similar to above big loop but applied to index buffer, thus
+ * loop until we exceed valid memory (corruption case) or until we
+ * reach the last entry.
+ */
+ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+ /* Bounds check. */
+ if ((u8*)ie < (u8*)ia || (u8*)ie +
+ sizeof(INDEX_ENTRY_HEADER) > index_end ||
+ (u8*)ie + le16_to_cpu(ie->key_length) >
+ index_end) {
+ ntfs_error(sb, "Index entry out of bounds in "
+ "directory inode 0x%lx.",
+ dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ /*
+ * The last entry cannot contain a name. It can however contain
+ * a pointer to a child node in the B+tree so we just break out.
+ */
+ if (ie->flags & INDEX_ENTRY_END)
+ break;
+ /*
+ * If the current entry has a name type of POSIX, the name is
+ * case sensitive and not otherwise. This has the effect of us
+ * not being able to access any POSIX file names which collate
+ * after the non-POSIX one when they only differ in case, but
+ * anyone doing screwy stuff like that deserves to burn in
+ * hell... Doing that kind of stuff on NT4 actually causes
+ * corruption on the partition even when using SP6a and Linux
+ * is not involved at all.
+ */
+ ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
+ CASE_SENSITIVE;
+ /*
+ * If the names match perfectly, we are done and return the
+ * mft reference of the inode (i.e. the inode number together
+ * with the sequence number for consistency checking. We
+ * convert it to cpu format before returning.
+ */
+ if (ntfs_are_names_equal(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, ic,
+ vol->upcase, vol->upcase_len)) {
+found_it2:
+ mref = le64_to_cpu(ie->data.dir.indexed_file);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ return mref;
+ }
+ /*
+ * Not a perfect match, need to do full blown collation so we
+ * know which way in the B+tree we have to go.
+ */
+ rc = ntfs_collate_names(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, 1,
+ IGNORE_CASE, vol->upcase, vol->upcase_len);
+ /*
+ * If uname collates before the name of the current entry, there
+ * is definitely no such name in this index but we might need to
+ * descend into the B+tree so we just break out of the loop.
+ */
+ if (rc == -1)
+ break;
+ /* The names are not equal, continue the search. */
+ if (rc)
+ continue;
+ /*
+ * Names match with case insensitive comparison, now try the
+ * case sensitive comparison, which is required for proper
+ * collation.
+ */
+ rc = ntfs_collate_names(uname, uname_len,
+ (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, 1,
+ CASE_SENSITIVE, vol->upcase, vol->upcase_len);
+ if (rc == -1)
+ break;
+ if (rc)
+ continue;
+ /*
+ * Perfect match, this will never happen as the
+ * ntfs_are_names_equal() call will have gotten a match but we
+ * still treat it correctly.
+ */
+ goto found_it2;
+ }
+ /*
+ * We have finished with this index buffer without success. Check for
+ * the presence of a child node.
+ */
+ if (ie->flags & INDEX_ENTRY_NODE) {
+ if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
+ ntfs_error(sb, "Index entry with child node found in "
+ "a leaf node in directory inode 0x%lx.",
+ dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* Child node present, descend into it. */
+ old_vcn = vcn;
+ vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
+ if (vcn >= 0) {
+ /* If vcn is in the same page cache page as old_vcn we
+ * recycle the mapped page. */
+ if (old_vcn << vol->cluster_size_bits >>
+ PAGE_SHIFT == vcn <<
+ vol->cluster_size_bits >>
+ PAGE_SHIFT)
+ goto fast_descend_into_child_node;
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ goto descend_into_child_node;
+ }
+ ntfs_error(sb, "Negative child node vcn in directory inode "
+ "0x%lx.", dir_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* No child node, return -ENOENT. */
+ ntfs_debug("Entry not found.");
+ err = -ENOENT;
+unm_err_out:
+ unlock_page(page);
+ ntfs_unmap_page(page);
+err_out:
+ if (!err)
+ err = -EIO;
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(dir_ni);
+ return ERR_MREF(err);
+dir_err_out:
+ ntfs_error(sb, "Corrupt directory. Aborting lookup.");
+ goto err_out;
+}
+
+#endif
+
+/**
+ * ntfs_filldir - ntfs specific filldir method
+ * @vol: current ntfs volume
+ * @ndir: ntfs inode of current directory
+ * @ia_page: page in which the index allocation buffer @ie is in resides
+ * @ie: current index entry
+ * @name: buffer to use for the converted name
+ * @actor: what to feed the entries to
+ *
+ * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
+ * callback.
+ *
+ * If @ia_page is not NULL it is the locked page containing the index
+ * allocation block containing the index entry @ie.
+ *
+ * Note, we drop (and then reacquire) the page lock on @ia_page across the
+ * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup
+ * since ntfs_lookup() will lock the same page. As an optimization, we do not
+ * retake the lock if we are returning a non-zero value as ntfs_readdir()
+ * would need to drop the lock immediately anyway.
+ */
+static inline int ntfs_filldir(ntfs_volume *vol,
+ ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
+ u8 *name, struct dir_context *actor)
+{
+ unsigned long mref;
+ int name_len;
+ unsigned dt_type;
+ FILE_NAME_TYPE_FLAGS name_type;
+
+ name_type = ie->key.file_name.file_name_type;
+ if (name_type == FILE_NAME_DOS) {
+ ntfs_debug("Skipping DOS name space entry.");
+ return 0;
+ }
+ if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) {
+ ntfs_debug("Skipping root directory self reference entry.");
+ return 0;
+ }
+ if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user &&
+ !NVolShowSystemFiles(vol)) {
+ ntfs_debug("Skipping system file.");
+ return 0;
+ }
+ name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name,
+ ie->key.file_name.file_name_length, &name,
+ NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
+ if (name_len <= 0) {
+ ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
+ (long long)MREF_LE(ie->data.dir.indexed_file));
+ return 0;
+ }
+ if (ie->key.file_name.file_attributes &
+ FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT)
+ dt_type = DT_DIR;
+ else
+ dt_type = DT_REG;
+ mref = MREF_LE(ie->data.dir.indexed_file);
+ /*
+ * Drop the page lock otherwise we deadlock with NFS when it calls
+ * ->lookup since ntfs_lookup() will lock the same page.
+ */
+ if (ia_page)
+ unlock_page(ia_page);
+ ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
+ "0x%lx, DT_%s.", name, name_len, actor->pos, mref,
+ dt_type == DT_DIR ? "DIR" : "REG");
+ if (!dir_emit(actor, name, name_len, mref, dt_type))
+ return 1;
+ /* Relock the page but not if we are aborting ->readdir. */
+ if (ia_page)
+ lock_page(ia_page);
+ return 0;
+}
+
+/*
+ * We use the same basic approach as the old NTFS driver, i.e. we parse the
+ * index root entries and then the index allocation entries that are marked
+ * as in use in the index bitmap.
+ *
+ * While this will return the names in random order this doesn't matter for
+ * ->readdir but OTOH results in a faster ->readdir.
+ *
+ * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS
+ * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
+ * modifications).
+ *
+ * Locking: - Caller must hold i_mutex on the directory.
+ * - Each page cache page in the index allocation mapping must be
+ * locked whilst being accessed otherwise we may find a corrupt
+ * page due to it being under ->writepage at the moment which
+ * applies the mst protection fixups before writing out and then
+ * removes them again after the write is complete after which it
+ * unlocks the page.
+ */
+static int ntfs_readdir(struct file *file, struct dir_context *actor)
+{
+ s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
+ loff_t i_size;
+ struct inode *bmp_vi, *vdir = file_inode(file);
+ struct super_block *sb = vdir->i_sb;
+ ntfs_inode *ndir = NTFS_I(vdir);
+ ntfs_volume *vol = NTFS_SB(sb);
+ MFT_RECORD *m;
+ INDEX_ROOT *ir = NULL;
+ INDEX_ENTRY *ie;
+ INDEX_ALLOCATION *ia;
+ u8 *name = NULL;
+ int rc, err, ir_pos, cur_bmp_pos;
+ struct address_space *ia_mapping, *bmp_mapping;
+ struct page *bmp_page = NULL, *ia_page = NULL;
+ u8 *kaddr, *bmp, *index_end;
+ ntfs_attr_search_ctx *ctx;
+
+ ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
+ vdir->i_ino, actor->pos);
+ rc = err = 0;
+ /* Are we at end of dir yet? */
+ i_size = i_size_read(vdir);
+ if (actor->pos >= i_size + vol->mft_record_size)
+ return 0;
+ /* Emulate . and .. for all directories. */
+ if (!dir_emit_dots(file, actor))
+ return 0;
+ m = NULL;
+ ctx = NULL;
+ /*
+ * Allocate a buffer to store the current name being processed
+ * converted to format determined by current NLS.
+ */
+ name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS);
+ if (unlikely(!name)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ /* Are we jumping straight into the index allocation attribute? */
+ if (actor->pos >= vol->mft_record_size)
+ goto skip_index_root;
+ /* Get hold of the mft record for the directory. */
+ m = map_mft_record(ndir);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(ndir, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ /* Get the offset into the index root attribute. */
+ ir_pos = (s64)actor->pos;
+ /* Find the index root attribute in the mft record. */
+ err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
+ 0, ctx);
+ if (unlikely(err)) {
+ ntfs_error(sb, "Index root attribute missing in directory "
+ "inode 0x%lx.", vdir->i_ino);
+ goto err_out;
+ }
+ /*
+ * Copy the index root attribute value to a buffer so that we can put
+ * the search context and unmap the mft record before calling the
+ * filldir() callback. We need to do this because of NFSd which calls
+ * ->lookup() from its filldir callback() and this causes NTFS to
+ * deadlock as ntfs_lookup() maps the mft record of the directory and
+ * we have got it mapped here already. The only solution is for us to
+ * unmap the mft record here so that a call to ntfs_lookup() is able to
+ * map the mft record without deadlocking.
+ */
+ rc = le32_to_cpu(ctx->attr->data.resident.value_length);
+ ir = kmalloc(rc, GFP_NOFS);
+ if (unlikely(!ir)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ /* Copy the index root value (it has been verified in read_inode). */
+ memcpy(ir, (u8*)ctx->attr +
+ le16_to_cpu(ctx->attr->data.resident.value_offset), rc);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(ndir);
+ ctx = NULL;
+ m = NULL;
+ index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+ /* The first index entry. */
+ ie = (INDEX_ENTRY*)((u8*)&ir->index +
+ le32_to_cpu(ir->index.entries_offset));
+ /*
+ * Loop until we exceed valid memory (corruption case) or until we
+ * reach the last entry or until filldir tells us it has had enough
+ * or signals an error (both covered by the rc test).
+ */
+ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+ ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir);
+ /* Bounds checks. */
+ if (unlikely((u8*)ie < (u8*)ir || (u8*)ie +
+ sizeof(INDEX_ENTRY_HEADER) > index_end ||
+ (u8*)ie + le16_to_cpu(ie->key_length) >
+ index_end))
+ goto err_out;
+ /* The last entry cannot contain a name. */
+ if (ie->flags & INDEX_ENTRY_END)
+ break;
+ /* Skip index root entry if continuing previous readdir. */
+ if (ir_pos > (u8*)ie - (u8*)ir)
+ continue;
+ /* Advance the position even if going to skip the entry. */
+ actor->pos = (u8*)ie - (u8*)ir;
+ /* Submit the name to the filldir callback. */
+ rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
+ if (rc) {
+ kfree(ir);
+ goto abort;
+ }
+ }
+ /* We are done with the index root and can free the buffer. */
+ kfree(ir);
+ ir = NULL;
+ /* If there is no index allocation attribute we are finished. */
+ if (!NInoIndexAllocPresent(ndir))
+ goto EOD;
+ /* Advance fpos to the beginning of the index allocation. */
+ actor->pos = vol->mft_record_size;
+skip_index_root:
+ kaddr = NULL;
+ prev_ia_pos = -1LL;
+ /* Get the offset into the index allocation attribute. */
+ ia_pos = (s64)actor->pos - vol->mft_record_size;
+ ia_mapping = vdir->i_mapping;
+ ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
+ bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
+ if (IS_ERR(bmp_vi)) {
+ ntfs_error(sb, "Failed to get bitmap attribute.");
+ err = PTR_ERR(bmp_vi);
+ goto err_out;
+ }
+ bmp_mapping = bmp_vi->i_mapping;
+ /* Get the starting bitmap bit position and sanity check it. */
+ bmp_pos = ia_pos >> ndir->itype.index.block_size_bits;
+ if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) {
+ ntfs_error(sb, "Current index allocation position exceeds "
+ "index bitmap size.");
+ goto iput_err_out;
+ }
+ /* Get the starting bit position in the current bitmap page. */
+ cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1);
+ bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1);
+get_next_bmp_page:
+ ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
+ (unsigned long long)bmp_pos >> (3 + PAGE_SHIFT),
+ (unsigned long long)bmp_pos &
+ (unsigned long long)((PAGE_SIZE * 8) - 1));
+ bmp_page = ntfs_map_page(bmp_mapping,
+ bmp_pos >> (3 + PAGE_SHIFT));
+ if (IS_ERR(bmp_page)) {
+ ntfs_error(sb, "Reading index bitmap failed.");
+ err = PTR_ERR(bmp_page);
+ bmp_page = NULL;
+ goto iput_err_out;
+ }
+ bmp = (u8*)page_address(bmp_page);
+ /* Find next index block in use. */
+ while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) {
+find_next_index_buffer:
+ cur_bmp_pos++;
+ /*
+ * If we have reached the end of the bitmap page, get the next
+ * page, and put away the old one.
+ */
+ if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) {
+ ntfs_unmap_page(bmp_page);
+ bmp_pos += PAGE_SIZE * 8;
+ cur_bmp_pos = 0;
+ goto get_next_bmp_page;
+ }
+ /* If we have reached the end of the bitmap, we are done. */
+ if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size))
+ goto unm_EOD;
+ ia_pos = (bmp_pos + cur_bmp_pos) <<
+ ndir->itype.index.block_size_bits;
+ }
+ ntfs_debug("Handling index buffer 0x%llx.",
+ (unsigned long long)bmp_pos + cur_bmp_pos);
+ /* If the current index buffer is in the same page we reuse the page. */
+ if ((prev_ia_pos & (s64)PAGE_MASK) !=
+ (ia_pos & (s64)PAGE_MASK)) {
+ prev_ia_pos = ia_pos;
+ if (likely(ia_page != NULL)) {
+ unlock_page(ia_page);
+ ntfs_unmap_page(ia_page);
+ }
+ /*
+ * Map the page cache page containing the current ia_pos,
+ * reading it from disk if necessary.
+ */
+ ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT);
+ if (IS_ERR(ia_page)) {
+ ntfs_error(sb, "Reading index allocation data failed.");
+ err = PTR_ERR(ia_page);
+ ia_page = NULL;
+ goto err_out;
+ }
+ lock_page(ia_page);
+ kaddr = (u8*)page_address(ia_page);
+ }
+ /* Get the current index buffer. */
+ ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK &
+ ~(s64)(ndir->itype.index.block_size - 1)));
+ /* Bounds checks. */
+ if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) {
+ ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
+ "inode 0x%lx or driver bug.", vdir->i_ino);
+ goto err_out;
+ }
+ /* Catch multi sector transfer fixup errors. */
+ if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+ ntfs_error(sb, "Directory index record with vcn 0x%llx is "
+ "corrupt. Corrupt inode 0x%lx. Run chkdsk.",
+ (unsigned long long)ia_pos >>
+ ndir->itype.index.vcn_size_bits, vdir->i_ino);
+ goto err_out;
+ }
+ if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos &
+ ~(s64)(ndir->itype.index.block_size - 1)) >>
+ ndir->itype.index.vcn_size_bits)) {
+ ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+ "different from expected VCN (0x%llx). "
+ "Directory inode 0x%lx is corrupt or driver "
+ "bug. ", (unsigned long long)
+ sle64_to_cpu(ia->index_block_vcn),
+ (unsigned long long)ia_pos >>
+ ndir->itype.index.vcn_size_bits, vdir->i_ino);
+ goto err_out;
+ }
+ if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+ ndir->itype.index.block_size)) {
+ ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+ "0x%lx has a size (%u) differing from the "
+ "directory specified size (%u). Directory "
+ "inode is corrupt or driver bug.",
+ (unsigned long long)ia_pos >>
+ ndir->itype.index.vcn_size_bits, vdir->i_ino,
+ le32_to_cpu(ia->index.allocated_size) + 0x18,
+ ndir->itype.index.block_size);
+ goto err_out;
+ }
+ index_end = (u8*)ia + ndir->itype.index.block_size;
+ if (unlikely(index_end > kaddr + PAGE_SIZE)) {
+ ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
+ "0x%lx crosses page boundary. Impossible! "
+ "Cannot access! This is probably a bug in the "
+ "driver.", (unsigned long long)ia_pos >>
+ ndir->itype.index.vcn_size_bits, vdir->i_ino);
+ goto err_out;
+ }
+ ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1);
+ index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+ if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) {
+ ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
+ "inode 0x%lx exceeds maximum size.",
+ (unsigned long long)ia_pos >>
+ ndir->itype.index.vcn_size_bits, vdir->i_ino);
+ goto err_out;
+ }
+ /* The first index entry in this index buffer. */
+ ie = (INDEX_ENTRY*)((u8*)&ia->index +
+ le32_to_cpu(ia->index.entries_offset));
+ /*
+ * Loop until we exceed valid memory (corruption case) or until we
+ * reach the last entry or until filldir tells us it has had enough
+ * or signals an error (both covered by the rc test).
+ */
+ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+ ntfs_debug("In index allocation, offset 0x%llx.",
+ (unsigned long long)ia_start +
+ (unsigned long long)((u8*)ie - (u8*)ia));
+ /* Bounds checks. */
+ if (unlikely((u8*)ie < (u8*)ia || (u8*)ie +
+ sizeof(INDEX_ENTRY_HEADER) > index_end ||
+ (u8*)ie + le16_to_cpu(ie->key_length) >
+ index_end))
+ goto err_out;
+ /* The last entry cannot contain a name. */
+ if (ie->flags & INDEX_ENTRY_END)
+ break;
+ /* Skip index block entry if continuing previous readdir. */
+ if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
+ continue;
+ /* Advance the position even if going to skip the entry. */
+ actor->pos = (u8*)ie - (u8*)ia +
+ (sle64_to_cpu(ia->index_block_vcn) <<
+ ndir->itype.index.vcn_size_bits) +
+ vol->mft_record_size;
+ /*
+ * Submit the name to the @filldir callback. Note,
+ * ntfs_filldir() drops the lock on @ia_page but it retakes it
+ * before returning, unless a non-zero value is returned in
+ * which case the page is left unlocked.
+ */
+ rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
+ if (rc) {
+ /* @ia_page is already unlocked in this case. */
+ ntfs_unmap_page(ia_page);
+ ntfs_unmap_page(bmp_page);
+ iput(bmp_vi);
+ goto abort;
+ }
+ }
+ goto find_next_index_buffer;
+unm_EOD:
+ if (ia_page) {
+ unlock_page(ia_page);
+ ntfs_unmap_page(ia_page);
+ }
+ ntfs_unmap_page(bmp_page);
+ iput(bmp_vi);
+EOD:
+ /* We are finished, set fpos to EOD. */
+ actor->pos = i_size + vol->mft_record_size;
+abort:
+ kfree(name);
+ return 0;
+err_out:
+ if (bmp_page) {
+ ntfs_unmap_page(bmp_page);
+iput_err_out:
+ iput(bmp_vi);
+ }
+ if (ia_page) {
+ unlock_page(ia_page);
+ ntfs_unmap_page(ia_page);
+ }
+ kfree(ir);
+ kfree(name);
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(ndir);
+ if (!err)
+ err = -EIO;
+ ntfs_debug("Failed. Returning error code %i.", -err);
+ return err;
+}
+
+/**
+ * ntfs_dir_open - called when an inode is about to be opened
+ * @vi: inode to be opened
+ * @filp: file structure describing the inode
+ *
+ * Limit directory size to the page cache limit on architectures where unsigned
+ * long is 32-bits. This is the most we can do for now without overflowing the
+ * page cache page index. Doing it this way means we don't run into problems
+ * because of existing too large directories. It would be better to allow the
+ * user to read the accessible part of the directory but I doubt very much
+ * anyone is going to hit this check on a 32-bit architecture, so there is no
+ * point in adding the extra complexity required to support this.
+ *
+ * On 64-bit architectures, the check is hopefully optimized away by the
+ * compiler.
+ */
+static int ntfs_dir_open(struct inode *vi, struct file *filp)
+{
+ if (sizeof(unsigned long) < 8) {
+ if (i_size_read(vi) > MAX_LFS_FILESIZE)
+ return -EFBIG;
+ }
+ return 0;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_dir_fsync - sync a directory to disk
+ * @filp: directory to be synced
+ * @dentry: dentry describing the directory to sync
+ * @datasync: if non-zero only flush user data and not metadata
+ *
+ * Data integrity sync of a directory to disk. Used for fsync, fdatasync, and
+ * msync system calls. This function is based on file.c::ntfs_file_fsync().
+ *
+ * Write the mft record and all associated extent mft records as well as the
+ * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device.
+ *
+ * If @datasync is true, we do not wait on the inode(s) to be written out
+ * but we always wait on the page cache pages to be written out.
+ *
+ * Note: In the past @filp could be NULL so we ignore it as we don't need it
+ * anyway.
+ *
+ * Locking: Caller must hold i_mutex on the inode.
+ *
+ * TODO: We should probably also write all attribute/index inodes associated
+ * with this inode but since we have no simple way of getting to them we ignore
+ * this problem for now. We do write the $BITMAP attribute if it is present
+ * which is the important one for a directory so things are not too bad.
+ */
+static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
+ int datasync)
+{
+ struct inode *bmp_vi, *vi = filp->f_mapping->host;
+ int err, ret;
+ ntfs_attr na;
+
+ ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+
+ err = file_write_and_wait_range(filp, start, end);
+ if (err)
+ return err;
+ inode_lock(vi);
+
+ BUG_ON(!S_ISDIR(vi->i_mode));
+ /* If the bitmap attribute inode is in memory sync it, too. */
+ na.mft_no = vi->i_ino;
+ na.type = AT_BITMAP;
+ na.name = I30;
+ na.name_len = 4;
+ bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na);
+ if (bmp_vi) {
+ write_inode_now(bmp_vi, !datasync);
+ iput(bmp_vi);
+ }
+ ret = __ntfs_write_inode(vi, 1);
+ write_inode_now(vi, !datasync);
+ err = sync_blockdev(vi->i_sb->s_bdev);
+ if (unlikely(err && !ret))
+ ret = err;
+ if (likely(!ret))
+ ntfs_debug("Done.");
+ else
+ ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
+ "%u.", datasync ? "data" : "", vi->i_ino, -ret);
+ inode_unlock(vi);
+ return ret;
+}
+
+#endif /* NTFS_RW */
+
+const struct file_operations ntfs_dir_ops = {
+ .llseek = generic_file_llseek, /* Seek inside directory. */
+ .read = generic_read_dir, /* Return -EISDIR. */
+ .iterate = ntfs_readdir, /* Read directory contents. */
+#ifdef NTFS_RW
+ .fsync = ntfs_dir_fsync, /* Sync a directory to disk. */
+#endif /* NTFS_RW */
+ /*.ioctl = ,*/ /* Perform function on the
+ mounted filesystem. */
+ .open = ntfs_dir_open, /* Open directory. */
+};
diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h
new file mode 100644
index 000000000..0e326753d
--- /dev/null
+++ b/fs/ntfs/dir.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of
+ * the Linux-NTFS project.
+ *
+ * Copyright (c) 2002-2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_DIR_H
+#define _LINUX_NTFS_DIR_H
+
+#include "layout.h"
+#include "inode.h"
+#include "types.h"
+
+/*
+ * ntfs_name is used to return the file name to the caller of
+ * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup())
+ * to be able to deal with dcache aliasing issues.
+ */
+typedef struct {
+ MFT_REF mref;
+ FILE_NAME_TYPE_FLAGS type;
+ u8 len;
+ ntfschar name[0];
+} __attribute__ ((__packed__)) ntfs_name;
+
+/* The little endian Unicode string $I30 as a global constant. */
+extern ntfschar I30[5];
+
+extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni,
+ const ntfschar *uname, const int uname_len, ntfs_name **res);
+
+#endif /* _LINUX_NTFS_FS_DIR_H */
diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h
new file mode 100644
index 000000000..f30c139bf
--- /dev/null
+++ b/fs/ntfs/endian.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * endian.h - Defines for endianness handling in NTFS Linux kernel driver.
+ * Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_ENDIAN_H
+#define _LINUX_NTFS_ENDIAN_H
+
+#include <asm/byteorder.h>
+#include "types.h"
+
+/*
+ * Signed endianness conversion functions.
+ */
+
+static inline s16 sle16_to_cpu(sle16 x)
+{
+ return le16_to_cpu((__force le16)x);
+}
+
+static inline s32 sle32_to_cpu(sle32 x)
+{
+ return le32_to_cpu((__force le32)x);
+}
+
+static inline s64 sle64_to_cpu(sle64 x)
+{
+ return le64_to_cpu((__force le64)x);
+}
+
+static inline s16 sle16_to_cpup(sle16 *x)
+{
+ return le16_to_cpu(*(__force le16*)x);
+}
+
+static inline s32 sle32_to_cpup(sle32 *x)
+{
+ return le32_to_cpu(*(__force le32*)x);
+}
+
+static inline s64 sle64_to_cpup(sle64 *x)
+{
+ return le64_to_cpu(*(__force le64*)x);
+}
+
+static inline sle16 cpu_to_sle16(s16 x)
+{
+ return (__force sle16)cpu_to_le16(x);
+}
+
+static inline sle32 cpu_to_sle32(s32 x)
+{
+ return (__force sle32)cpu_to_le32(x);
+}
+
+static inline sle64 cpu_to_sle64(s64 x)
+{
+ return (__force sle64)cpu_to_le64(x);
+}
+
+static inline sle16 cpu_to_sle16p(s16 *x)
+{
+ return (__force sle16)cpu_to_le16(*x);
+}
+
+static inline sle32 cpu_to_sle32p(s32 *x)
+{
+ return (__force sle32)cpu_to_le32(*x);
+}
+
+static inline sle64 cpu_to_sle64p(s64 *x)
+{
+ return (__force sle64)cpu_to_le64(*x);
+}
+
+#endif /* _LINUX_NTFS_ENDIAN_H */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
new file mode 100644
index 000000000..c481b14e4
--- /dev/null
+++ b/fs/ntfs/file.c
@@ -0,0 +1,2006 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * file.c - NTFS kernel file operations. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/pagevec.h>
+#include <linux/sched/signal.h>
+#include <linux/swap.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+
+#include <asm/page.h>
+#include <linux/uaccess.h>
+
+#include "attrib.h"
+#include "bitmap.h"
+#include "inode.h"
+#include "debug.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_file_open - called when an inode is about to be opened
+ * @vi: inode to be opened
+ * @filp: file structure describing the inode
+ *
+ * Limit file size to the page cache limit on architectures where unsigned long
+ * is 32-bits. This is the most we can do for now without overflowing the page
+ * cache page index. Doing it this way means we don't run into problems because
+ * of existing too large files. It would be better to allow the user to read
+ * the beginning of the file but I doubt very much anyone is going to hit this
+ * check on a 32-bit architecture, so there is no point in adding the extra
+ * complexity required to support this.
+ *
+ * On 64-bit architectures, the check is hopefully optimized away by the
+ * compiler.
+ *
+ * After the check passes, just call generic_file_open() to do its work.
+ */
+static int ntfs_file_open(struct inode *vi, struct file *filp)
+{
+ if (sizeof(unsigned long) < 8) {
+ if (i_size_read(vi) > MAX_LFS_FILESIZE)
+ return -EOVERFLOW;
+ }
+ return generic_file_open(vi, filp);
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_attr_extend_initialized - extend the initialized size of an attribute
+ * @ni: ntfs inode of the attribute to extend
+ * @new_init_size: requested new initialized size in bytes
+ *
+ * Extend the initialized size of an attribute described by the ntfs inode @ni
+ * to @new_init_size bytes. This involves zeroing any non-sparse space between
+ * the old initialized size and @new_init_size both in the page cache and on
+ * disk (if relevant complete pages are already uptodate in the page cache then
+ * these are simply marked dirty).
+ *
+ * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
+ * in the resident attribute case, it is tied to the initialized size and, in
+ * the non-resident attribute case, it may not fall below the initialized size.
+ *
+ * Note that if the attribute is resident, we do not need to touch the page
+ * cache at all. This is because if the page cache page is not uptodate we
+ * bring it uptodate later, when doing the write to the mft record since we
+ * then already have the page mapped. And if the page is uptodate, the
+ * non-initialized region will already have been zeroed when the page was
+ * brought uptodate and the region may in fact already have been overwritten
+ * with new data via mmap() based writes, so we cannot just zero it. And since
+ * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
+ * is unspecified, we choose not to do zeroing and thus we do not need to touch
+ * the page at all. For a more detailed explanation see ntfs_truncate() in
+ * fs/ntfs/inode.c.
+ *
+ * Return 0 on success and -errno on error. In the case that an error is
+ * encountered it is possible that the initialized size will already have been
+ * incremented some way towards @new_init_size but it is guaranteed that if
+ * this is the case, the necessary zeroing will also have happened and that all
+ * metadata is self-consistent.
+ *
+ * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
+ * held by the caller.
+ */
+static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
+{
+ s64 old_init_size;
+ loff_t old_i_size;
+ pgoff_t index, end_index;
+ unsigned long flags;
+ struct inode *vi = VFS_I(ni);
+ ntfs_inode *base_ni;
+ MFT_RECORD *m = NULL;
+ ATTR_RECORD *a;
+ ntfs_attr_search_ctx *ctx = NULL;
+ struct address_space *mapping;
+ struct page *page = NULL;
+ u8 *kattr;
+ int err;
+ u32 attr_len;
+
+ read_lock_irqsave(&ni->size_lock, flags);
+ old_init_size = ni->initialized_size;
+ old_i_size = i_size_read(vi);
+ BUG_ON(new_init_size > ni->allocated_size);
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
+ "old_initialized_size 0x%llx, "
+ "new_initialized_size 0x%llx, i_size 0x%llx.",
+ vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)old_init_size,
+ (unsigned long long)new_init_size, old_i_size);
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ /* Use goto to reduce indentation and we need the label below anyway. */
+ if (NInoNonResident(ni))
+ goto do_non_resident_extend;
+ BUG_ON(old_init_size != old_i_size);
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ BUG_ON(a->non_resident);
+ /* The total length of the attribute value. */
+ attr_len = le32_to_cpu(a->data.resident.value_length);
+ BUG_ON(old_i_size != (loff_t)attr_len);
+ /*
+ * Do the zeroing in the mft record and update the attribute size in
+ * the mft record.
+ */
+ kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
+ memset(kattr + attr_len, 0, new_init_size - attr_len);
+ a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
+ /* Finally, update the sizes in the vfs and ntfs inodes. */
+ write_lock_irqsave(&ni->size_lock, flags);
+ i_size_write(vi, new_init_size);
+ ni->initialized_size = new_init_size;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ goto done;
+do_non_resident_extend:
+ /*
+ * If the new initialized size @new_init_size exceeds the current file
+ * size (vfs inode->i_size), we need to extend the file size to the
+ * new initialized size.
+ */
+ if (new_init_size > old_i_size) {
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ BUG_ON(!a->non_resident);
+ BUG_ON(old_i_size != (loff_t)
+ sle64_to_cpu(a->data.non_resident.data_size));
+ a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ /* Update the file size in the vfs inode. */
+ i_size_write(vi, new_init_size);
+ ntfs_attr_put_search_ctx(ctx);
+ ctx = NULL;
+ unmap_mft_record(base_ni);
+ m = NULL;
+ }
+ mapping = vi->i_mapping;
+ index = old_init_size >> PAGE_SHIFT;
+ end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ do {
+ /*
+ * Read the page. If the page is not present, this will zero
+ * the uninitialized regions for us.
+ */
+ page = read_mapping_page(mapping, index, NULL);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto init_err_out;
+ }
+ /*
+ * Update the initialized size in the ntfs inode. This is
+ * enough to make ntfs_writepage() work.
+ */
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
+ if (ni->initialized_size > new_init_size)
+ ni->initialized_size = new_init_size;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ /* Set the page dirty so it gets written out. */
+ set_page_dirty(page);
+ put_page(page);
+ /*
+ * Play nice with the vm and the rest of the system. This is
+ * very much needed as we can potentially be modifying the
+ * initialised size from a very small value to a really huge
+ * value, e.g.
+ * f = open(somefile, O_TRUNC);
+ * truncate(f, 10GiB);
+ * seek(f, 10GiB);
+ * write(f, 1);
+ * And this would mean we would be marking dirty hundreds of
+ * thousands of pages or as in the above example more than
+ * two and a half million pages!
+ *
+ * TODO: For sparse pages could optimize this workload by using
+ * the FsMisc / MiscFs page bit as a "PageIsSparse" bit. This
+ * would be set in read_folio for sparse pages and here we would
+ * not need to mark dirty any pages which have this bit set.
+ * The only caveat is that we have to clear the bit everywhere
+ * where we allocate any clusters that lie in the page or that
+ * contain the page.
+ *
+ * TODO: An even greater optimization would be for us to only
+ * call read_folio() on pages which are not in sparse regions as
+ * determined from the runlist. This would greatly reduce the
+ * number of pages we read and make dirty in the case of sparse
+ * files.
+ */
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ } while (++index < end_index);
+ read_lock_irqsave(&ni->size_lock, flags);
+ BUG_ON(ni->initialized_size != new_init_size);
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /* Now bring in sync the initialized_size in the mft record. */
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ goto init_err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto init_err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto init_err_out;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ BUG_ON(!a->non_resident);
+ a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
+done:
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
+ (unsigned long long)new_init_size, i_size_read(vi));
+ return 0;
+init_err_out:
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->initialized_size = old_init_size;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ ntfs_debug("Failed. Returning error code %i.", err);
+ return err;
+}
+
+static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ loff_t pos;
+ s64 end, ll;
+ ssize_t err;
+ unsigned long flags;
+ struct file *file = iocb->ki_filp;
+ struct inode *vi = file_inode(file);
+ ntfs_inode *ni = NTFS_I(vi);
+ ntfs_volume *vol = ni->vol;
+
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
+ "0x%llx, count 0x%zx.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)iocb->ki_pos,
+ iov_iter_count(from));
+ err = generic_write_checks(iocb, from);
+ if (unlikely(err <= 0))
+ goto out;
+ /*
+ * All checks have passed. Before we start doing any writing we want
+ * to abort any totally illegal writes.
+ */
+ BUG_ON(NInoMstProtected(ni));
+ BUG_ON(ni->type != AT_DATA);
+ /* If file is encrypted, deny access, just like NT4. */
+ if (NInoEncrypted(ni)) {
+ /* Only $DATA attributes can be encrypted. */
+ /*
+ * Reminder for later: Encrypted files are _always_
+ * non-resident so that the content can always be encrypted.
+ */
+ ntfs_debug("Denying write access to encrypted file.");
+ err = -EACCES;
+ goto out;
+ }
+ if (NInoCompressed(ni)) {
+ /* Only unnamed $DATA attribute can be compressed. */
+ BUG_ON(ni->name_len);
+ /*
+ * Reminder for later: If resident, the data is not actually
+ * compressed. Only on the switch to non-resident does
+ * compression kick in. This is in contrast to encrypted files
+ * (see above).
+ */
+ ntfs_error(vi->i_sb, "Writing to compressed files is not "
+ "implemented yet. Sorry.");
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ err = file_remove_privs(file);
+ if (unlikely(err))
+ goto out;
+ /*
+ * Our ->update_time method always succeeds thus file_update_time()
+ * cannot fail either so there is no need to check the return code.
+ */
+ file_update_time(file);
+ pos = iocb->ki_pos;
+ /* The first byte after the last cluster being written to. */
+ end = (pos + iov_iter_count(from) + vol->cluster_size_mask) &
+ ~(u64)vol->cluster_size_mask;
+ /*
+ * If the write goes beyond the allocated size, extend the allocation
+ * to cover the whole of the write, rounded up to the nearest cluster.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (end > ll) {
+ /*
+ * Extend the allocation without changing the data size.
+ *
+ * Note we ensure the allocation is big enough to at least
+ * write some data but we do not require the allocation to be
+ * complete, i.e. it may be partial.
+ */
+ ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
+ if (likely(ll >= 0)) {
+ BUG_ON(pos >= ll);
+ /* If the extension was partial truncate the write. */
+ if (end > ll) {
+ ntfs_debug("Truncating write to inode 0x%lx, "
+ "attribute type 0x%x, because "
+ "the allocation was only "
+ "partially extended.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ iov_iter_truncate(from, ll - pos);
+ }
+ } else {
+ err = ll;
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /* Perform a partial write if possible or fail. */
+ if (pos < ll) {
+ ntfs_debug("Truncating write to inode 0x%lx "
+ "attribute type 0x%x, because "
+ "extending the allocation "
+ "failed (error %d).",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type),
+ (int)-err);
+ iov_iter_truncate(from, ll - pos);
+ } else {
+ if (err != -ENOSPC)
+ ntfs_error(vi->i_sb, "Cannot perform "
+ "write to inode "
+ "0x%lx, attribute "
+ "type 0x%x, because "
+ "extending the "
+ "allocation failed "
+ "(error %ld).",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type),
+ (long)-err);
+ else
+ ntfs_debug("Cannot perform write to "
+ "inode 0x%lx, "
+ "attribute type 0x%x, "
+ "because there is not "
+ "space left.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ goto out;
+ }
+ }
+ }
+ /*
+ * If the write starts beyond the initialized size, extend it up to the
+ * beginning of the write and initialize all non-sparse space between
+ * the old initialized size and the new one. This automatically also
+ * increments the vfs inode->i_size to keep it above or equal to the
+ * initialized_size.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ ll = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (pos > ll) {
+ /*
+ * Wait for ongoing direct i/o to complete before proceeding.
+ * New direct i/o cannot start as we hold i_mutex.
+ */
+ inode_dio_wait(vi);
+ err = ntfs_attr_extend_initialized(ni, pos);
+ if (unlikely(err < 0))
+ ntfs_error(vi->i_sb, "Cannot perform write to inode "
+ "0x%lx, attribute type 0x%x, because "
+ "extending the initialized size "
+ "failed (error %d).", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (int)-err);
+ }
+out:
+ return err;
+}
+
+/**
+ * __ntfs_grab_cache_pages - obtain a number of locked pages
+ * @mapping: address space mapping from which to obtain page cache pages
+ * @index: starting index in @mapping at which to begin obtaining pages
+ * @nr_pages: number of page cache pages to obtain
+ * @pages: array of pages in which to return the obtained page cache pages
+ * @cached_page: allocated but as yet unused page
+ *
+ * Obtain @nr_pages locked page cache pages from the mapping @mapping and
+ * starting at index @index.
+ *
+ * If a page is newly created, add it to lru list
+ *
+ * Note, the page locks are obtained in ascending page index order.
+ */
+static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
+ pgoff_t index, const unsigned nr_pages, struct page **pages,
+ struct page **cached_page)
+{
+ int err, nr;
+
+ BUG_ON(!nr_pages);
+ err = nr = 0;
+ do {
+ pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
+ FGP_ACCESSED);
+ if (!pages[nr]) {
+ if (!*cached_page) {
+ *cached_page = page_cache_alloc(mapping);
+ if (unlikely(!*cached_page)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ }
+ err = add_to_page_cache_lru(*cached_page, mapping,
+ index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
+ if (unlikely(err)) {
+ if (err == -EEXIST)
+ continue;
+ goto err_out;
+ }
+ pages[nr] = *cached_page;
+ *cached_page = NULL;
+ }
+ index++;
+ nr++;
+ } while (nr < nr_pages);
+out:
+ return err;
+err_out:
+ while (nr > 0) {
+ unlock_page(pages[--nr]);
+ put_page(pages[nr]);
+ }
+ goto out;
+}
+
+static inline void ntfs_submit_bh_for_read(struct buffer_head *bh)
+{
+ lock_buffer(bh);
+ get_bh(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ submit_bh(REQ_OP_READ, bh);
+}
+
+/**
+ * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
+ * @pages: array of destination pages
+ * @nr_pages: number of pages in @pages
+ * @pos: byte position in file at which the write begins
+ * @bytes: number of bytes to be written
+ *
+ * This is called for non-resident attributes from ntfs_file_buffered_write()
+ * with i_mutex held on the inode (@pages[0]->mapping->host). There are
+ * @nr_pages pages in @pages which are locked but not kmap()ped. The source
+ * data has not yet been copied into the @pages.
+ *
+ * Need to fill any holes with actual clusters, allocate buffers if necessary,
+ * ensure all the buffers are mapped, and bring uptodate any buffers that are
+ * only partially being written to.
+ *
+ * If @nr_pages is greater than one, we are guaranteed that the cluster size is
+ * greater than PAGE_SIZE, that all pages in @pages are entirely inside
+ * the same cluster and that they are the entirety of that cluster, and that
+ * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
+ *
+ * i_size is not to be modified yet.
+ *
+ * Return 0 on success or -errno on error.
+ */
+static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
+ unsigned nr_pages, s64 pos, size_t bytes)
+{
+ VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
+ LCN lcn;
+ s64 bh_pos, vcn_len, end, initialized_size;
+ sector_t lcn_block;
+ struct page *page;
+ struct inode *vi;
+ ntfs_inode *ni, *base_ni = NULL;
+ ntfs_volume *vol;
+ runlist_element *rl, *rl2;
+ struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
+ ntfs_attr_search_ctx *ctx = NULL;
+ MFT_RECORD *m = NULL;
+ ATTR_RECORD *a = NULL;
+ unsigned long flags;
+ u32 attr_rec_len = 0;
+ unsigned blocksize, u;
+ int err, mp_size;
+ bool rl_write_locked, was_hole, is_retry;
+ unsigned char blocksize_bits;
+ struct {
+ u8 runlist_merged:1;
+ u8 mft_attr_mapped:1;
+ u8 mp_rebuilt:1;
+ u8 attr_switched:1;
+ } status = { 0, 0, 0, 0 };
+
+ BUG_ON(!nr_pages);
+ BUG_ON(!pages);
+ BUG_ON(!*pages);
+ vi = pages[0]->mapping->host;
+ ni = NTFS_I(vi);
+ vol = ni->vol;
+ ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
+ "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
+ vi->i_ino, ni->type, pages[0]->index, nr_pages,
+ (long long)pos, bytes);
+ blocksize = vol->sb->s_blocksize;
+ blocksize_bits = vol->sb->s_blocksize_bits;
+ u = 0;
+ do {
+ page = pages[u];
+ BUG_ON(!page);
+ /*
+ * create_empty_buffers() will create uptodate/dirty buffers if
+ * the page is uptodate/dirty.
+ */
+ if (!page_has_buffers(page)) {
+ create_empty_buffers(page, blocksize, 0);
+ if (unlikely(!page_has_buffers(page)))
+ return -ENOMEM;
+ }
+ } while (++u < nr_pages);
+ rl_write_locked = false;
+ rl = NULL;
+ err = 0;
+ vcn = lcn = -1;
+ vcn_len = 0;
+ lcn_block = -1;
+ was_hole = false;
+ cpos = pos >> vol->cluster_size_bits;
+ end = pos + bytes;
+ cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
+ /*
+ * Loop over each page and for each page over each buffer. Use goto to
+ * reduce indentation.
+ */
+ u = 0;
+do_next_page:
+ page = pages[u];
+ bh_pos = (s64)page->index << PAGE_SHIFT;
+ bh = head = page_buffers(page);
+ do {
+ VCN cdelta;
+ s64 bh_end;
+ unsigned bh_cofs;
+
+ /* Clear buffer_new on all buffers to reinitialise state. */
+ if (buffer_new(bh))
+ clear_buffer_new(bh);
+ bh_end = bh_pos + blocksize;
+ bh_cpos = bh_pos >> vol->cluster_size_bits;
+ bh_cofs = bh_pos & vol->cluster_size_mask;
+ if (buffer_mapped(bh)) {
+ /*
+ * The buffer is already mapped. If it is uptodate,
+ * ignore it.
+ */
+ if (buffer_uptodate(bh))
+ continue;
+ /*
+ * The buffer is not uptodate. If the page is uptodate
+ * set the buffer uptodate and otherwise ignore it.
+ */
+ if (PageUptodate(page)) {
+ set_buffer_uptodate(bh);
+ continue;
+ }
+ /*
+ * Neither the page nor the buffer are uptodate. If
+ * the buffer is only partially being written to, we
+ * need to read it in before the write, i.e. now.
+ */
+ if ((bh_pos < pos && bh_end > pos) ||
+ (bh_pos < end && bh_end > end)) {
+ /*
+ * If the buffer is fully or partially within
+ * the initialized size, do an actual read.
+ * Otherwise, simply zero the buffer.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ initialized_size = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (bh_pos < initialized_size) {
+ ntfs_submit_bh_for_read(bh);
+ *wait_bh++ = bh;
+ } else {
+ zero_user(page, bh_offset(bh),
+ blocksize);
+ set_buffer_uptodate(bh);
+ }
+ }
+ continue;
+ }
+ /* Unmapped buffer. Need to map it. */
+ bh->b_bdev = vol->sb->s_bdev;
+ /*
+ * If the current buffer is in the same clusters as the map
+ * cache, there is no need to check the runlist again. The
+ * map cache is made up of @vcn, which is the first cached file
+ * cluster, @vcn_len which is the number of cached file
+ * clusters, @lcn is the device cluster corresponding to @vcn,
+ * and @lcn_block is the block number corresponding to @lcn.
+ */
+ cdelta = bh_cpos - vcn;
+ if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
+map_buffer_cached:
+ BUG_ON(lcn < 0);
+ bh->b_blocknr = lcn_block +
+ (cdelta << (vol->cluster_size_bits -
+ blocksize_bits)) +
+ (bh_cofs >> blocksize_bits);
+ set_buffer_mapped(bh);
+ /*
+ * If the page is uptodate so is the buffer. If the
+ * buffer is fully outside the write, we ignore it if
+ * it was already allocated and we mark it dirty so it
+ * gets written out if we allocated it. On the other
+ * hand, if we allocated the buffer but we are not
+ * marking it dirty we set buffer_new so we can do
+ * error recovery.
+ */
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ if (unlikely(was_hole)) {
+ /* We allocated the buffer. */
+ clean_bdev_bh_alias(bh);
+ if (bh_end <= pos || bh_pos >= end)
+ mark_buffer_dirty(bh);
+ else
+ set_buffer_new(bh);
+ }
+ continue;
+ }
+ /* Page is _not_ uptodate. */
+ if (likely(!was_hole)) {
+ /*
+ * Buffer was already allocated. If it is not
+ * uptodate and is only partially being written
+ * to, we need to read it in before the write,
+ * i.e. now.
+ */
+ if (!buffer_uptodate(bh) && bh_pos < end &&
+ bh_end > pos &&
+ (bh_pos < pos ||
+ bh_end > end)) {
+ /*
+ * If the buffer is fully or partially
+ * within the initialized size, do an
+ * actual read. Otherwise, simply zero
+ * the buffer.
+ */
+ read_lock_irqsave(&ni->size_lock,
+ flags);
+ initialized_size = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock,
+ flags);
+ if (bh_pos < initialized_size) {
+ ntfs_submit_bh_for_read(bh);
+ *wait_bh++ = bh;
+ } else {
+ zero_user(page, bh_offset(bh),
+ blocksize);
+ set_buffer_uptodate(bh);
+ }
+ }
+ continue;
+ }
+ /* We allocated the buffer. */
+ clean_bdev_bh_alias(bh);
+ /*
+ * If the buffer is fully outside the write, zero it,
+ * set it uptodate, and mark it dirty so it gets
+ * written out. If it is partially being written to,
+ * zero region surrounding the write but leave it to
+ * commit write to do anything else. Finally, if the
+ * buffer is fully being overwritten, do nothing.
+ */
+ if (bh_end <= pos || bh_pos >= end) {
+ if (!buffer_uptodate(bh)) {
+ zero_user(page, bh_offset(bh),
+ blocksize);
+ set_buffer_uptodate(bh);
+ }
+ mark_buffer_dirty(bh);
+ continue;
+ }
+ set_buffer_new(bh);
+ if (!buffer_uptodate(bh) &&
+ (bh_pos < pos || bh_end > end)) {
+ u8 *kaddr;
+ unsigned pofs;
+
+ kaddr = kmap_atomic(page);
+ if (bh_pos < pos) {
+ pofs = bh_pos & ~PAGE_MASK;
+ memset(kaddr + pofs, 0, pos - bh_pos);
+ }
+ if (bh_end > end) {
+ pofs = end & ~PAGE_MASK;
+ memset(kaddr + pofs, 0, bh_end - end);
+ }
+ kunmap_atomic(kaddr);
+ flush_dcache_page(page);
+ }
+ continue;
+ }
+ /*
+ * Slow path: this is the first buffer in the cluster. If it
+ * is outside allocated size and is not uptodate, zero it and
+ * set it uptodate.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ initialized_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (bh_pos > initialized_size) {
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ } else if (!buffer_uptodate(bh)) {
+ zero_user(page, bh_offset(bh), blocksize);
+ set_buffer_uptodate(bh);
+ }
+ continue;
+ }
+ is_retry = false;
+ if (!rl) {
+ down_read(&ni->runlist.lock);
+retry_remap:
+ rl = ni->runlist.rl;
+ }
+ if (likely(rl != NULL)) {
+ /* Seek to element containing target cluster. */
+ while (rl->length && rl[1].vcn <= bh_cpos)
+ rl++;
+ lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
+ if (likely(lcn >= 0)) {
+ /*
+ * Successful remap, setup the map cache and
+ * use that to deal with the buffer.
+ */
+ was_hole = false;
+ vcn = bh_cpos;
+ vcn_len = rl[1].vcn - vcn;
+ lcn_block = lcn << (vol->cluster_size_bits -
+ blocksize_bits);
+ cdelta = 0;
+ /*
+ * If the number of remaining clusters touched
+ * by the write is smaller or equal to the
+ * number of cached clusters, unlock the
+ * runlist as the map cache will be used from
+ * now on.
+ */
+ if (likely(vcn + vcn_len >= cend)) {
+ if (rl_write_locked) {
+ up_write(&ni->runlist.lock);
+ rl_write_locked = false;
+ } else
+ up_read(&ni->runlist.lock);
+ rl = NULL;
+ }
+ goto map_buffer_cached;
+ }
+ } else
+ lcn = LCN_RL_NOT_MAPPED;
+ /*
+ * If it is not a hole and not out of bounds, the runlist is
+ * probably unmapped so try to map it now.
+ */
+ if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
+ if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
+ /* Attempt to map runlist. */
+ if (!rl_write_locked) {
+ /*
+ * We need the runlist locked for
+ * writing, so if it is locked for
+ * reading relock it now and retry in
+ * case it changed whilst we dropped
+ * the lock.
+ */
+ up_read(&ni->runlist.lock);
+ down_write(&ni->runlist.lock);
+ rl_write_locked = true;
+ goto retry_remap;
+ }
+ err = ntfs_map_runlist_nolock(ni, bh_cpos,
+ NULL);
+ if (likely(!err)) {
+ is_retry = true;
+ goto retry_remap;
+ }
+ /*
+ * If @vcn is out of bounds, pretend @lcn is
+ * LCN_ENOENT. As long as the buffer is out
+ * of bounds this will work fine.
+ */
+ if (err == -ENOENT) {
+ lcn = LCN_ENOENT;
+ err = 0;
+ goto rl_not_mapped_enoent;
+ }
+ } else
+ err = -EIO;
+ /* Failed to map the buffer, even after retrying. */
+ bh->b_blocknr = -1;
+ ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
+ "attribute type 0x%x, vcn 0x%llx, "
+ "vcn offset 0x%x, because its "
+ "location on disk could not be "
+ "determined%s (error code %i).",
+ ni->mft_no, ni->type,
+ (unsigned long long)bh_cpos,
+ (unsigned)bh_pos &
+ vol->cluster_size_mask,
+ is_retry ? " even after retrying" : "",
+ err);
+ break;
+ }
+rl_not_mapped_enoent:
+ /*
+ * The buffer is in a hole or out of bounds. We need to fill
+ * the hole, unless the buffer is in a cluster which is not
+ * touched by the write, in which case we just leave the buffer
+ * unmapped. This can only happen when the cluster size is
+ * less than the page cache size.
+ */
+ if (unlikely(vol->cluster_size < PAGE_SIZE)) {
+ bh_cend = (bh_end + vol->cluster_size - 1) >>
+ vol->cluster_size_bits;
+ if ((bh_cend <= cpos || bh_cpos >= cend)) {
+ bh->b_blocknr = -1;
+ /*
+ * If the buffer is uptodate we skip it. If it
+ * is not but the page is uptodate, we can set
+ * the buffer uptodate. If the page is not
+ * uptodate, we can clear the buffer and set it
+ * uptodate. Whether this is worthwhile is
+ * debatable and this could be removed.
+ */
+ if (PageUptodate(page)) {
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ } else if (!buffer_uptodate(bh)) {
+ zero_user(page, bh_offset(bh),
+ blocksize);
+ set_buffer_uptodate(bh);
+ }
+ continue;
+ }
+ }
+ /*
+ * Out of bounds buffer is invalid if it was not really out of
+ * bounds.
+ */
+ BUG_ON(lcn != LCN_HOLE);
+ /*
+ * We need the runlist locked for writing, so if it is locked
+ * for reading relock it now and retry in case it changed
+ * whilst we dropped the lock.
+ */
+ BUG_ON(!rl);
+ if (!rl_write_locked) {
+ up_read(&ni->runlist.lock);
+ down_write(&ni->runlist.lock);
+ rl_write_locked = true;
+ goto retry_remap;
+ }
+ /* Find the previous last allocated cluster. */
+ BUG_ON(rl->lcn != LCN_HOLE);
+ lcn = -1;
+ rl2 = rl;
+ while (--rl2 >= ni->runlist.rl) {
+ if (rl2->lcn >= 0) {
+ lcn = rl2->lcn + rl2->length;
+ break;
+ }
+ }
+ rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
+ false);
+ if (IS_ERR(rl2)) {
+ err = PTR_ERR(rl2);
+ ntfs_debug("Failed to allocate cluster, error code %i.",
+ err);
+ break;
+ }
+ lcn = rl2->lcn;
+ rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
+ if (IS_ERR(rl)) {
+ err = PTR_ERR(rl);
+ if (err != -ENOMEM)
+ err = -EIO;
+ if (ntfs_cluster_free_from_rl(vol, rl2)) {
+ ntfs_error(vol->sb, "Failed to release "
+ "allocated cluster in error "
+ "code path. Run chkdsk to "
+ "recover the lost cluster.");
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rl2);
+ break;
+ }
+ ni->runlist.rl = rl;
+ status.runlist_merged = 1;
+ ntfs_debug("Allocated cluster, lcn 0x%llx.",
+ (unsigned long long)lcn);
+ /* Map and lock the mft record and get the attribute record. */
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ break;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ unmap_mft_record(base_ni);
+ break;
+ }
+ status.mft_attr_mapped = 1;
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ break;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ /*
+ * Find the runlist element with which the attribute extent
+ * starts. Note, we cannot use the _attr_ version because we
+ * have mapped the mft record. That is ok because we know the
+ * runlist fragment must be mapped already to have ever gotten
+ * here, so we can just use the _rl_ version.
+ */
+ vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+ rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
+ BUG_ON(!rl2);
+ BUG_ON(!rl2->length);
+ BUG_ON(rl2->lcn < LCN_HOLE);
+ highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+ /*
+ * If @highest_vcn is zero, calculate the real highest_vcn
+ * (which can really be zero).
+ */
+ if (!highest_vcn)
+ highest_vcn = (sle64_to_cpu(
+ a->data.non_resident.allocated_size) >>
+ vol->cluster_size_bits) - 1;
+ /*
+ * Determine the size of the mapping pairs array for the new
+ * extent, i.e. the old extent with the hole filled.
+ */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
+ highest_vcn);
+ if (unlikely(mp_size <= 0)) {
+ if (!(err = mp_size))
+ err = -EIO;
+ ntfs_debug("Failed to get size for mapping pairs "
+ "array, error code %i.", err);
+ break;
+ }
+ /*
+ * Resize the attribute record to fit the new mapping pairs
+ * array.
+ */
+ attr_rec_len = le32_to_cpu(a->length);
+ err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset));
+ if (unlikely(err)) {
+ BUG_ON(err != -ENOSPC);
+ // TODO: Deal with this by using the current attribute
+ // and fill it with as much of the mapping pairs
+ // array as possible. Then loop over each attribute
+ // extent rewriting the mapping pairs arrays as we go
+ // along and if when we reach the end we have not
+ // enough space, try to resize the last attribute
+ // extent and if even that fails, add a new attribute
+ // extent.
+ // We could also try to resize at each step in the hope
+ // that we will not need to rewrite every single extent.
+ // Note, we may need to decompress some extents to fill
+ // the runlist as we are walking the extents...
+ ntfs_error(vol->sb, "Not enough space in the mft "
+ "record for the extended attribute "
+ "record. This case is not "
+ "implemented yet.");
+ err = -EOPNOTSUPP;
+ break ;
+ }
+ status.mp_rebuilt = 1;
+ /*
+ * Generate the mapping pairs array directly into the attribute
+ * record.
+ */
+ err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset),
+ mp_size, rl2, vcn, highest_vcn, NULL);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
+ "attribute type 0x%x, because building "
+ "the mapping pairs failed with error "
+ "code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ err = -EIO;
+ break;
+ }
+ /* Update the highest_vcn but only if it was not set. */
+ if (unlikely(!a->data.non_resident.highest_vcn))
+ a->data.non_resident.highest_vcn =
+ cpu_to_sle64(highest_vcn);
+ /*
+ * If the attribute is sparse/compressed, update the compressed
+ * size in the ntfs_inode structure and the attribute record.
+ */
+ if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
+ /*
+ * If we are not in the first attribute extent, switch
+ * to it, but first ensure the changes will make it to
+ * disk later.
+ */
+ if (a->data.non_resident.lowest_vcn) {
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_reinit_search_ctx(ctx);
+ err = ntfs_attr_lookup(ni->type, ni->name,
+ ni->name_len, CASE_SENSITIVE,
+ 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ status.attr_switched = 1;
+ break;
+ }
+ /* @m is not used any more so do not set it. */
+ a = ctx->attr;
+ }
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->itype.compressed.size += vol->cluster_size;
+ a->data.non_resident.compressed_size =
+ cpu_to_sle64(ni->itype.compressed.size);
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ }
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ /* Successfully filled the hole. */
+ status.runlist_merged = 0;
+ status.mft_attr_mapped = 0;
+ status.mp_rebuilt = 0;
+ /* Setup the map cache and use that to deal with the buffer. */
+ was_hole = true;
+ vcn = bh_cpos;
+ vcn_len = 1;
+ lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
+ cdelta = 0;
+ /*
+ * If the number of remaining clusters in the @pages is smaller
+ * or equal to the number of cached clusters, unlock the
+ * runlist as the map cache will be used from now on.
+ */
+ if (likely(vcn + vcn_len >= cend)) {
+ up_write(&ni->runlist.lock);
+ rl_write_locked = false;
+ rl = NULL;
+ }
+ goto map_buffer_cached;
+ } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
+ /* If there are no errors, do the next page. */
+ if (likely(!err && ++u < nr_pages))
+ goto do_next_page;
+ /* If there are no errors, release the runlist lock if we took it. */
+ if (likely(!err)) {
+ if (unlikely(rl_write_locked)) {
+ up_write(&ni->runlist.lock);
+ rl_write_locked = false;
+ } else if (unlikely(rl))
+ up_read(&ni->runlist.lock);
+ rl = NULL;
+ }
+ /* If we issued read requests, let them complete. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ initialized_size = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ while (wait_bh > wait) {
+ bh = *--wait_bh;
+ wait_on_buffer(bh);
+ if (likely(buffer_uptodate(bh))) {
+ page = bh->b_page;
+ bh_pos = ((s64)page->index << PAGE_SHIFT) +
+ bh_offset(bh);
+ /*
+ * If the buffer overflows the initialized size, need
+ * to zero the overflowing region.
+ */
+ if (unlikely(bh_pos + blocksize > initialized_size)) {
+ int ofs = 0;
+
+ if (likely(bh_pos < initialized_size))
+ ofs = initialized_size - bh_pos;
+ zero_user_segment(page, bh_offset(bh) + ofs,
+ blocksize);
+ }
+ } else /* if (unlikely(!buffer_uptodate(bh))) */
+ err = -EIO;
+ }
+ if (likely(!err)) {
+ /* Clear buffer_new on all buffers. */
+ u = 0;
+ do {
+ bh = head = page_buffers(pages[u]);
+ do {
+ if (buffer_new(bh))
+ clear_buffer_new(bh);
+ } while ((bh = bh->b_this_page) != head);
+ } while (++u < nr_pages);
+ ntfs_debug("Done.");
+ return err;
+ }
+ if (status.attr_switched) {
+ /* Get back to the attribute extent we modified. */
+ ntfs_attr_reinit_search_ctx(ctx);
+ if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
+ ntfs_error(vol->sb, "Failed to find required "
+ "attribute extent of attribute in "
+ "error code path. Run chkdsk to "
+ "recover.");
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->itype.compressed.size += vol->cluster_size;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ /*
+ * The only thing that is now wrong is the compressed
+ * size of the base attribute extent which chkdsk
+ * should be able to fix.
+ */
+ NVolSetErrors(vol);
+ } else {
+ m = ctx->mrec;
+ a = ctx->attr;
+ status.attr_switched = 0;
+ }
+ }
+ /*
+ * If the runlist has been modified, need to restore it by punching a
+ * hole into it and we then need to deallocate the on-disk cluster as
+ * well. Note, we only modify the runlist if we are able to generate a
+ * new mapping pairs array, i.e. only when the mapped attribute extent
+ * is not switched.
+ */
+ if (status.runlist_merged && !status.attr_switched) {
+ BUG_ON(!rl_write_locked);
+ /* Make the file cluster we allocated sparse in the runlist. */
+ if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
+ ntfs_error(vol->sb, "Failed to punch hole into "
+ "attribute runlist in error code "
+ "path. Run chkdsk to recover the "
+ "lost cluster.");
+ NVolSetErrors(vol);
+ } else /* if (success) */ {
+ status.runlist_merged = 0;
+ /*
+ * Deallocate the on-disk cluster we allocated but only
+ * if we succeeded in punching its vcn out of the
+ * runlist.
+ */
+ down_write(&vol->lcnbmp_lock);
+ if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
+ ntfs_error(vol->sb, "Failed to release "
+ "allocated cluster in error "
+ "code path. Run chkdsk to "
+ "recover the lost cluster.");
+ NVolSetErrors(vol);
+ }
+ up_write(&vol->lcnbmp_lock);
+ }
+ }
+ /*
+ * Resize the attribute record to its old size and rebuild the mapping
+ * pairs array. Note, we only can do this if the runlist has been
+ * restored to its old state which also implies that the mapped
+ * attribute extent is not switched.
+ */
+ if (status.mp_rebuilt && !status.runlist_merged) {
+ if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
+ ntfs_error(vol->sb, "Failed to restore attribute "
+ "record in error code path. Run "
+ "chkdsk to recover.");
+ NVolSetErrors(vol);
+ } else /* if (success) */ {
+ if (ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.
+ mapping_pairs_offset), attr_rec_len -
+ le16_to_cpu(a->data.non_resident.
+ mapping_pairs_offset), ni->runlist.rl,
+ vcn, highest_vcn, NULL)) {
+ ntfs_error(vol->sb, "Failed to restore "
+ "mapping pairs array in error "
+ "code path. Run chkdsk to "
+ "recover.");
+ NVolSetErrors(vol);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ }
+ }
+ /* Release the mft record and the attribute. */
+ if (status.mft_attr_mapped) {
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ }
+ /* Release the runlist lock. */
+ if (rl_write_locked)
+ up_write(&ni->runlist.lock);
+ else if (rl)
+ up_read(&ni->runlist.lock);
+ /*
+ * Zero out any newly allocated blocks to avoid exposing stale data.
+ * If BH_New is set, we know that the block was newly allocated above
+ * and that it has not been fully zeroed and marked dirty yet.
+ */
+ nr_pages = u;
+ u = 0;
+ end = bh_cpos << vol->cluster_size_bits;
+ do {
+ page = pages[u];
+ bh = head = page_buffers(page);
+ do {
+ if (u == nr_pages &&
+ ((s64)page->index << PAGE_SHIFT) +
+ bh_offset(bh) >= end)
+ break;
+ if (!buffer_new(bh))
+ continue;
+ clear_buffer_new(bh);
+ if (!buffer_uptodate(bh)) {
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+ else {
+ zero_user(page, bh_offset(bh),
+ blocksize);
+ set_buffer_uptodate(bh);
+ }
+ }
+ mark_buffer_dirty(bh);
+ } while ((bh = bh->b_this_page) != head);
+ } while (++u <= nr_pages);
+ ntfs_error(vol->sb, "Failed. Returning error code %i.", err);
+ return err;
+}
+
+static inline void ntfs_flush_dcache_pages(struct page **pages,
+ unsigned nr_pages)
+{
+ BUG_ON(!nr_pages);
+ /*
+ * Warning: Do not do the decrement at the same time as the call to
+ * flush_dcache_page() because it is a NULL macro on i386 and hence the
+ * decrement never happens so the loop never terminates.
+ */
+ do {
+ --nr_pages;
+ flush_dcache_page(pages[nr_pages]);
+ } while (nr_pages > 0);
+}
+
+/**
+ * ntfs_commit_pages_after_non_resident_write - commit the received data
+ * @pages: array of destination pages
+ * @nr_pages: number of pages in @pages
+ * @pos: byte position in file at which the write begins
+ * @bytes: number of bytes to be written
+ *
+ * See description of ntfs_commit_pages_after_write(), below.
+ */
+static inline int ntfs_commit_pages_after_non_resident_write(
+ struct page **pages, const unsigned nr_pages,
+ s64 pos, size_t bytes)
+{
+ s64 end, initialized_size;
+ struct inode *vi;
+ ntfs_inode *ni, *base_ni;
+ struct buffer_head *bh, *head;
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ unsigned long flags;
+ unsigned blocksize, u;
+ int err;
+
+ vi = pages[0]->mapping->host;
+ ni = NTFS_I(vi);
+ blocksize = vi->i_sb->s_blocksize;
+ end = pos + bytes;
+ u = 0;
+ do {
+ s64 bh_pos;
+ struct page *page;
+ bool partial;
+
+ page = pages[u];
+ bh_pos = (s64)page->index << PAGE_SHIFT;
+ bh = head = page_buffers(page);
+ partial = false;
+ do {
+ s64 bh_end;
+
+ bh_end = bh_pos + blocksize;
+ if (bh_end <= pos || bh_pos >= end) {
+ if (!buffer_uptodate(bh))
+ partial = true;
+ } else {
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ }
+ } while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
+ /*
+ * If all buffers are now uptodate but the page is not, set the
+ * page uptodate.
+ */
+ if (!partial && !PageUptodate(page))
+ SetPageUptodate(page);
+ } while (++u < nr_pages);
+ /*
+ * Finally, if we do not need to update initialized_size or i_size we
+ * are finished.
+ */
+ read_lock_irqsave(&ni->size_lock, flags);
+ initialized_size = ni->initialized_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ if (end <= initialized_size) {
+ ntfs_debug("Done.");
+ return 0;
+ }
+ /*
+ * Update initialized_size/i_size as appropriate, both in the inode and
+ * the mft record.
+ */
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ /* Map, pin, and lock the mft record. */
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ ctx = NULL;
+ goto err_out;
+ }
+ BUG_ON(!NInoNonResident(ni));
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ a = ctx->attr;
+ BUG_ON(!a->non_resident);
+ write_lock_irqsave(&ni->size_lock, flags);
+ BUG_ON(end > ni->allocated_size);
+ ni->initialized_size = end;
+ a->data.non_resident.initialized_size = cpu_to_sle64(end);
+ if (end > i_size_read(vi)) {
+ i_size_write(vi, end);
+ a->data.non_resident.data_size =
+ a->data.non_resident.initialized_size;
+ }
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ /* Mark the mft record dirty, so it gets written back. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ ntfs_debug("Done.");
+ return 0;
+err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
+ "code %i).", err);
+ if (err != -ENOMEM)
+ NVolSetErrors(ni->vol);
+ return err;
+}
+
+/**
+ * ntfs_commit_pages_after_write - commit the received data
+ * @pages: array of destination pages
+ * @nr_pages: number of pages in @pages
+ * @pos: byte position in file at which the write begins
+ * @bytes: number of bytes to be written
+ *
+ * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
+ * (@pages[0]->mapping->host). There are @nr_pages pages in @pages which are
+ * locked but not kmap()ped. The source data has already been copied into the
+ * @page. ntfs_prepare_pages_for_non_resident_write() has been called before
+ * the data was copied (for non-resident attributes only) and it returned
+ * success.
+ *
+ * Need to set uptodate and mark dirty all buffers within the boundary of the
+ * write. If all buffers in a page are uptodate we set the page uptodate, too.
+ *
+ * Setting the buffers dirty ensures that they get written out later when
+ * ntfs_writepage() is invoked by the VM.
+ *
+ * Finally, we need to update i_size and initialized_size as appropriate both
+ * in the inode and the mft record.
+ *
+ * This is modelled after fs/buffer.c::generic_commit_write(), which marks
+ * buffers uptodate and dirty, sets the page uptodate if all buffers in the
+ * page are uptodate, and updates i_size if the end of io is beyond i_size. In
+ * that case, it also marks the inode dirty.
+ *
+ * If things have gone as outlined in
+ * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
+ * content modifications here for non-resident attributes. For resident
+ * attributes we need to do the uptodate bringing here which we combine with
+ * the copying into the mft record which means we save one atomic kmap.
+ *
+ * Return 0 on success or -errno on error.
+ */
+static int ntfs_commit_pages_after_write(struct page **pages,
+ const unsigned nr_pages, s64 pos, size_t bytes)
+{
+ s64 end, initialized_size;
+ loff_t i_size;
+ struct inode *vi;
+ ntfs_inode *ni, *base_ni;
+ struct page *page;
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ char *kattr, *kaddr;
+ unsigned long flags;
+ u32 attr_len;
+ int err;
+
+ BUG_ON(!nr_pages);
+ BUG_ON(!pages);
+ page = pages[0];
+ BUG_ON(!page);
+ vi = page->mapping->host;
+ ni = NTFS_I(vi);
+ ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
+ "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
+ vi->i_ino, ni->type, page->index, nr_pages,
+ (long long)pos, bytes);
+ if (NInoNonResident(ni))
+ return ntfs_commit_pages_after_non_resident_write(pages,
+ nr_pages, pos, bytes);
+ BUG_ON(nr_pages > 1);
+ /*
+ * Attribute is resident, implying it is not compressed, encrypted, or
+ * sparse.
+ */
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ BUG_ON(NInoNonResident(ni));
+ /* Map, pin, and lock the mft record. */
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ ctx = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ a = ctx->attr;
+ BUG_ON(a->non_resident);
+ /* The total length of the attribute value. */
+ attr_len = le32_to_cpu(a->data.resident.value_length);
+ i_size = i_size_read(vi);
+ BUG_ON(attr_len != i_size);
+ BUG_ON(pos > attr_len);
+ end = pos + bytes;
+ BUG_ON(end > le32_to_cpu(a->length) -
+ le16_to_cpu(a->data.resident.value_offset));
+ kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
+ kaddr = kmap_atomic(page);
+ /* Copy the received data from the page to the mft record. */
+ memcpy(kattr + pos, kaddr + pos, bytes);
+ /* Update the attribute length if necessary. */
+ if (end > attr_len) {
+ attr_len = end;
+ a->data.resident.value_length = cpu_to_le32(attr_len);
+ }
+ /*
+ * If the page is not uptodate, bring the out of bounds area(s)
+ * uptodate by copying data from the mft record to the page.
+ */
+ if (!PageUptodate(page)) {
+ if (pos > 0)
+ memcpy(kaddr, kattr, pos);
+ if (end < attr_len)
+ memcpy(kaddr + end, kattr + end, attr_len - end);
+ /* Zero the region outside the end of the attribute value. */
+ memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+ kunmap_atomic(kaddr);
+ /* Update initialized_size/i_size if necessary. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ initialized_size = ni->initialized_size;
+ BUG_ON(end > ni->allocated_size);
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ BUG_ON(initialized_size != i_size);
+ if (end > initialized_size) {
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->initialized_size = end;
+ i_size_write(vi, end);
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ }
+ /* Mark the mft record dirty, so it gets written back. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ ntfs_debug("Done.");
+ return 0;
+err_out:
+ if (err == -ENOMEM) {
+ ntfs_warning(vi->i_sb, "Error allocating memory required to "
+ "commit the write.");
+ if (PageUptodate(page)) {
+ ntfs_warning(vi->i_sb, "Page is uptodate, setting "
+ "dirty so the write will be retried "
+ "later on by the VM.");
+ /*
+ * Put the page on mapping->dirty_pages, but leave its
+ * buffers' dirty state as-is.
+ */
+ __set_page_dirty_nobuffers(page);
+ err = 0;
+ } else
+ ntfs_error(vi->i_sb, "Page is not uptodate. Written "
+ "data has been lost.");
+ } else {
+ ntfs_error(vi->i_sb, "Resident attribute commit write failed "
+ "with error %i.", err);
+ NVolSetErrors(ni->vol);
+ }
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ return err;
+}
+
+/*
+ * Copy as much as we can into the pages and return the number of bytes which
+ * were successfully copied. If a fault is encountered then clear the pages
+ * out to (ofs + bytes) and return the number of bytes which were copied.
+ */
+static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
+ unsigned ofs, struct iov_iter *i, size_t bytes)
+{
+ struct page **last_page = pages + nr_pages;
+ size_t total = 0;
+ unsigned len, copied;
+
+ do {
+ len = PAGE_SIZE - ofs;
+ if (len > bytes)
+ len = bytes;
+ copied = copy_page_from_iter_atomic(*pages, ofs, len, i);
+ total += copied;
+ bytes -= copied;
+ if (!bytes)
+ break;
+ if (copied < len)
+ goto err;
+ ofs = 0;
+ } while (++pages < last_page);
+out:
+ return total;
+err:
+ /* Zero the rest of the target like __copy_from_user(). */
+ len = PAGE_SIZE - copied;
+ do {
+ if (len > bytes)
+ len = bytes;
+ zero_user(*pages, copied, len);
+ bytes -= len;
+ copied = 0;
+ len = PAGE_SIZE;
+ } while (++pages < last_page);
+ goto out;
+}
+
+/**
+ * ntfs_perform_write - perform buffered write to a file
+ * @file: file to write to
+ * @i: iov_iter with data to write
+ * @pos: byte offset in file at which to begin writing to
+ */
+static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
+ loff_t pos)
+{
+ struct address_space *mapping = file->f_mapping;
+ struct inode *vi = mapping->host;
+ ntfs_inode *ni = NTFS_I(vi);
+ ntfs_volume *vol = ni->vol;
+ struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
+ struct page *cached_page = NULL;
+ VCN last_vcn;
+ LCN lcn;
+ size_t bytes;
+ ssize_t status, written = 0;
+ unsigned nr_pages;
+
+ ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
+ "0x%llx, count 0x%lx.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type),
+ (unsigned long long)pos,
+ (unsigned long)iov_iter_count(i));
+ /*
+ * If a previous ntfs_truncate() failed, repeat it and abort if it
+ * fails again.
+ */
+ if (unlikely(NInoTruncateFailed(ni))) {
+ int err;
+
+ inode_dio_wait(vi);
+ err = ntfs_truncate(vi);
+ if (err || NInoTruncateFailed(ni)) {
+ if (!err)
+ err = -EIO;
+ ntfs_error(vol->sb, "Cannot perform write to inode "
+ "0x%lx, attribute type 0x%x, because "
+ "ntfs_truncate() failed (error code "
+ "%i).", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ return err;
+ }
+ }
+ /*
+ * Determine the number of pages per cluster for non-resident
+ * attributes.
+ */
+ nr_pages = 1;
+ if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni))
+ nr_pages = vol->cluster_size >> PAGE_SHIFT;
+ last_vcn = -1;
+ do {
+ VCN vcn;
+ pgoff_t start_idx;
+ unsigned ofs, do_pages, u;
+ size_t copied;
+
+ start_idx = pos >> PAGE_SHIFT;
+ ofs = pos & ~PAGE_MASK;
+ bytes = PAGE_SIZE - ofs;
+ do_pages = 1;
+ if (nr_pages > 1) {
+ vcn = pos >> vol->cluster_size_bits;
+ if (vcn != last_vcn) {
+ last_vcn = vcn;
+ /*
+ * Get the lcn of the vcn the write is in. If
+ * it is a hole, need to lock down all pages in
+ * the cluster.
+ */
+ down_read(&ni->runlist.lock);
+ lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
+ vol->cluster_size_bits, false);
+ up_read(&ni->runlist.lock);
+ if (unlikely(lcn < LCN_HOLE)) {
+ if (lcn == LCN_ENOMEM)
+ status = -ENOMEM;
+ else {
+ status = -EIO;
+ ntfs_error(vol->sb, "Cannot "
+ "perform write to "
+ "inode 0x%lx, "
+ "attribute type 0x%x, "
+ "because the attribute "
+ "is corrupt.",
+ vi->i_ino, (unsigned)
+ le32_to_cpu(ni->type));
+ }
+ break;
+ }
+ if (lcn == LCN_HOLE) {
+ start_idx = (pos & ~(s64)
+ vol->cluster_size_mask)
+ >> PAGE_SHIFT;
+ bytes = vol->cluster_size - (pos &
+ vol->cluster_size_mask);
+ do_pages = nr_pages;
+ }
+ }
+ }
+ if (bytes > iov_iter_count(i))
+ bytes = iov_iter_count(i);
+again:
+ /*
+ * Bring in the user page(s) that we will copy from _first_.
+ * Otherwise there is a nasty deadlock on copying from the same
+ * page(s) as we are writing to, without it/them being marked
+ * up-to-date. Note, at present there is nothing to stop the
+ * pages being swapped out between us bringing them into memory
+ * and doing the actual copying.
+ */
+ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+ status = -EFAULT;
+ break;
+ }
+ /* Get and lock @do_pages starting at index @start_idx. */
+ status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
+ pages, &cached_page);
+ if (unlikely(status))
+ break;
+ /*
+ * For non-resident attributes, we need to fill any holes with
+ * actual clusters and ensure all bufferes are mapped. We also
+ * need to bring uptodate any buffers that are only partially
+ * being written to.
+ */
+ if (NInoNonResident(ni)) {
+ status = ntfs_prepare_pages_for_non_resident_write(
+ pages, do_pages, pos, bytes);
+ if (unlikely(status)) {
+ do {
+ unlock_page(pages[--do_pages]);
+ put_page(pages[do_pages]);
+ } while (do_pages);
+ break;
+ }
+ }
+ u = (pos >> PAGE_SHIFT) - pages[0]->index;
+ copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
+ i, bytes);
+ ntfs_flush_dcache_pages(pages + u, do_pages - u);
+ status = 0;
+ if (likely(copied == bytes)) {
+ status = ntfs_commit_pages_after_write(pages, do_pages,
+ pos, bytes);
+ }
+ do {
+ unlock_page(pages[--do_pages]);
+ put_page(pages[do_pages]);
+ } while (do_pages);
+ if (unlikely(status < 0)) {
+ iov_iter_revert(i, copied);
+ break;
+ }
+ cond_resched();
+ if (unlikely(copied < bytes)) {
+ iov_iter_revert(i, copied);
+ if (copied)
+ bytes = copied;
+ else if (bytes > PAGE_SIZE - ofs)
+ bytes = PAGE_SIZE - ofs;
+ goto again;
+ }
+ pos += copied;
+ written += copied;
+ balance_dirty_pages_ratelimited(mapping);
+ if (fatal_signal_pending(current)) {
+ status = -EINTR;
+ break;
+ }
+ } while (iov_iter_count(i));
+ if (cached_page)
+ put_page(cached_page);
+ ntfs_debug("Done. Returning %s (written 0x%lx, status %li).",
+ written ? "written" : "status", (unsigned long)written,
+ (long)status);
+ return written ? written : status;
+}
+
+/**
+ * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Basically the same as generic_file_write_iter() except that it ends up
+ * up calling ntfs_perform_write() instead of generic_perform_write() and that
+ * O_DIRECT is not implemented.
+ */
+static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *vi = file_inode(file);
+ ssize_t written = 0;
+ ssize_t err;
+
+ inode_lock(vi);
+ /* We can write back this queue in page reclaim. */
+ current->backing_dev_info = inode_to_bdi(vi);
+ err = ntfs_prepare_file_for_write(iocb, from);
+ if (iov_iter_count(from) && !err)
+ written = ntfs_perform_write(file, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+ inode_unlock(vi);
+ iocb->ki_pos += written;
+ if (likely(written > 0))
+ written = generic_write_sync(iocb, written);
+ return written ? written : err;
+}
+
+/**
+ * ntfs_file_fsync - sync a file to disk
+ * @filp: file to be synced
+ * @datasync: if non-zero only flush user data and not metadata
+ *
+ * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
+ * system calls. This function is inspired by fs/buffer.c::file_fsync().
+ *
+ * If @datasync is false, write the mft record and all associated extent mft
+ * records as well as the $DATA attribute and then sync the block device.
+ *
+ * If @datasync is true and the attribute is non-resident, we skip the writing
+ * of the mft record and all associated extent mft records (this might still
+ * happen due to the write_inode_now() call).
+ *
+ * Also, if @datasync is true, we do not wait on the inode to be written out
+ * but we always wait on the page cache pages to be written out.
+ *
+ * Locking: Caller must hold i_mutex on the inode.
+ *
+ * TODO: We should probably also write all attribute/index inodes associated
+ * with this inode but since we have no simple way of getting to them we ignore
+ * this problem for now.
+ */
+static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
+ int datasync)
+{
+ struct inode *vi = filp->f_mapping->host;
+ int err, ret = 0;
+
+ ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+
+ err = file_write_and_wait_range(filp, start, end);
+ if (err)
+ return err;
+ inode_lock(vi);
+
+ BUG_ON(S_ISDIR(vi->i_mode));
+ if (!datasync || !NInoNonResident(NTFS_I(vi)))
+ ret = __ntfs_write_inode(vi, 1);
+ write_inode_now(vi, !datasync);
+ /*
+ * NOTE: If we were to use mapping->private_list (see ext2 and
+ * fs/buffer.c) for dirty blocks then we could optimize the below to be
+ * sync_mapping_buffers(vi->i_mapping).
+ */
+ err = sync_blockdev(vi->i_sb->s_bdev);
+ if (unlikely(err && !ret))
+ ret = err;
+ if (likely(!ret))
+ ntfs_debug("Done.");
+ else
+ ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error "
+ "%u.", datasync ? "data" : "", vi->i_ino, -ret);
+ inode_unlock(vi);
+ return ret;
+}
+
+#endif /* NTFS_RW */
+
+const struct file_operations ntfs_file_ops = {
+ .llseek = generic_file_llseek,
+ .read_iter = generic_file_read_iter,
+#ifdef NTFS_RW
+ .write_iter = ntfs_file_write_iter,
+ .fsync = ntfs_file_fsync,
+#endif /* NTFS_RW */
+ .mmap = generic_file_mmap,
+ .open = ntfs_file_open,
+ .splice_read = generic_file_splice_read,
+};
+
+const struct inode_operations ntfs_file_inode_ops = {
+#ifdef NTFS_RW
+ .setattr = ntfs_setattr,
+#endif /* NTFS_RW */
+};
+
+const struct file_operations ntfs_empty_file_ops = {};
+
+const struct inode_operations ntfs_empty_inode_ops = {};
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
new file mode 100644
index 000000000..d46c2c03a
--- /dev/null
+++ b/fs/ntfs/index.c
@@ -0,0 +1,440 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * index.c - NTFS kernel index handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004-2005 Anton Altaparmakov
+ */
+
+#include <linux/slab.h>
+
+#include "aops.h"
+#include "collate.h"
+#include "debug.h"
+#include "index.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_index_ctx_get - allocate and initialize a new index context
+ * @idx_ni: ntfs index inode with which to initialize the context
+ *
+ * Allocate a new index context, initialize it with @idx_ni and return it.
+ * Return NULL if allocation failed.
+ *
+ * Locking: Caller must hold i_mutex on the index inode.
+ */
+ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
+{
+ ntfs_index_context *ictx;
+
+ ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS);
+ if (ictx)
+ *ictx = (ntfs_index_context){ .idx_ni = idx_ni };
+ return ictx;
+}
+
+/**
+ * ntfs_index_ctx_put - release an index context
+ * @ictx: index context to free
+ *
+ * Release the index context @ictx, releasing all associated resources.
+ *
+ * Locking: Caller must hold i_mutex on the index inode.
+ */
+void ntfs_index_ctx_put(ntfs_index_context *ictx)
+{
+ if (ictx->entry) {
+ if (ictx->is_in_root) {
+ if (ictx->actx)
+ ntfs_attr_put_search_ctx(ictx->actx);
+ if (ictx->base_ni)
+ unmap_mft_record(ictx->base_ni);
+ } else {
+ struct page *page = ictx->page;
+ if (page) {
+ BUG_ON(!PageLocked(page));
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ }
+ }
+ }
+ kmem_cache_free(ntfs_index_ctx_cache, ictx);
+ return;
+}
+
+/**
+ * ntfs_index_lookup - find a key in an index and return its index entry
+ * @key: [IN] key for which to search in the index
+ * @key_len: [IN] length of @key in bytes
+ * @ictx: [IN/OUT] context describing the index and the returned entry
+ *
+ * Before calling ntfs_index_lookup(), @ictx must have been obtained from a
+ * call to ntfs_index_ctx_get().
+ *
+ * Look for the @key in the index specified by the index lookup context @ictx.
+ * ntfs_index_lookup() walks the contents of the index looking for the @key.
+ *
+ * If the @key is found in the index, 0 is returned and @ictx is setup to
+ * describe the index entry containing the matching @key. @ictx->entry is the
+ * index entry and @ictx->data and @ictx->data_len are the index entry data and
+ * its length in bytes, respectively.
+ *
+ * If the @key is not found in the index, -ENOENT is returned and @ictx is
+ * setup to describe the index entry whose key collates immediately after the
+ * search @key, i.e. this is the position in the index at which an index entry
+ * with a key of @key would need to be inserted.
+ *
+ * If an error occurs return the negative error code and @ictx is left
+ * untouched.
+ *
+ * When finished with the entry and its data, call ntfs_index_ctx_put() to free
+ * the context and other associated resources.
+ *
+ * If the index entry was modified, call flush_dcache_index_entry_page()
+ * immediately after the modification and either ntfs_index_entry_mark_dirty()
+ * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
+ * ensure that the changes are written to disk.
+ *
+ * Locking: - Caller must hold i_mutex on the index inode.
+ * - Each page cache page in the index allocation mapping must be
+ * locked whilst being accessed otherwise we may find a corrupt
+ * page due to it being under ->writepage at the moment which
+ * applies the mst protection fixups before writing out and then
+ * removes them again after the write is complete after which it
+ * unlocks the page.
+ */
+int ntfs_index_lookup(const void *key, const int key_len,
+ ntfs_index_context *ictx)
+{
+ VCN vcn, old_vcn;
+ ntfs_inode *idx_ni = ictx->idx_ni;
+ ntfs_volume *vol = idx_ni->vol;
+ struct super_block *sb = vol->sb;
+ ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino;
+ MFT_RECORD *m;
+ INDEX_ROOT *ir;
+ INDEX_ENTRY *ie;
+ INDEX_ALLOCATION *ia;
+ u8 *index_end, *kaddr;
+ ntfs_attr_search_ctx *actx;
+ struct address_space *ia_mapping;
+ struct page *page;
+ int rc, err = 0;
+
+ ntfs_debug("Entering.");
+ BUG_ON(!NInoAttr(idx_ni));
+ BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION);
+ BUG_ON(idx_ni->nr_extents != -1);
+ BUG_ON(!base_ni);
+ BUG_ON(!key);
+ BUG_ON(key_len <= 0);
+ if (!ntfs_is_collation_rule_supported(
+ idx_ni->itype.index.collation_rule)) {
+ ntfs_error(sb, "Index uses unsupported collation rule 0x%x. "
+ "Aborting lookup.", le32_to_cpu(
+ idx_ni->itype.index.collation_rule));
+ return -EOPNOTSUPP;
+ }
+ /* Get hold of the mft record for the index inode. */
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ ntfs_error(sb, "map_mft_record() failed with error code %ld.",
+ -PTR_ERR(m));
+ return PTR_ERR(m);
+ }
+ actx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!actx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ /* Find the index root attribute in the mft record. */
+ err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, actx);
+ if (unlikely(err)) {
+ if (err == -ENOENT) {
+ ntfs_error(sb, "Index root attribute missing in inode "
+ "0x%lx.", idx_ni->mft_no);
+ err = -EIO;
+ }
+ goto err_out;
+ }
+ /* Get to the index root value (it has been verified in read_inode). */
+ ir = (INDEX_ROOT*)((u8*)actx->attr +
+ le16_to_cpu(actx->attr->data.resident.value_offset));
+ index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+ /* The first index entry. */
+ ie = (INDEX_ENTRY*)((u8*)&ir->index +
+ le32_to_cpu(ir->index.entries_offset));
+ /*
+ * Loop until we exceed valid memory (corruption case) or until we
+ * reach the last entry.
+ */
+ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+ /* Bounds checks. */
+ if ((u8*)ie < (u8*)actx->mrec || (u8*)ie +
+ sizeof(INDEX_ENTRY_HEADER) > index_end ||
+ (u8*)ie + le16_to_cpu(ie->length) > index_end)
+ goto idx_err_out;
+ /*
+ * The last entry cannot contain a key. It can however contain
+ * a pointer to a child node in the B+tree so we just break out.
+ */
+ if (ie->flags & INDEX_ENTRY_END)
+ break;
+ /* Further bounds checks. */
+ if ((u32)sizeof(INDEX_ENTRY_HEADER) +
+ le16_to_cpu(ie->key_length) >
+ le16_to_cpu(ie->data.vi.data_offset) ||
+ (u32)le16_to_cpu(ie->data.vi.data_offset) +
+ le16_to_cpu(ie->data.vi.data_length) >
+ le16_to_cpu(ie->length))
+ goto idx_err_out;
+ /* If the keys match perfectly, we setup @ictx and return 0. */
+ if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
+ &ie->key, key_len)) {
+ir_done:
+ ictx->is_in_root = true;
+ ictx->ir = ir;
+ ictx->actx = actx;
+ ictx->base_ni = base_ni;
+ ictx->ia = NULL;
+ ictx->page = NULL;
+done:
+ ictx->entry = ie;
+ ictx->data = (u8*)ie +
+ le16_to_cpu(ie->data.vi.data_offset);
+ ictx->data_len = le16_to_cpu(ie->data.vi.data_length);
+ ntfs_debug("Done.");
+ return err;
+ }
+ /*
+ * Not a perfect match, need to do full blown collation so we
+ * know which way in the B+tree we have to go.
+ */
+ rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
+ key_len, &ie->key, le16_to_cpu(ie->key_length));
+ /*
+ * If @key collates before the key of the current entry, there
+ * is definitely no such key in this index but we might need to
+ * descend into the B+tree so we just break out of the loop.
+ */
+ if (rc == -1)
+ break;
+ /*
+ * A match should never happen as the memcmp() call should have
+ * cought it, but we still treat it correctly.
+ */
+ if (!rc)
+ goto ir_done;
+ /* The keys are not equal, continue the search. */
+ }
+ /*
+ * We have finished with this index without success. Check for the
+ * presence of a child node and if not present setup @ictx and return
+ * -ENOENT.
+ */
+ if (!(ie->flags & INDEX_ENTRY_NODE)) {
+ ntfs_debug("Entry not found.");
+ err = -ENOENT;
+ goto ir_done;
+ } /* Child node present, descend into it. */
+ /* Consistency check: Verify that an index allocation exists. */
+ if (!NInoIndexAllocPresent(idx_ni)) {
+ ntfs_error(sb, "No index allocation attribute but index entry "
+ "requires one. Inode 0x%lx is corrupt or "
+ "driver bug.", idx_ni->mft_no);
+ goto err_out;
+ }
+ /* Get the starting vcn of the index_block holding the child node. */
+ vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+ ia_mapping = VFS_I(idx_ni)->i_mapping;
+ /*
+ * We are done with the index root and the mft record. Release them,
+ * otherwise we deadlock with ntfs_map_page().
+ */
+ ntfs_attr_put_search_ctx(actx);
+ unmap_mft_record(base_ni);
+ m = NULL;
+ actx = NULL;
+descend_into_child_node:
+ /*
+ * Convert vcn to index into the index allocation attribute in units
+ * of PAGE_SIZE and map the page cache page, reading it from
+ * disk if necessary.
+ */
+ page = ntfs_map_page(ia_mapping, vcn <<
+ idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
+ if (IS_ERR(page)) {
+ ntfs_error(sb, "Failed to map index page, error %ld.",
+ -PTR_ERR(page));
+ err = PTR_ERR(page);
+ goto err_out;
+ }
+ lock_page(page);
+ kaddr = (u8*)page_address(page);
+fast_descend_into_child_node:
+ /* Get to the index allocation block. */
+ ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
+ idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
+ /* Bounds checks. */
+ if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
+ ntfs_error(sb, "Out of bounds check failed. Corrupt inode "
+ "0x%lx or driver bug.", idx_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* Catch multi sector transfer fixup errors. */
+ if (unlikely(!ntfs_is_indx_record(ia->magic))) {
+ ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. "
+ "Corrupt inode 0x%lx. Run chkdsk.",
+ (long long)vcn, idx_ni->mft_no);
+ goto unm_err_out;
+ }
+ if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
+ ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
+ "different from expected VCN (0x%llx). Inode "
+ "0x%lx is corrupt or driver bug.",
+ (unsigned long long)
+ sle64_to_cpu(ia->index_block_vcn),
+ (unsigned long long)vcn, idx_ni->mft_no);
+ goto unm_err_out;
+ }
+ if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
+ idx_ni->itype.index.block_size) {
+ ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has "
+ "a size (%u) differing from the index "
+ "specified size (%u). Inode is corrupt or "
+ "driver bug.", (unsigned long long)vcn,
+ idx_ni->mft_no,
+ le32_to_cpu(ia->index.allocated_size) + 0x18,
+ idx_ni->itype.index.block_size);
+ goto unm_err_out;
+ }
+ index_end = (u8*)ia + idx_ni->itype.index.block_size;
+ if (index_end > kaddr + PAGE_SIZE) {
+ ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
+ "crosses page boundary. Impossible! Cannot "
+ "access! This is probably a bug in the "
+ "driver.", (unsigned long long)vcn,
+ idx_ni->mft_no);
+ goto unm_err_out;
+ }
+ index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
+ if (index_end > (u8*)ia + idx_ni->itype.index.block_size) {
+ ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode "
+ "0x%lx exceeds maximum size.",
+ (unsigned long long)vcn, idx_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* The first index entry. */
+ ie = (INDEX_ENTRY*)((u8*)&ia->index +
+ le32_to_cpu(ia->index.entries_offset));
+ /*
+ * Iterate similar to above big loop but applied to index buffer, thus
+ * loop until we exceed valid memory (corruption case) or until we
+ * reach the last entry.
+ */
+ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
+ /* Bounds checks. */
+ if ((u8*)ie < (u8*)ia || (u8*)ie +
+ sizeof(INDEX_ENTRY_HEADER) > index_end ||
+ (u8*)ie + le16_to_cpu(ie->length) > index_end) {
+ ntfs_error(sb, "Index entry out of bounds in inode "
+ "0x%lx.", idx_ni->mft_no);
+ goto unm_err_out;
+ }
+ /*
+ * The last entry cannot contain a key. It can however contain
+ * a pointer to a child node in the B+tree so we just break out.
+ */
+ if (ie->flags & INDEX_ENTRY_END)
+ break;
+ /* Further bounds checks. */
+ if ((u32)sizeof(INDEX_ENTRY_HEADER) +
+ le16_to_cpu(ie->key_length) >
+ le16_to_cpu(ie->data.vi.data_offset) ||
+ (u32)le16_to_cpu(ie->data.vi.data_offset) +
+ le16_to_cpu(ie->data.vi.data_length) >
+ le16_to_cpu(ie->length)) {
+ ntfs_error(sb, "Index entry out of bounds in inode "
+ "0x%lx.", idx_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* If the keys match perfectly, we setup @ictx and return 0. */
+ if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
+ &ie->key, key_len)) {
+ia_done:
+ ictx->is_in_root = false;
+ ictx->actx = NULL;
+ ictx->base_ni = NULL;
+ ictx->ia = ia;
+ ictx->page = page;
+ goto done;
+ }
+ /*
+ * Not a perfect match, need to do full blown collation so we
+ * know which way in the B+tree we have to go.
+ */
+ rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
+ key_len, &ie->key, le16_to_cpu(ie->key_length));
+ /*
+ * If @key collates before the key of the current entry, there
+ * is definitely no such key in this index but we might need to
+ * descend into the B+tree so we just break out of the loop.
+ */
+ if (rc == -1)
+ break;
+ /*
+ * A match should never happen as the memcmp() call should have
+ * cought it, but we still treat it correctly.
+ */
+ if (!rc)
+ goto ia_done;
+ /* The keys are not equal, continue the search. */
+ }
+ /*
+ * We have finished with this index buffer without success. Check for
+ * the presence of a child node and if not present return -ENOENT.
+ */
+ if (!(ie->flags & INDEX_ENTRY_NODE)) {
+ ntfs_debug("Entry not found.");
+ err = -ENOENT;
+ goto ia_done;
+ }
+ if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
+ ntfs_error(sb, "Index entry with child node found in a leaf "
+ "node in inode 0x%lx.", idx_ni->mft_no);
+ goto unm_err_out;
+ }
+ /* Child node present, descend into it. */
+ old_vcn = vcn;
+ vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
+ if (vcn >= 0) {
+ /*
+ * If vcn is in the same page cache page as old_vcn we recycle
+ * the mapped page.
+ */
+ if (old_vcn << vol->cluster_size_bits >>
+ PAGE_SHIFT == vcn <<
+ vol->cluster_size_bits >>
+ PAGE_SHIFT)
+ goto fast_descend_into_child_node;
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ goto descend_into_child_node;
+ }
+ ntfs_error(sb, "Negative child node vcn in inode 0x%lx.",
+ idx_ni->mft_no);
+unm_err_out:
+ unlock_page(page);
+ ntfs_unmap_page(page);
+err_out:
+ if (!err)
+ err = -EIO;
+ if (actx)
+ ntfs_attr_put_search_ctx(actx);
+ if (m)
+ unmap_mft_record(base_ni);
+ return err;
+idx_err_out:
+ ntfs_error(sb, "Corrupt index. Aborting lookup.");
+ goto err_out;
+}
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
new file mode 100644
index 000000000..bb3c3ae55
--- /dev/null
+++ b/fs/ntfs/index.h
@@ -0,0 +1,134 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * index.h - Defines for NTFS kernel index handling. Part of the Linux-NTFS
+ * project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_INDEX_H
+#define _LINUX_NTFS_INDEX_H
+
+#include <linux/fs.h>
+
+#include "types.h"
+#include "layout.h"
+#include "inode.h"
+#include "attrib.h"
+#include "mft.h"
+#include "aops.h"
+
+/**
+ * @idx_ni: index inode containing the @entry described by this context
+ * @entry: index entry (points into @ir or @ia)
+ * @data: index entry data (points into @entry)
+ * @data_len: length in bytes of @data
+ * @is_in_root: 'true' if @entry is in @ir and 'false' if it is in @ia
+ * @ir: index root if @is_in_root and NULL otherwise
+ * @actx: attribute search context if @is_in_root and NULL otherwise
+ * @base_ni: base inode if @is_in_root and NULL otherwise
+ * @ia: index block if @is_in_root is 'false' and NULL otherwise
+ * @page: page if @is_in_root is 'false' and NULL otherwise
+ *
+ * @idx_ni is the index inode this context belongs to.
+ *
+ * @entry is the index entry described by this context. @data and @data_len
+ * are the index entry data and its length in bytes, respectively. @data
+ * simply points into @entry. This is probably what the user is interested in.
+ *
+ * If @is_in_root is 'true', @entry is in the index root attribute @ir described
+ * by the attribute search context @actx and the base inode @base_ni. @ia and
+ * @page are NULL in this case.
+ *
+ * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia
+ * and @page point to the index allocation block and the mapped, locked page it
+ * is in, respectively. @ir, @actx and @base_ni are NULL in this case.
+ *
+ * To obtain a context call ntfs_index_ctx_get().
+ *
+ * We use this context to allow ntfs_index_lookup() to return the found index
+ * @entry and its @data without having to allocate a buffer and copy the @entry
+ * and/or its @data into it.
+ *
+ * When finished with the @entry and its @data, call ntfs_index_ctx_put() to
+ * free the context and other associated resources.
+ *
+ * If the index entry was modified, call flush_dcache_index_entry_page()
+ * immediately after the modification and either ntfs_index_entry_mark_dirty()
+ * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
+ * ensure that the changes are written to disk.
+ */
+typedef struct {
+ ntfs_inode *idx_ni;
+ INDEX_ENTRY *entry;
+ void *data;
+ u16 data_len;
+ bool is_in_root;
+ INDEX_ROOT *ir;
+ ntfs_attr_search_ctx *actx;
+ ntfs_inode *base_ni;
+ INDEX_ALLOCATION *ia;
+ struct page *page;
+} ntfs_index_context;
+
+extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni);
+extern void ntfs_index_ctx_put(ntfs_index_context *ictx);
+
+extern int ntfs_index_lookup(const void *key, const int key_len,
+ ntfs_index_context *ictx);
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries
+ * @ictx: ntfs index context describing the index entry
+ *
+ * Call flush_dcache_page() for the page in which an index entry resides.
+ *
+ * This must be called every time an index entry is modified, just after the
+ * modification.
+ *
+ * If the index entry is in the index root attribute, simply flush the page
+ * containing the mft record containing the index root attribute.
+ *
+ * If the index entry is in an index block belonging to the index allocation
+ * attribute, simply flush the page cache page containing the index block.
+ */
+static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx)
+{
+ if (ictx->is_in_root)
+ flush_dcache_mft_record_page(ictx->actx->ntfs_ino);
+ else
+ flush_dcache_page(ictx->page);
+}
+
+/**
+ * ntfs_index_entry_mark_dirty - mark an index entry dirty
+ * @ictx: ntfs index context describing the index entry
+ *
+ * Mark the index entry described by the index entry context @ictx dirty.
+ *
+ * If the index entry is in the index root attribute, simply mark the mft
+ * record containing the index root attribute dirty. This ensures the mft
+ * record, and hence the index root attribute, will be written out to disk
+ * later.
+ *
+ * If the index entry is in an index block belonging to the index allocation
+ * attribute, mark the buffers belonging to the index record as well as the
+ * page cache page the index block is in dirty. This automatically marks the
+ * VFS inode of the ntfs index inode to which the index entry belongs dirty,
+ * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the
+ * dirty index block, will be written out to disk later.
+ */
+static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
+{
+ if (ictx->is_in_root)
+ mark_mft_record_dirty(ictx->actx->ntfs_ino);
+ else
+ mark_ntfs_record_dirty(ictx->page,
+ (u8*)ictx->ia - (u8*)page_address(ictx->page));
+}
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_INDEX_H */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
new file mode 100644
index 000000000..08c659332
--- /dev/null
+++ b/fs/ntfs/inode.c
@@ -0,0 +1,3100 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/**
+ * inode.c - NTFS kernel inode handling.
+ *
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/quotaops.h>
+#include <linux/slab.h>
+#include <linux/log2.h>
+
+#include "aops.h"
+#include "attrib.h"
+#include "bitmap.h"
+#include "dir.h"
+#include "debug.h"
+#include "inode.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
+#include "time.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_test_inode - compare two (possibly fake) inodes for equality
+ * @vi: vfs inode which to test
+ * @data: data which is being tested with
+ *
+ * Compare the ntfs attribute embedded in the ntfs specific part of the vfs
+ * inode @vi for equality with the ntfs attribute @data.
+ *
+ * If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
+ * @na->name and @na->name_len are then ignored.
+ *
+ * Return 1 if the attributes match and 0 if not.
+ *
+ * NOTE: This function runs with the inode_hash_lock spin lock held so it is not
+ * allowed to sleep.
+ */
+int ntfs_test_inode(struct inode *vi, void *data)
+{
+ ntfs_attr *na = (ntfs_attr *)data;
+ ntfs_inode *ni;
+
+ if (vi->i_ino != na->mft_no)
+ return 0;
+ ni = NTFS_I(vi);
+ /* If !NInoAttr(ni), @vi is a normal file or directory inode. */
+ if (likely(!NInoAttr(ni))) {
+ /* If not looking for a normal inode this is a mismatch. */
+ if (unlikely(na->type != AT_UNUSED))
+ return 0;
+ } else {
+ /* A fake inode describing an attribute. */
+ if (ni->type != na->type)
+ return 0;
+ if (ni->name_len != na->name_len)
+ return 0;
+ if (na->name_len && memcmp(ni->name, na->name,
+ na->name_len * sizeof(ntfschar)))
+ return 0;
+ }
+ /* Match! */
+ return 1;
+}
+
+/**
+ * ntfs_init_locked_inode - initialize an inode
+ * @vi: vfs inode to initialize
+ * @data: data which to initialize @vi to
+ *
+ * Initialize the vfs inode @vi with the values from the ntfs attribute @data in
+ * order to enable ntfs_test_inode() to do its work.
+ *
+ * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
+ * In that case, @na->name and @na->name_len should be set to NULL and 0,
+ * respectively. Although that is not strictly necessary as
+ * ntfs_read_locked_inode() will fill them in later.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * NOTE: This function runs with the inode->i_lock spin lock held so it is not
+ * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
+ */
+static int ntfs_init_locked_inode(struct inode *vi, void *data)
+{
+ ntfs_attr *na = (ntfs_attr *)data;
+ ntfs_inode *ni = NTFS_I(vi);
+
+ vi->i_ino = na->mft_no;
+
+ ni->type = na->type;
+ if (na->type == AT_INDEX_ALLOCATION)
+ NInoSetMstProtected(ni);
+
+ ni->name = na->name;
+ ni->name_len = na->name_len;
+
+ /* If initializing a normal inode, we are done. */
+ if (likely(na->type == AT_UNUSED)) {
+ BUG_ON(na->name);
+ BUG_ON(na->name_len);
+ return 0;
+ }
+
+ /* It is a fake inode. */
+ NInoSetAttr(ni);
+
+ /*
+ * We have I30 global constant as an optimization as it is the name
+ * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC
+ * allocation but that is ok. And most attributes are unnamed anyway,
+ * thus the fraction of named attributes with name != I30 is actually
+ * absolutely tiny.
+ */
+ if (na->name_len && na->name != I30) {
+ unsigned int i;
+
+ BUG_ON(!na->name);
+ i = na->name_len * sizeof(ntfschar);
+ ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
+ if (!ni->name)
+ return -ENOMEM;
+ memcpy(ni->name, na->name, i);
+ ni->name[na->name_len] = 0;
+ }
+ return 0;
+}
+
+static int ntfs_read_locked_inode(struct inode *vi);
+static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi);
+static int ntfs_read_locked_index_inode(struct inode *base_vi,
+ struct inode *vi);
+
+/**
+ * ntfs_iget - obtain a struct inode corresponding to a specific normal inode
+ * @sb: super block of mounted volume
+ * @mft_no: mft record number / inode number to obtain
+ *
+ * Obtain the struct inode corresponding to a specific normal inode (i.e. a
+ * file or directory).
+ *
+ * If the inode is in the cache, it is just returned with an increased
+ * reference count. Otherwise, a new struct inode is allocated and initialized,
+ * and finally ntfs_read_locked_inode() is called to read in the inode and
+ * fill in the remainder of the inode structure.
+ *
+ * Return the struct inode on success. Check the return value with IS_ERR() and
+ * if true, the function failed and the error code is obtained from PTR_ERR().
+ */
+struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
+{
+ struct inode *vi;
+ int err;
+ ntfs_attr na;
+
+ na.mft_no = mft_no;
+ na.type = AT_UNUSED;
+ na.name = NULL;
+ na.name_len = 0;
+
+ vi = iget5_locked(sb, mft_no, ntfs_test_inode,
+ ntfs_init_locked_inode, &na);
+ if (unlikely(!vi))
+ return ERR_PTR(-ENOMEM);
+
+ err = 0;
+
+ /* If this is a freshly allocated inode, need to read it now. */
+ if (vi->i_state & I_NEW) {
+ err = ntfs_read_locked_inode(vi);
+ unlock_new_inode(vi);
+ }
+ /*
+ * There is no point in keeping bad inodes around if the failure was
+ * due to ENOMEM. We want to be able to retry again later.
+ */
+ if (unlikely(err == -ENOMEM)) {
+ iput(vi);
+ vi = ERR_PTR(err);
+ }
+ return vi;
+}
+
+/**
+ * ntfs_attr_iget - obtain a struct inode corresponding to an attribute
+ * @base_vi: vfs base inode containing the attribute
+ * @type: attribute type
+ * @name: Unicode name of the attribute (NULL if unnamed)
+ * @name_len: length of @name in Unicode characters (0 if unnamed)
+ *
+ * Obtain the (fake) struct inode corresponding to the attribute specified by
+ * @type, @name, and @name_len, which is present in the base mft record
+ * specified by the vfs inode @base_vi.
+ *
+ * If the attribute inode is in the cache, it is just returned with an
+ * increased reference count. Otherwise, a new struct inode is allocated and
+ * initialized, and finally ntfs_read_locked_attr_inode() is called to read the
+ * attribute and fill in the inode structure.
+ *
+ * Note, for index allocation attributes, you need to use ntfs_index_iget()
+ * instead of ntfs_attr_iget() as working with indices is a lot more complex.
+ *
+ * Return the struct inode of the attribute inode on success. Check the return
+ * value with IS_ERR() and if true, the function failed and the error code is
+ * obtained from PTR_ERR().
+ */
+struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
+ ntfschar *name, u32 name_len)
+{
+ struct inode *vi;
+ int err;
+ ntfs_attr na;
+
+ /* Make sure no one calls ntfs_attr_iget() for indices. */
+ BUG_ON(type == AT_INDEX_ALLOCATION);
+
+ na.mft_no = base_vi->i_ino;
+ na.type = type;
+ na.name = name;
+ na.name_len = name_len;
+
+ vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
+ ntfs_init_locked_inode, &na);
+ if (unlikely(!vi))
+ return ERR_PTR(-ENOMEM);
+
+ err = 0;
+
+ /* If this is a freshly allocated inode, need to read it now. */
+ if (vi->i_state & I_NEW) {
+ err = ntfs_read_locked_attr_inode(base_vi, vi);
+ unlock_new_inode(vi);
+ }
+ /*
+ * There is no point in keeping bad attribute inodes around. This also
+ * simplifies things in that we never need to check for bad attribute
+ * inodes elsewhere.
+ */
+ if (unlikely(err)) {
+ iput(vi);
+ vi = ERR_PTR(err);
+ }
+ return vi;
+}
+
+/**
+ * ntfs_index_iget - obtain a struct inode corresponding to an index
+ * @base_vi: vfs base inode containing the index related attributes
+ * @name: Unicode name of the index
+ * @name_len: length of @name in Unicode characters
+ *
+ * Obtain the (fake) struct inode corresponding to the index specified by @name
+ * and @name_len, which is present in the base mft record specified by the vfs
+ * inode @base_vi.
+ *
+ * If the index inode is in the cache, it is just returned with an increased
+ * reference count. Otherwise, a new struct inode is allocated and
+ * initialized, and finally ntfs_read_locked_index_inode() is called to read
+ * the index related attributes and fill in the inode structure.
+ *
+ * Return the struct inode of the index inode on success. Check the return
+ * value with IS_ERR() and if true, the function failed and the error code is
+ * obtained from PTR_ERR().
+ */
+struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
+ u32 name_len)
+{
+ struct inode *vi;
+ int err;
+ ntfs_attr na;
+
+ na.mft_no = base_vi->i_ino;
+ na.type = AT_INDEX_ALLOCATION;
+ na.name = name;
+ na.name_len = name_len;
+
+ vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
+ ntfs_init_locked_inode, &na);
+ if (unlikely(!vi))
+ return ERR_PTR(-ENOMEM);
+
+ err = 0;
+
+ /* If this is a freshly allocated inode, need to read it now. */
+ if (vi->i_state & I_NEW) {
+ err = ntfs_read_locked_index_inode(base_vi, vi);
+ unlock_new_inode(vi);
+ }
+ /*
+ * There is no point in keeping bad index inodes around. This also
+ * simplifies things in that we never need to check for bad index
+ * inodes elsewhere.
+ */
+ if (unlikely(err)) {
+ iput(vi);
+ vi = ERR_PTR(err);
+ }
+ return vi;
+}
+
+struct inode *ntfs_alloc_big_inode(struct super_block *sb)
+{
+ ntfs_inode *ni;
+
+ ntfs_debug("Entering.");
+ ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS);
+ if (likely(ni != NULL)) {
+ ni->state = 0;
+ return VFS_I(ni);
+ }
+ ntfs_error(sb, "Allocation of NTFS big inode structure failed.");
+ return NULL;
+}
+
+void ntfs_free_big_inode(struct inode *inode)
+{
+ kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
+}
+
+static inline ntfs_inode *ntfs_alloc_extent_inode(void)
+{
+ ntfs_inode *ni;
+
+ ntfs_debug("Entering.");
+ ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS);
+ if (likely(ni != NULL)) {
+ ni->state = 0;
+ return ni;
+ }
+ ntfs_error(NULL, "Allocation of NTFS inode structure failed.");
+ return NULL;
+}
+
+static void ntfs_destroy_extent_inode(ntfs_inode *ni)
+{
+ ntfs_debug("Entering.");
+ BUG_ON(ni->page);
+ if (!atomic_dec_and_test(&ni->count))
+ BUG();
+ kmem_cache_free(ntfs_inode_cache, ni);
+}
+
+/*
+ * The attribute runlist lock has separate locking rules from the
+ * normal runlist lock, so split the two lock-classes:
+ */
+static struct lock_class_key attr_list_rl_lock_class;
+
+/**
+ * __ntfs_init_inode - initialize ntfs specific part of an inode
+ * @sb: super block of mounted volume
+ * @ni: freshly allocated ntfs inode which to initialize
+ *
+ * Initialize an ntfs inode to defaults.
+ *
+ * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left
+ * untouched. Make sure to initialize them elsewhere.
+ *
+ * Return zero on success and -ENOMEM on error.
+ */
+void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
+{
+ ntfs_debug("Entering.");
+ rwlock_init(&ni->size_lock);
+ ni->initialized_size = ni->allocated_size = 0;
+ ni->seq_no = 0;
+ atomic_set(&ni->count, 1);
+ ni->vol = NTFS_SB(sb);
+ ntfs_init_runlist(&ni->runlist);
+ mutex_init(&ni->mrec_lock);
+ ni->page = NULL;
+ ni->page_ofs = 0;
+ ni->attr_list_size = 0;
+ ni->attr_list = NULL;
+ ntfs_init_runlist(&ni->attr_list_rl);
+ lockdep_set_class(&ni->attr_list_rl.lock,
+ &attr_list_rl_lock_class);
+ ni->itype.index.block_size = 0;
+ ni->itype.index.vcn_size = 0;
+ ni->itype.index.collation_rule = 0;
+ ni->itype.index.block_size_bits = 0;
+ ni->itype.index.vcn_size_bits = 0;
+ mutex_init(&ni->extent_lock);
+ ni->nr_extents = 0;
+ ni->ext.base_ntfs_ino = NULL;
+}
+
+/*
+ * Extent inodes get MFT-mapped in a nested way, while the base inode
+ * is still mapped. Teach this nesting to the lock validator by creating
+ * a separate class for nested inode's mrec_lock's:
+ */
+static struct lock_class_key extent_inode_mrec_lock_key;
+
+inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
+ unsigned long mft_no)
+{
+ ntfs_inode *ni = ntfs_alloc_extent_inode();
+
+ ntfs_debug("Entering.");
+ if (likely(ni != NULL)) {
+ __ntfs_init_inode(sb, ni);
+ lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key);
+ ni->mft_no = mft_no;
+ ni->type = AT_UNUSED;
+ ni->name = NULL;
+ ni->name_len = 0;
+ }
+ return ni;
+}
+
+/**
+ * ntfs_is_extended_system_file - check if a file is in the $Extend directory
+ * @ctx: initialized attribute search context
+ *
+ * Search all file name attributes in the inode described by the attribute
+ * search context @ctx and check if any of the names are in the $Extend system
+ * directory.
+ *
+ * Return values:
+ * 1: file is in $Extend directory
+ * 0: file is not in $Extend directory
+ * -errno: failed to determine if the file is in the $Extend directory
+ */
+static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx)
+{
+ int nr_links, err;
+
+ /* Restart search. */
+ ntfs_attr_reinit_search_ctx(ctx);
+
+ /* Get number of hard links. */
+ nr_links = le16_to_cpu(ctx->mrec->link_count);
+
+ /* Loop through all hard links. */
+ while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0,
+ ctx))) {
+ FILE_NAME_ATTR *file_name_attr;
+ ATTR_RECORD *attr = ctx->attr;
+ u8 *p, *p2;
+
+ nr_links--;
+ /*
+ * Maximum sanity checking as we are called on an inode that
+ * we suspect might be corrupt.
+ */
+ p = (u8*)attr + le32_to_cpu(attr->length);
+ if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec +
+ le32_to_cpu(ctx->mrec->bytes_in_use)) {
+err_corrupt_attr:
+ ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name "
+ "attribute. You should run chkdsk.");
+ return -EIO;
+ }
+ if (attr->non_resident) {
+ ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file "
+ "name. You should run chkdsk.");
+ return -EIO;
+ }
+ if (attr->flags) {
+ ntfs_error(ctx->ntfs_ino->vol->sb, "File name with "
+ "invalid flags. You should run "
+ "chkdsk.");
+ return -EIO;
+ }
+ if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) {
+ ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file "
+ "name. You should run chkdsk.");
+ return -EIO;
+ }
+ file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
+ le16_to_cpu(attr->data.resident.value_offset));
+ p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length);
+ if (p2 < (u8*)attr || p2 > p)
+ goto err_corrupt_attr;
+ /* This attribute is ok, but is it in the $Extend directory? */
+ if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend)
+ return 1; /* YES, it's an extended system file. */
+ }
+ if (unlikely(err != -ENOENT))
+ return err;
+ if (unlikely(nr_links)) {
+ ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count "
+ "doesn't match number of name attributes. You "
+ "should run chkdsk.");
+ return -EIO;
+ }
+ return 0; /* NO, it is not an extended system file. */
+}
+
+/**
+ * ntfs_read_locked_inode - read an inode from its device
+ * @vi: inode to read
+ *
+ * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode
+ * described by @vi into memory from the device.
+ *
+ * The only fields in @vi that we need to/can look at when the function is
+ * called are i_sb, pointing to the mounted device's super block, and i_ino,
+ * the number of the inode to load.
+ *
+ * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino
+ * for reading and sets up the necessary @vi fields as well as initializing
+ * the ntfs inode.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_NEW set, hence the inode is locked, also
+ * i_count is set to 1, so it is not going to go away
+ * i_flags is set to 0 and we have no business touching it. Only an ioctl()
+ * is allowed to write to them. We should of course be honouring them but
+ * we need to do that using the IS_* macros defined in include/linux/fs.h.
+ * In any case ntfs_read_locked_inode() has nothing to do with i_flags.
+ *
+ * Return 0 on success and -errno on error. In the error case, the inode will
+ * have had make_bad_inode() executed on it.
+ */
+static int ntfs_read_locked_inode(struct inode *vi)
+{
+ ntfs_volume *vol = NTFS_SB(vi->i_sb);
+ ntfs_inode *ni;
+ struct inode *bvi;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ STANDARD_INFORMATION *si;
+ ntfs_attr_search_ctx *ctx;
+ int err = 0;
+
+ ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
+
+ /* Setup the generic vfs inode parts now. */
+ vi->i_uid = vol->uid;
+ vi->i_gid = vol->gid;
+ vi->i_mode = 0;
+
+ /*
+ * Initialize the ntfs specific part of @vi special casing
+ * FILE_MFT which we need to do at mount time.
+ */
+ if (vi->i_ino != FILE_MFT)
+ ntfs_init_big_inode(vi);
+ ni = NTFS_I(vi);
+
+ m = map_mft_record(ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(ni, m);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto unm_err_out;
+ }
+
+ if (!(m->flags & MFT_RECORD_IN_USE)) {
+ ntfs_error(vi->i_sb, "Inode is not in use!");
+ goto unm_err_out;
+ }
+ if (m->base_mft_record) {
+ ntfs_error(vi->i_sb, "Inode is an extent inode!");
+ goto unm_err_out;
+ }
+
+ /* Transfer information from mft record into vfs and ntfs inodes. */
+ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
+
+ /*
+ * FIXME: Keep in mind that link_count is two for files which have both
+ * a long file name and a short file name as separate entries, so if
+ * we are hiding short file names this will be too high. Either we need
+ * to account for the short file names by subtracting them or we need
+ * to make sure we delete files even though i_nlink is not zero which
+ * might be tricky due to vfs interactions. Need to think about this
+ * some more when implementing the unlink command.
+ */
+ set_nlink(vi, le16_to_cpu(m->link_count));
+ /*
+ * FIXME: Reparse points can have the directory bit set even though
+ * they would be S_IFLNK. Need to deal with this further below when we
+ * implement reparse points / symbolic links but it will do for now.
+ * Also if not a directory, it could be something else, rather than
+ * a regular file. But again, will do for now.
+ */
+ /* Everyone gets all permissions. */
+ vi->i_mode |= S_IRWXUGO;
+ /* If read-only, no one gets write permissions. */
+ if (IS_RDONLY(vi))
+ vi->i_mode &= ~S_IWUGO;
+ if (m->flags & MFT_RECORD_IS_DIRECTORY) {
+ vi->i_mode |= S_IFDIR;
+ /*
+ * Apply the directory permissions mask set in the mount
+ * options.
+ */
+ vi->i_mode &= ~vol->dmask;
+ /* Things break without this kludge! */
+ if (vi->i_nlink > 1)
+ set_nlink(vi, 1);
+ } else {
+ vi->i_mode |= S_IFREG;
+ /* Apply the file permissions mask set in the mount options. */
+ vi->i_mode &= ~vol->fmask;
+ }
+ /*
+ * Find the standard information attribute in the mft record. At this
+ * stage we haven't setup the attribute list stuff yet, so this could
+ * in fact fail if the standard information is in an extent record, but
+ * I don't think this actually ever happens.
+ */
+ err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0,
+ ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT) {
+ /*
+ * TODO: We should be performing a hot fix here (if the
+ * recover mount option is set) by creating a new
+ * attribute.
+ */
+ ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute "
+ "is missing.");
+ }
+ goto unm_err_out;
+ }
+ a = ctx->attr;
+ /* Get the standard information attribute value. */
+ if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset)
+ + le32_to_cpu(a->data.resident.value_length) >
+ (u8 *)ctx->mrec + vol->mft_record_size) {
+ ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode.");
+ goto unm_err_out;
+ }
+ si = (STANDARD_INFORMATION*)((u8*)a +
+ le16_to_cpu(a->data.resident.value_offset));
+
+ /* Transfer information from the standard information into vi. */
+ /*
+ * Note: The i_?times do not quite map perfectly onto the NTFS times,
+ * but they are close enough, and in the end it doesn't really matter
+ * that much...
+ */
+ /*
+ * mtime is the last change of the data within the file. Not changed
+ * when only metadata is changed, e.g. a rename doesn't affect mtime.
+ */
+ vi->i_mtime = ntfs2utc(si->last_data_change_time);
+ /*
+ * ctime is the last change of the metadata of the file. This obviously
+ * always changes, when mtime is changed. ctime can be changed on its
+ * own, mtime is then not changed, e.g. when a file is renamed.
+ */
+ vi->i_ctime = ntfs2utc(si->last_mft_change_time);
+ /*
+ * Last access to the data within the file. Not changed during a rename
+ * for example but changed whenever the file is written to.
+ */
+ vi->i_atime = ntfs2utc(si->last_access_time);
+
+ /* Find the attribute list attribute if present. */
+ ntfs_attr_reinit_search_ctx(ctx);
+ err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
+ if (err) {
+ if (unlikely(err != -ENOENT)) {
+ ntfs_error(vi->i_sb, "Failed to lookup attribute list "
+ "attribute.");
+ goto unm_err_out;
+ }
+ } else /* if (!err) */ {
+ if (vi->i_ino == FILE_MFT)
+ goto skip_attr_list_load;
+ ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
+ NInoSetAttrList(ni);
+ a = ctx->attr;
+ if (a->flags & ATTR_COMPRESSION_MASK) {
+ ntfs_error(vi->i_sb, "Attribute list attribute is "
+ "compressed.");
+ goto unm_err_out;
+ }
+ if (a->flags & ATTR_IS_ENCRYPTED ||
+ a->flags & ATTR_IS_SPARSE) {
+ if (a->non_resident) {
+ ntfs_error(vi->i_sb, "Non-resident attribute "
+ "list attribute is encrypted/"
+ "sparse.");
+ goto unm_err_out;
+ }
+ ntfs_warning(vi->i_sb, "Resident attribute list "
+ "attribute in inode 0x%lx is marked "
+ "encrypted/sparse which is not true. "
+ "However, Windows allows this and "
+ "chkdsk does not detect or correct it "
+ "so we will just ignore the invalid "
+ "flags and pretend they are not set.",
+ vi->i_ino);
+ }
+ /* Now allocate memory for the attribute list. */
+ ni->attr_list_size = (u32)ntfs_attr_size(a);
+ ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
+ if (!ni->attr_list) {
+ ntfs_error(vi->i_sb, "Not enough memory to allocate "
+ "buffer for attribute list.");
+ err = -ENOMEM;
+ goto unm_err_out;
+ }
+ if (a->non_resident) {
+ NInoSetAttrListNonResident(ni);
+ if (a->data.non_resident.lowest_vcn) {
+ ntfs_error(vi->i_sb, "Attribute list has non "
+ "zero lowest_vcn.");
+ goto unm_err_out;
+ }
+ /*
+ * Setup the runlist. No need for locking as we have
+ * exclusive access to the inode at this time.
+ */
+ ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
+ a, NULL);
+ if (IS_ERR(ni->attr_list_rl.rl)) {
+ err = PTR_ERR(ni->attr_list_rl.rl);
+ ni->attr_list_rl.rl = NULL;
+ ntfs_error(vi->i_sb, "Mapping pairs "
+ "decompression failed.");
+ goto unm_err_out;
+ }
+ /* Now load the attribute list. */
+ if ((err = load_attribute_list(vol, &ni->attr_list_rl,
+ ni->attr_list, ni->attr_list_size,
+ sle64_to_cpu(a->data.non_resident.
+ initialized_size)))) {
+ ntfs_error(vi->i_sb, "Failed to load "
+ "attribute list attribute.");
+ goto unm_err_out;
+ }
+ } else /* if (!a->non_resident) */ {
+ if ((u8*)a + le16_to_cpu(a->data.resident.value_offset)
+ + le32_to_cpu(
+ a->data.resident.value_length) >
+ (u8*)ctx->mrec + vol->mft_record_size) {
+ ntfs_error(vi->i_sb, "Corrupt attribute list "
+ "in inode.");
+ goto unm_err_out;
+ }
+ /* Now copy the attribute list. */
+ memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
+ a->data.resident.value_offset),
+ le32_to_cpu(
+ a->data.resident.value_length));
+ }
+ }
+skip_attr_list_load:
+ /*
+ * If an attribute list is present we now have the attribute list value
+ * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes.
+ */
+ if (S_ISDIR(vi->i_mode)) {
+ loff_t bvi_size;
+ ntfs_inode *bni;
+ INDEX_ROOT *ir;
+ u8 *ir_end, *index_end;
+
+ /* It is a directory, find index root attribute. */
+ ntfs_attr_reinit_search_ctx(ctx);
+ err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE,
+ 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT) {
+ // FIXME: File is corrupt! Hot-fix with empty
+ // index root attribute if recovery option is
+ // set.
+ ntfs_error(vi->i_sb, "$INDEX_ROOT attribute "
+ "is missing.");
+ }
+ goto unm_err_out;
+ }
+ a = ctx->attr;
+ /* Set up the state. */
+ if (unlikely(a->non_resident)) {
+ ntfs_error(vol->sb, "$INDEX_ROOT attribute is not "
+ "resident.");
+ goto unm_err_out;
+ }
+ /* Ensure the attribute name is placed before the value. */
+ if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+ le16_to_cpu(a->data.resident.value_offset)))) {
+ ntfs_error(vol->sb, "$INDEX_ROOT attribute name is "
+ "placed after the attribute value.");
+ goto unm_err_out;
+ }
+ /*
+ * Compressed/encrypted index root just means that the newly
+ * created files in that directory should be created compressed/
+ * encrypted. However index root cannot be both compressed and
+ * encrypted.
+ */
+ if (a->flags & ATTR_COMPRESSION_MASK)
+ NInoSetCompressed(ni);
+ if (a->flags & ATTR_IS_ENCRYPTED) {
+ if (a->flags & ATTR_COMPRESSION_MASK) {
+ ntfs_error(vi->i_sb, "Found encrypted and "
+ "compressed attribute.");
+ goto unm_err_out;
+ }
+ NInoSetEncrypted(ni);
+ }
+ if (a->flags & ATTR_IS_SPARSE)
+ NInoSetSparse(ni);
+ ir = (INDEX_ROOT*)((u8*)a +
+ le16_to_cpu(a->data.resident.value_offset));
+ ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
+ if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
+ ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
+ "corrupt.");
+ goto unm_err_out;
+ }
+ index_end = (u8*)&ir->index +
+ le32_to_cpu(ir->index.index_length);
+ if (index_end > ir_end) {
+ ntfs_error(vi->i_sb, "Directory index is corrupt.");
+ goto unm_err_out;
+ }
+ if (ir->type != AT_FILE_NAME) {
+ ntfs_error(vi->i_sb, "Indexed attribute is not "
+ "$FILE_NAME.");
+ goto unm_err_out;
+ }
+ if (ir->collation_rule != COLLATION_FILE_NAME) {
+ ntfs_error(vi->i_sb, "Index collation rule is not "
+ "COLLATION_FILE_NAME.");
+ goto unm_err_out;
+ }
+ ni->itype.index.collation_rule = ir->collation_rule;
+ ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
+ if (ni->itype.index.block_size &
+ (ni->itype.index.block_size - 1)) {
+ ntfs_error(vi->i_sb, "Index block size (%u) is not a "
+ "power of two.",
+ ni->itype.index.block_size);
+ goto unm_err_out;
+ }
+ if (ni->itype.index.block_size > PAGE_SIZE) {
+ ntfs_error(vi->i_sb, "Index block size (%u) > "
+ "PAGE_SIZE (%ld) is not "
+ "supported. Sorry.",
+ ni->itype.index.block_size,
+ PAGE_SIZE);
+ err = -EOPNOTSUPP;
+ goto unm_err_out;
+ }
+ if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
+ ntfs_error(vi->i_sb, "Index block size (%u) < "
+ "NTFS_BLOCK_SIZE (%i) is not "
+ "supported. Sorry.",
+ ni->itype.index.block_size,
+ NTFS_BLOCK_SIZE);
+ err = -EOPNOTSUPP;
+ goto unm_err_out;
+ }
+ ni->itype.index.block_size_bits =
+ ffs(ni->itype.index.block_size) - 1;
+ /* Determine the size of a vcn in the directory index. */
+ if (vol->cluster_size <= ni->itype.index.block_size) {
+ ni->itype.index.vcn_size = vol->cluster_size;
+ ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
+ } else {
+ ni->itype.index.vcn_size = vol->sector_size;
+ ni->itype.index.vcn_size_bits = vol->sector_size_bits;
+ }
+
+ /* Setup the index allocation attribute, even if not present. */
+ NInoSetMstProtected(ni);
+ ni->type = AT_INDEX_ALLOCATION;
+ ni->name = I30;
+ ni->name_len = 4;
+
+ if (!(ir->index.flags & LARGE_INDEX)) {
+ /* No index allocation. */
+ vi->i_size = ni->initialized_size =
+ ni->allocated_size = 0;
+ /* We are done with the mft record, so we release it. */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(ni);
+ m = NULL;
+ ctx = NULL;
+ goto skip_large_dir_stuff;
+ } /* LARGE_INDEX: Index allocation present. Setup state. */
+ NInoSetIndexAllocPresent(ni);
+ /* Find index allocation attribute. */
+ ntfs_attr_reinit_search_ctx(ctx);
+ err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ ntfs_error(vi->i_sb, "$INDEX_ALLOCATION "
+ "attribute is not present but "
+ "$INDEX_ROOT indicated it is.");
+ else
+ ntfs_error(vi->i_sb, "Failed to lookup "
+ "$INDEX_ALLOCATION "
+ "attribute.");
+ goto unm_err_out;
+ }
+ a = ctx->attr;
+ if (!a->non_resident) {
+ ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+ "is resident.");
+ goto unm_err_out;
+ }
+ /*
+ * Ensure the attribute name is placed before the mapping pairs
+ * array.
+ */
+ if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+ le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset)))) {
+ ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name "
+ "is placed after the mapping pairs "
+ "array.");
+ goto unm_err_out;
+ }
+ if (a->flags & ATTR_IS_ENCRYPTED) {
+ ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+ "is encrypted.");
+ goto unm_err_out;
+ }
+ if (a->flags & ATTR_IS_SPARSE) {
+ ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+ "is sparse.");
+ goto unm_err_out;
+ }
+ if (a->flags & ATTR_COMPRESSION_MASK) {
+ ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
+ "is compressed.");
+ goto unm_err_out;
+ }
+ if (a->data.non_resident.lowest_vcn) {
+ ntfs_error(vi->i_sb, "First extent of "
+ "$INDEX_ALLOCATION attribute has non "
+ "zero lowest_vcn.");
+ goto unm_err_out;
+ }
+ vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
+ ni->initialized_size = sle64_to_cpu(
+ a->data.non_resident.initialized_size);
+ ni->allocated_size = sle64_to_cpu(
+ a->data.non_resident.allocated_size);
+ /*
+ * We are done with the mft record, so we release it. Otherwise
+ * we would deadlock in ntfs_attr_iget().
+ */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(ni);
+ m = NULL;
+ ctx = NULL;
+ /* Get the index bitmap attribute inode. */
+ bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4);
+ if (IS_ERR(bvi)) {
+ ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
+ err = PTR_ERR(bvi);
+ goto unm_err_out;
+ }
+ bni = NTFS_I(bvi);
+ if (NInoCompressed(bni) || NInoEncrypted(bni) ||
+ NInoSparse(bni)) {
+ ntfs_error(vi->i_sb, "$BITMAP attribute is compressed "
+ "and/or encrypted and/or sparse.");
+ goto iput_unm_err_out;
+ }
+ /* Consistency check bitmap size vs. index allocation size. */
+ bvi_size = i_size_read(bvi);
+ if ((bvi_size << 3) < (vi->i_size >>
+ ni->itype.index.block_size_bits)) {
+ ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) "
+ "for index allocation (0x%llx).",
+ bvi_size << 3, vi->i_size);
+ goto iput_unm_err_out;
+ }
+ /* No longer need the bitmap attribute inode. */
+ iput(bvi);
+skip_large_dir_stuff:
+ /* Setup the operations for this inode. */
+ vi->i_op = &ntfs_dir_inode_ops;
+ vi->i_fop = &ntfs_dir_ops;
+ vi->i_mapping->a_ops = &ntfs_mst_aops;
+ } else {
+ /* It is a file. */
+ ntfs_attr_reinit_search_ctx(ctx);
+
+ /* Setup the data attribute, even if not present. */
+ ni->type = AT_DATA;
+ ni->name = NULL;
+ ni->name_len = 0;
+
+ /* Find first extent of the unnamed data attribute. */
+ err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ vi->i_size = ni->initialized_size =
+ ni->allocated_size = 0;
+ if (err != -ENOENT) {
+ ntfs_error(vi->i_sb, "Failed to lookup $DATA "
+ "attribute.");
+ goto unm_err_out;
+ }
+ /*
+ * FILE_Secure does not have an unnamed $DATA
+ * attribute, so we special case it here.
+ */
+ if (vi->i_ino == FILE_Secure)
+ goto no_data_attr_special_case;
+ /*
+ * Most if not all the system files in the $Extend
+ * system directory do not have unnamed data
+ * attributes so we need to check if the parent
+ * directory of the file is FILE_Extend and if it is
+ * ignore this error. To do this we need to get the
+ * name of this inode from the mft record as the name
+ * contains the back reference to the parent directory.
+ */
+ if (ntfs_is_extended_system_file(ctx) > 0)
+ goto no_data_attr_special_case;
+ // FIXME: File is corrupt! Hot-fix with empty data
+ // attribute if recovery option is set.
+ ntfs_error(vi->i_sb, "$DATA attribute is missing.");
+ goto unm_err_out;
+ }
+ a = ctx->attr;
+ /* Setup the state. */
+ if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+ if (a->flags & ATTR_COMPRESSION_MASK) {
+ NInoSetCompressed(ni);
+ if (vol->cluster_size > 4096) {
+ ntfs_error(vi->i_sb, "Found "
+ "compressed data but "
+ "compression is "
+ "disabled due to "
+ "cluster size (%i) > "
+ "4kiB.",
+ vol->cluster_size);
+ goto unm_err_out;
+ }
+ if ((a->flags & ATTR_COMPRESSION_MASK)
+ != ATTR_IS_COMPRESSED) {
+ ntfs_error(vi->i_sb, "Found unknown "
+ "compression method "
+ "or corrupt file.");
+ goto unm_err_out;
+ }
+ }
+ if (a->flags & ATTR_IS_SPARSE)
+ NInoSetSparse(ni);
+ }
+ if (a->flags & ATTR_IS_ENCRYPTED) {
+ if (NInoCompressed(ni)) {
+ ntfs_error(vi->i_sb, "Found encrypted and "
+ "compressed data.");
+ goto unm_err_out;
+ }
+ NInoSetEncrypted(ni);
+ }
+ if (a->non_resident) {
+ NInoSetNonResident(ni);
+ if (NInoCompressed(ni) || NInoSparse(ni)) {
+ if (NInoCompressed(ni) && a->data.non_resident.
+ compression_unit != 4) {
+ ntfs_error(vi->i_sb, "Found "
+ "non-standard "
+ "compression unit (%u "
+ "instead of 4). "
+ "Cannot handle this.",
+ a->data.non_resident.
+ compression_unit);
+ err = -EOPNOTSUPP;
+ goto unm_err_out;
+ }
+ if (a->data.non_resident.compression_unit) {
+ ni->itype.compressed.block_size = 1U <<
+ (a->data.non_resident.
+ compression_unit +
+ vol->cluster_size_bits);
+ ni->itype.compressed.block_size_bits =
+ ffs(ni->itype.
+ compressed.
+ block_size) - 1;
+ ni->itype.compressed.block_clusters =
+ 1U << a->data.
+ non_resident.
+ compression_unit;
+ } else {
+ ni->itype.compressed.block_size = 0;
+ ni->itype.compressed.block_size_bits =
+ 0;
+ ni->itype.compressed.block_clusters =
+ 0;
+ }
+ ni->itype.compressed.size = sle64_to_cpu(
+ a->data.non_resident.
+ compressed_size);
+ }
+ if (a->data.non_resident.lowest_vcn) {
+ ntfs_error(vi->i_sb, "First extent of $DATA "
+ "attribute has non zero "
+ "lowest_vcn.");
+ goto unm_err_out;
+ }
+ vi->i_size = sle64_to_cpu(
+ a->data.non_resident.data_size);
+ ni->initialized_size = sle64_to_cpu(
+ a->data.non_resident.initialized_size);
+ ni->allocated_size = sle64_to_cpu(
+ a->data.non_resident.allocated_size);
+ } else { /* Resident attribute. */
+ vi->i_size = ni->initialized_size = le32_to_cpu(
+ a->data.resident.value_length);
+ ni->allocated_size = le32_to_cpu(a->length) -
+ le16_to_cpu(
+ a->data.resident.value_offset);
+ if (vi->i_size > ni->allocated_size) {
+ ntfs_error(vi->i_sb, "Resident data attribute "
+ "is corrupt (size exceeds "
+ "allocation).");
+ goto unm_err_out;
+ }
+ }
+no_data_attr_special_case:
+ /* We are done with the mft record, so we release it. */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(ni);
+ m = NULL;
+ ctx = NULL;
+ /* Setup the operations for this inode. */
+ vi->i_op = &ntfs_file_inode_ops;
+ vi->i_fop = &ntfs_file_ops;
+ vi->i_mapping->a_ops = &ntfs_normal_aops;
+ if (NInoMstProtected(ni))
+ vi->i_mapping->a_ops = &ntfs_mst_aops;
+ else if (NInoCompressed(ni))
+ vi->i_mapping->a_ops = &ntfs_compressed_aops;
+ }
+ /*
+ * The number of 512-byte blocks used on disk (for stat). This is in so
+ * far inaccurate as it doesn't account for any named streams or other
+ * special non-resident attributes, but that is how Windows works, too,
+ * so we are at least consistent with Windows, if not entirely
+ * consistent with the Linux Way. Doing it the Linux Way would cause a
+ * significant slowdown as it would involve iterating over all
+ * attributes in the mft record and adding the allocated/compressed
+ * sizes of all non-resident attributes present to give us the Linux
+ * correct size that should go into i_blocks (after division by 512).
+ */
+ if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni)))
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ else
+ vi->i_blocks = ni->allocated_size >> 9;
+ ntfs_debug("Done.");
+ return 0;
+iput_unm_err_out:
+ iput(bvi);
+unm_err_out:
+ if (!err)
+ err = -EIO;
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(ni);
+err_out:
+ ntfs_error(vol->sb, "Failed with error code %i. Marking corrupt "
+ "inode 0x%lx as bad. Run chkdsk.", err, vi->i_ino);
+ make_bad_inode(vi);
+ if (err != -EOPNOTSUPP && err != -ENOMEM)
+ NVolSetErrors(vol);
+ return err;
+}
+
+/**
+ * ntfs_read_locked_attr_inode - read an attribute inode from its base inode
+ * @base_vi: base inode
+ * @vi: attribute inode to read
+ *
+ * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the
+ * attribute inode described by @vi into memory from the base mft record
+ * described by @base_ni.
+ *
+ * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for
+ * reading and looks up the attribute described by @vi before setting up the
+ * necessary fields in @vi as well as initializing the ntfs inode.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_NEW set, hence the inode is locked, also
+ * i_count is set to 1, so it is not going to go away
+ *
+ * Return 0 on success and -errno on error. In the error case, the inode will
+ * have had make_bad_inode() executed on it.
+ *
+ * Note this cannot be called for AT_INDEX_ALLOCATION.
+ */
+static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
+{
+ ntfs_volume *vol = NTFS_SB(vi->i_sb);
+ ntfs_inode *ni, *base_ni;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ ntfs_attr_search_ctx *ctx;
+ int err = 0;
+
+ ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
+
+ ntfs_init_big_inode(vi);
+
+ ni = NTFS_I(vi);
+ base_ni = NTFS_I(base_vi);
+
+ /* Just mirror the values from the base inode. */
+ vi->i_uid = base_vi->i_uid;
+ vi->i_gid = base_vi->i_gid;
+ set_nlink(vi, base_vi->i_nlink);
+ vi->i_mtime = base_vi->i_mtime;
+ vi->i_ctime = base_vi->i_ctime;
+ vi->i_atime = base_vi->i_atime;
+ vi->i_generation = ni->seq_no = base_ni->seq_no;
+
+ /* Set inode type to zero but preserve permissions. */
+ vi->i_mode = base_vi->i_mode & ~S_IFMT;
+
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto unm_err_out;
+ }
+ /* Find the attribute. */
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err))
+ goto unm_err_out;
+ a = ctx->attr;
+ if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
+ if (a->flags & ATTR_COMPRESSION_MASK) {
+ NInoSetCompressed(ni);
+ if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
+ ni->name_len)) {
+ ntfs_error(vi->i_sb, "Found compressed "
+ "non-data or named data "
+ "attribute. Please report "
+ "you saw this message to "
+ "linux-ntfs-dev@lists."
+ "sourceforge.net");
+ goto unm_err_out;
+ }
+ if (vol->cluster_size > 4096) {
+ ntfs_error(vi->i_sb, "Found compressed "
+ "attribute but compression is "
+ "disabled due to cluster size "
+ "(%i) > 4kiB.",
+ vol->cluster_size);
+ goto unm_err_out;
+ }
+ if ((a->flags & ATTR_COMPRESSION_MASK) !=
+ ATTR_IS_COMPRESSED) {
+ ntfs_error(vi->i_sb, "Found unknown "
+ "compression method.");
+ goto unm_err_out;
+ }
+ }
+ /*
+ * The compressed/sparse flag set in an index root just means
+ * to compress all files.
+ */
+ if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+ ntfs_error(vi->i_sb, "Found mst protected attribute "
+ "but the attribute is %s. Please "
+ "report you saw this message to "
+ "linux-ntfs-dev@lists.sourceforge.net",
+ NInoCompressed(ni) ? "compressed" :
+ "sparse");
+ goto unm_err_out;
+ }
+ if (a->flags & ATTR_IS_SPARSE)
+ NInoSetSparse(ni);
+ }
+ if (a->flags & ATTR_IS_ENCRYPTED) {
+ if (NInoCompressed(ni)) {
+ ntfs_error(vi->i_sb, "Found encrypted and compressed "
+ "data.");
+ goto unm_err_out;
+ }
+ /*
+ * The encryption flag set in an index root just means to
+ * encrypt all files.
+ */
+ if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
+ ntfs_error(vi->i_sb, "Found mst protected attribute "
+ "but the attribute is encrypted. "
+ "Please report you saw this message "
+ "to linux-ntfs-dev@lists.sourceforge."
+ "net");
+ goto unm_err_out;
+ }
+ if (ni->type != AT_DATA) {
+ ntfs_error(vi->i_sb, "Found encrypted non-data "
+ "attribute.");
+ goto unm_err_out;
+ }
+ NInoSetEncrypted(ni);
+ }
+ if (!a->non_resident) {
+ /* Ensure the attribute name is placed before the value. */
+ if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+ le16_to_cpu(a->data.resident.value_offset)))) {
+ ntfs_error(vol->sb, "Attribute name is placed after "
+ "the attribute value.");
+ goto unm_err_out;
+ }
+ if (NInoMstProtected(ni)) {
+ ntfs_error(vi->i_sb, "Found mst protected attribute "
+ "but the attribute is resident. "
+ "Please report you saw this message to "
+ "linux-ntfs-dev@lists.sourceforge.net");
+ goto unm_err_out;
+ }
+ vi->i_size = ni->initialized_size = le32_to_cpu(
+ a->data.resident.value_length);
+ ni->allocated_size = le32_to_cpu(a->length) -
+ le16_to_cpu(a->data.resident.value_offset);
+ if (vi->i_size > ni->allocated_size) {
+ ntfs_error(vi->i_sb, "Resident attribute is corrupt "
+ "(size exceeds allocation).");
+ goto unm_err_out;
+ }
+ } else {
+ NInoSetNonResident(ni);
+ /*
+ * Ensure the attribute name is placed before the mapping pairs
+ * array.
+ */
+ if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+ le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset)))) {
+ ntfs_error(vol->sb, "Attribute name is placed after "
+ "the mapping pairs array.");
+ goto unm_err_out;
+ }
+ if (NInoCompressed(ni) || NInoSparse(ni)) {
+ if (NInoCompressed(ni) && a->data.non_resident.
+ compression_unit != 4) {
+ ntfs_error(vi->i_sb, "Found non-standard "
+ "compression unit (%u instead "
+ "of 4). Cannot handle this.",
+ a->data.non_resident.
+ compression_unit);
+ err = -EOPNOTSUPP;
+ goto unm_err_out;
+ }
+ if (a->data.non_resident.compression_unit) {
+ ni->itype.compressed.block_size = 1U <<
+ (a->data.non_resident.
+ compression_unit +
+ vol->cluster_size_bits);
+ ni->itype.compressed.block_size_bits =
+ ffs(ni->itype.compressed.
+ block_size) - 1;
+ ni->itype.compressed.block_clusters = 1U <<
+ a->data.non_resident.
+ compression_unit;
+ } else {
+ ni->itype.compressed.block_size = 0;
+ ni->itype.compressed.block_size_bits = 0;
+ ni->itype.compressed.block_clusters = 0;
+ }
+ ni->itype.compressed.size = sle64_to_cpu(
+ a->data.non_resident.compressed_size);
+ }
+ if (a->data.non_resident.lowest_vcn) {
+ ntfs_error(vi->i_sb, "First extent of attribute has "
+ "non-zero lowest_vcn.");
+ goto unm_err_out;
+ }
+ vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
+ ni->initialized_size = sle64_to_cpu(
+ a->data.non_resident.initialized_size);
+ ni->allocated_size = sle64_to_cpu(
+ a->data.non_resident.allocated_size);
+ }
+ vi->i_mapping->a_ops = &ntfs_normal_aops;
+ if (NInoMstProtected(ni))
+ vi->i_mapping->a_ops = &ntfs_mst_aops;
+ else if (NInoCompressed(ni))
+ vi->i_mapping->a_ops = &ntfs_compressed_aops;
+ if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ else
+ vi->i_blocks = ni->allocated_size >> 9;
+ /*
+ * Make sure the base inode does not go away and attach it to the
+ * attribute inode.
+ */
+ igrab(base_vi);
+ ni->ext.base_ntfs_ino = base_ni;
+ ni->nr_extents = -1;
+
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+
+ ntfs_debug("Done.");
+ return 0;
+
+unm_err_out:
+ if (!err)
+ err = -EIO;
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+err_out:
+ ntfs_error(vol->sb, "Failed with error code %i while reading attribute "
+ "inode (mft_no 0x%lx, type 0x%x, name_len %i). "
+ "Marking corrupt inode and base inode 0x%lx as bad. "
+ "Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len,
+ base_vi->i_ino);
+ make_bad_inode(vi);
+ if (err != -ENOMEM)
+ NVolSetErrors(vol);
+ return err;
+}
+
+/**
+ * ntfs_read_locked_index_inode - read an index inode from its base inode
+ * @base_vi: base inode
+ * @vi: index inode to read
+ *
+ * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the
+ * index inode described by @vi into memory from the base mft record described
+ * by @base_ni.
+ *
+ * ntfs_read_locked_index_inode() maps, pins and locks the base inode for
+ * reading and looks up the attributes relating to the index described by @vi
+ * before setting up the necessary fields in @vi as well as initializing the
+ * ntfs inode.
+ *
+ * Note, index inodes are essentially attribute inodes (NInoAttr() is true)
+ * with the attribute type set to AT_INDEX_ALLOCATION. Apart from that, they
+ * are setup like directory inodes since directories are a special case of
+ * indices ao they need to be treated in much the same way. Most importantly,
+ * for small indices the index allocation attribute might not actually exist.
+ * However, the index root attribute always exists but this does not need to
+ * have an inode associated with it and this is why we define a new inode type
+ * index. Also, like for directories, we need to have an attribute inode for
+ * the bitmap attribute corresponding to the index allocation attribute and we
+ * can store this in the appropriate field of the inode, just like we do for
+ * normal directory inodes.
+ *
+ * Q: What locks are held when the function is called?
+ * A: i_state has I_NEW set, hence the inode is locked, also
+ * i_count is set to 1, so it is not going to go away
+ *
+ * Return 0 on success and -errno on error. In the error case, the inode will
+ * have had make_bad_inode() executed on it.
+ */
+static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
+{
+ loff_t bvi_size;
+ ntfs_volume *vol = NTFS_SB(vi->i_sb);
+ ntfs_inode *ni, *base_ni, *bni;
+ struct inode *bvi;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ ntfs_attr_search_ctx *ctx;
+ INDEX_ROOT *ir;
+ u8 *ir_end, *index_end;
+ int err = 0;
+
+ ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
+ ntfs_init_big_inode(vi);
+ ni = NTFS_I(vi);
+ base_ni = NTFS_I(base_vi);
+ /* Just mirror the values from the base inode. */
+ vi->i_uid = base_vi->i_uid;
+ vi->i_gid = base_vi->i_gid;
+ set_nlink(vi, base_vi->i_nlink);
+ vi->i_mtime = base_vi->i_mtime;
+ vi->i_ctime = base_vi->i_ctime;
+ vi->i_atime = base_vi->i_atime;
+ vi->i_generation = ni->seq_no = base_ni->seq_no;
+ /* Set inode type to zero but preserve permissions. */
+ vi->i_mode = base_vi->i_mode & ~S_IFMT;
+ /* Map the mft record for the base inode. */
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto unm_err_out;
+ }
+ /* Find the index root attribute. */
+ err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
+ "missing.");
+ goto unm_err_out;
+ }
+ a = ctx->attr;
+ /* Set up the state. */
+ if (unlikely(a->non_resident)) {
+ ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident.");
+ goto unm_err_out;
+ }
+ /* Ensure the attribute name is placed before the value. */
+ if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+ le16_to_cpu(a->data.resident.value_offset)))) {
+ ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed "
+ "after the attribute value.");
+ goto unm_err_out;
+ }
+ /*
+ * Compressed/encrypted/sparse index root is not allowed, except for
+ * directories of course but those are not dealt with here.
+ */
+ if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
+ ATTR_IS_SPARSE)) {
+ ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
+ "root attribute.");
+ goto unm_err_out;
+ }
+ ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset));
+ ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
+ if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
+ ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt.");
+ goto unm_err_out;
+ }
+ index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
+ if (index_end > ir_end) {
+ ntfs_error(vi->i_sb, "Index is corrupt.");
+ goto unm_err_out;
+ }
+ if (ir->type) {
+ ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).",
+ le32_to_cpu(ir->type));
+ goto unm_err_out;
+ }
+ ni->itype.index.collation_rule = ir->collation_rule;
+ ntfs_debug("Index collation rule is 0x%x.",
+ le32_to_cpu(ir->collation_rule));
+ ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
+ if (!is_power_of_2(ni->itype.index.block_size)) {
+ ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
+ "two.", ni->itype.index.block_size);
+ goto unm_err_out;
+ }
+ if (ni->itype.index.block_size > PAGE_SIZE) {
+ ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE "
+ "(%ld) is not supported. Sorry.",
+ ni->itype.index.block_size, PAGE_SIZE);
+ err = -EOPNOTSUPP;
+ goto unm_err_out;
+ }
+ if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
+ ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE "
+ "(%i) is not supported. Sorry.",
+ ni->itype.index.block_size, NTFS_BLOCK_SIZE);
+ err = -EOPNOTSUPP;
+ goto unm_err_out;
+ }
+ ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1;
+ /* Determine the size of a vcn in the index. */
+ if (vol->cluster_size <= ni->itype.index.block_size) {
+ ni->itype.index.vcn_size = vol->cluster_size;
+ ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
+ } else {
+ ni->itype.index.vcn_size = vol->sector_size;
+ ni->itype.index.vcn_size_bits = vol->sector_size_bits;
+ }
+ /* Check for presence of index allocation attribute. */
+ if (!(ir->index.flags & LARGE_INDEX)) {
+ /* No index allocation. */
+ vi->i_size = ni->initialized_size = ni->allocated_size = 0;
+ /* We are done with the mft record, so we release it. */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ m = NULL;
+ ctx = NULL;
+ goto skip_large_index_stuff;
+ } /* LARGE_INDEX: Index allocation present. Setup state. */
+ NInoSetIndexAllocPresent(ni);
+ /* Find index allocation attribute. */
+ ntfs_attr_reinit_search_ctx(ctx);
+ err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT)
+ ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+ "not present but $INDEX_ROOT "
+ "indicated it is.");
+ else
+ ntfs_error(vi->i_sb, "Failed to lookup "
+ "$INDEX_ALLOCATION attribute.");
+ goto unm_err_out;
+ }
+ a = ctx->attr;
+ if (!a->non_resident) {
+ ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+ "resident.");
+ goto unm_err_out;
+ }
+ /*
+ * Ensure the attribute name is placed before the mapping pairs array.
+ */
+ if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
+ le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset)))) {
+ ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is "
+ "placed after the mapping pairs array.");
+ goto unm_err_out;
+ }
+ if (a->flags & ATTR_IS_ENCRYPTED) {
+ ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+ "encrypted.");
+ goto unm_err_out;
+ }
+ if (a->flags & ATTR_IS_SPARSE) {
+ ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse.");
+ goto unm_err_out;
+ }
+ if (a->flags & ATTR_COMPRESSION_MASK) {
+ ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
+ "compressed.");
+ goto unm_err_out;
+ }
+ if (a->data.non_resident.lowest_vcn) {
+ ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION "
+ "attribute has non zero lowest_vcn.");
+ goto unm_err_out;
+ }
+ vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
+ ni->initialized_size = sle64_to_cpu(
+ a->data.non_resident.initialized_size);
+ ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size);
+ /*
+ * We are done with the mft record, so we release it. Otherwise
+ * we would deadlock in ntfs_attr_iget().
+ */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ m = NULL;
+ ctx = NULL;
+ /* Get the index bitmap attribute inode. */
+ bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len);
+ if (IS_ERR(bvi)) {
+ ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
+ err = PTR_ERR(bvi);
+ goto unm_err_out;
+ }
+ bni = NTFS_I(bvi);
+ if (NInoCompressed(bni) || NInoEncrypted(bni) ||
+ NInoSparse(bni)) {
+ ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or "
+ "encrypted and/or sparse.");
+ goto iput_unm_err_out;
+ }
+ /* Consistency check bitmap size vs. index allocation size. */
+ bvi_size = i_size_read(bvi);
+ if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) {
+ ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
+ "index allocation (0x%llx).", bvi_size << 3,
+ vi->i_size);
+ goto iput_unm_err_out;
+ }
+ iput(bvi);
+skip_large_index_stuff:
+ /* Setup the operations for this index inode. */
+ vi->i_mapping->a_ops = &ntfs_mst_aops;
+ vi->i_blocks = ni->allocated_size >> 9;
+ /*
+ * Make sure the base inode doesn't go away and attach it to the
+ * index inode.
+ */
+ igrab(base_vi);
+ ni->ext.base_ntfs_ino = base_ni;
+ ni->nr_extents = -1;
+
+ ntfs_debug("Done.");
+ return 0;
+iput_unm_err_out:
+ iput(bvi);
+unm_err_out:
+ if (!err)
+ err = -EIO;
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+err_out:
+ ntfs_error(vi->i_sb, "Failed with error code %i while reading index "
+ "inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino,
+ ni->name_len);
+ make_bad_inode(vi);
+ if (err != -EOPNOTSUPP && err != -ENOMEM)
+ NVolSetErrors(vol);
+ return err;
+}
+
+/*
+ * The MFT inode has special locking, so teach the lock validator
+ * about this by splitting off the locking rules of the MFT from
+ * the locking rules of other inodes. The MFT inode can never be
+ * accessed from the VFS side (or even internally), only by the
+ * map_mft functions.
+ */
+static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key;
+
+/**
+ * ntfs_read_inode_mount - special read_inode for mount time use only
+ * @vi: inode to read
+ *
+ * Read inode FILE_MFT at mount time, only called with super_block lock
+ * held from within the read_super() code path.
+ *
+ * This function exists because when it is called the page cache for $MFT/$DATA
+ * is not initialized and hence we cannot get at the contents of mft records
+ * by calling map_mft_record*().
+ *
+ * Further it needs to cope with the circular references problem, i.e. cannot
+ * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because
+ * we do not know where the other extent mft records are yet and again, because
+ * we cannot call map_mft_record*() yet. Obviously this applies only when an
+ * attribute list is actually present in $MFT inode.
+ *
+ * We solve these problems by starting with the $DATA attribute before anything
+ * else and iterating using ntfs_attr_lookup($DATA) over all extents. As each
+ * extent is found, we ntfs_mapping_pairs_decompress() including the implied
+ * ntfs_runlists_merge(). Each step of the iteration necessarily provides
+ * sufficient information for the next step to complete.
+ *
+ * This should work but there are two possible pit falls (see inline comments
+ * below), but only time will tell if they are real pits or just smoke...
+ */
+int ntfs_read_inode_mount(struct inode *vi)
+{
+ VCN next_vcn, last_vcn, highest_vcn;
+ s64 block;
+ struct super_block *sb = vi->i_sb;
+ ntfs_volume *vol = NTFS_SB(sb);
+ struct buffer_head *bh;
+ ntfs_inode *ni;
+ MFT_RECORD *m = NULL;
+ ATTR_RECORD *a;
+ ntfs_attr_search_ctx *ctx;
+ unsigned int i, nr_blocks;
+ int err;
+
+ ntfs_debug("Entering.");
+
+ /* Initialize the ntfs specific part of @vi. */
+ ntfs_init_big_inode(vi);
+
+ ni = NTFS_I(vi);
+
+ /* Setup the data attribute. It is special as it is mst protected. */
+ NInoSetNonResident(ni);
+ NInoSetMstProtected(ni);
+ NInoSetSparseDisabled(ni);
+ ni->type = AT_DATA;
+ ni->name = NULL;
+ ni->name_len = 0;
+ /*
+ * This sets up our little cheat allowing us to reuse the async read io
+ * completion handler for directories.
+ */
+ ni->itype.index.block_size = vol->mft_record_size;
+ ni->itype.index.block_size_bits = vol->mft_record_size_bits;
+
+ /* Very important! Needed to be able to call map_mft_record*(). */
+ vol->mft_ino = vi;
+
+ /* Allocate enough memory to read the first mft record. */
+ if (vol->mft_record_size > 64 * 1024) {
+ ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).",
+ vol->mft_record_size);
+ goto err_out;
+ }
+ i = vol->mft_record_size;
+ if (i < sb->s_blocksize)
+ i = sb->s_blocksize;
+ m = (MFT_RECORD*)ntfs_malloc_nofs(i);
+ if (!m) {
+ ntfs_error(sb, "Failed to allocate buffer for $MFT record 0.");
+ goto err_out;
+ }
+
+ /* Determine the first block of the $MFT/$DATA attribute. */
+ block = vol->mft_lcn << vol->cluster_size_bits >>
+ sb->s_blocksize_bits;
+ nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits;
+ if (!nr_blocks)
+ nr_blocks = 1;
+
+ /* Load $MFT/$DATA's first mft record. */
+ for (i = 0; i < nr_blocks; i++) {
+ bh = sb_bread(sb, block++);
+ if (!bh) {
+ ntfs_error(sb, "Device read failed.");
+ goto err_out;
+ }
+ memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data,
+ sb->s_blocksize);
+ brelse(bh);
+ }
+
+ if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) {
+ ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.",
+ le32_to_cpu(m->bytes_allocated), vol->mft_record_size);
+ goto err_out;
+ }
+
+ /* Apply the mst fixups. */
+ if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
+ /* FIXME: Try to use the $MFTMirr now. */
+ ntfs_error(sb, "MST fixup failed. $MFT is corrupt.");
+ goto err_out;
+ }
+
+ /* Sanity check offset to the first attribute */
+ if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) {
+ ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.",
+ le16_to_cpu(m->attrs_offset));
+ goto err_out;
+ }
+
+ /* Need this to sanity check attribute list references to $MFT. */
+ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
+
+ /* Provides read_folio() for map_mft_record(). */
+ vi->i_mapping->a_ops = &ntfs_mst_aops;
+
+ ctx = ntfs_attr_get_search_ctx(ni, m);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ /* Find the attribute list attribute if present. */
+ err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
+ if (err) {
+ if (unlikely(err != -ENOENT)) {
+ ntfs_error(sb, "Failed to lookup attribute list "
+ "attribute. You should run chkdsk.");
+ goto put_err_out;
+ }
+ } else /* if (!err) */ {
+ ATTR_LIST_ENTRY *al_entry, *next_al_entry;
+ u8 *al_end;
+ static const char *es = " Not allowed. $MFT is corrupt. "
+ "You should run chkdsk.";
+
+ ntfs_debug("Attribute list attribute found in $MFT.");
+ NInoSetAttrList(ni);
+ a = ctx->attr;
+ if (a->flags & ATTR_COMPRESSION_MASK) {
+ ntfs_error(sb, "Attribute list attribute is "
+ "compressed.%s", es);
+ goto put_err_out;
+ }
+ if (a->flags & ATTR_IS_ENCRYPTED ||
+ a->flags & ATTR_IS_SPARSE) {
+ if (a->non_resident) {
+ ntfs_error(sb, "Non-resident attribute list "
+ "attribute is encrypted/"
+ "sparse.%s", es);
+ goto put_err_out;
+ }
+ ntfs_warning(sb, "Resident attribute list attribute "
+ "in $MFT system file is marked "
+ "encrypted/sparse which is not true. "
+ "However, Windows allows this and "
+ "chkdsk does not detect or correct it "
+ "so we will just ignore the invalid "
+ "flags and pretend they are not set.");
+ }
+ /* Now allocate memory for the attribute list. */
+ ni->attr_list_size = (u32)ntfs_attr_size(a);
+ if (!ni->attr_list_size) {
+ ntfs_error(sb, "Attr_list_size is zero");
+ goto put_err_out;
+ }
+ ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
+ if (!ni->attr_list) {
+ ntfs_error(sb, "Not enough memory to allocate buffer "
+ "for attribute list.");
+ goto put_err_out;
+ }
+ if (a->non_resident) {
+ NInoSetAttrListNonResident(ni);
+ if (a->data.non_resident.lowest_vcn) {
+ ntfs_error(sb, "Attribute list has non zero "
+ "lowest_vcn. $MFT is corrupt. "
+ "You should run chkdsk.");
+ goto put_err_out;
+ }
+ /* Setup the runlist. */
+ ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
+ a, NULL);
+ if (IS_ERR(ni->attr_list_rl.rl)) {
+ err = PTR_ERR(ni->attr_list_rl.rl);
+ ni->attr_list_rl.rl = NULL;
+ ntfs_error(sb, "Mapping pairs decompression "
+ "failed with error code %i.",
+ -err);
+ goto put_err_out;
+ }
+ /* Now load the attribute list. */
+ if ((err = load_attribute_list(vol, &ni->attr_list_rl,
+ ni->attr_list, ni->attr_list_size,
+ sle64_to_cpu(a->data.
+ non_resident.initialized_size)))) {
+ ntfs_error(sb, "Failed to load attribute list "
+ "attribute with error code %i.",
+ -err);
+ goto put_err_out;
+ }
+ } else /* if (!ctx.attr->non_resident) */ {
+ if ((u8*)a + le16_to_cpu(
+ a->data.resident.value_offset) +
+ le32_to_cpu(
+ a->data.resident.value_length) >
+ (u8*)ctx->mrec + vol->mft_record_size) {
+ ntfs_error(sb, "Corrupt attribute list "
+ "attribute.");
+ goto put_err_out;
+ }
+ /* Now copy the attribute list. */
+ memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
+ a->data.resident.value_offset),
+ le32_to_cpu(
+ a->data.resident.value_length));
+ }
+ /* The attribute list is now setup in memory. */
+ /*
+ * FIXME: I don't know if this case is actually possible.
+ * According to logic it is not possible but I have seen too
+ * many weird things in MS software to rely on logic... Thus we
+ * perform a manual search and make sure the first $MFT/$DATA
+ * extent is in the base inode. If it is not we abort with an
+ * error and if we ever see a report of this error we will need
+ * to do some magic in order to have the necessary mft record
+ * loaded and in the right place in the page cache. But
+ * hopefully logic will prevail and this never happens...
+ */
+ al_entry = (ATTR_LIST_ENTRY*)ni->attr_list;
+ al_end = (u8*)al_entry + ni->attr_list_size;
+ for (;; al_entry = next_al_entry) {
+ /* Out of bounds check. */
+ if ((u8*)al_entry < ni->attr_list ||
+ (u8*)al_entry > al_end)
+ goto em_put_err_out;
+ /* Catch the end of the attribute list. */
+ if ((u8*)al_entry == al_end)
+ goto em_put_err_out;
+ if (!al_entry->length)
+ goto em_put_err_out;
+ if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
+ le16_to_cpu(al_entry->length) > al_end)
+ goto em_put_err_out;
+ next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
+ le16_to_cpu(al_entry->length));
+ if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
+ goto em_put_err_out;
+ if (AT_DATA != al_entry->type)
+ continue;
+ /* We want an unnamed attribute. */
+ if (al_entry->name_length)
+ goto em_put_err_out;
+ /* Want the first entry, i.e. lowest_vcn == 0. */
+ if (al_entry->lowest_vcn)
+ goto em_put_err_out;
+ /* First entry has to be in the base mft record. */
+ if (MREF_LE(al_entry->mft_reference) != vi->i_ino) {
+ /* MFT references do not match, logic fails. */
+ ntfs_error(sb, "BUG: The first $DATA extent "
+ "of $MFT is not in the base "
+ "mft record. Please report "
+ "you saw this message to "
+ "linux-ntfs-dev@lists."
+ "sourceforge.net");
+ goto put_err_out;
+ } else {
+ /* Sequence numbers must match. */
+ if (MSEQNO_LE(al_entry->mft_reference) !=
+ ni->seq_no)
+ goto em_put_err_out;
+ /* Got it. All is ok. We can stop now. */
+ break;
+ }
+ }
+ }
+
+ ntfs_attr_reinit_search_ctx(ctx);
+
+ /* Now load all attribute extents. */
+ a = NULL;
+ next_vcn = last_vcn = highest_vcn = 0;
+ while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0,
+ ctx))) {
+ runlist_element *nrl;
+
+ /* Cache the current attribute. */
+ a = ctx->attr;
+ /* $MFT must be non-resident. */
+ if (!a->non_resident) {
+ ntfs_error(sb, "$MFT must be non-resident but a "
+ "resident extent was found. $MFT is "
+ "corrupt. Run chkdsk.");
+ goto put_err_out;
+ }
+ /* $MFT must be uncompressed and unencrypted. */
+ if (a->flags & ATTR_COMPRESSION_MASK ||
+ a->flags & ATTR_IS_ENCRYPTED ||
+ a->flags & ATTR_IS_SPARSE) {
+ ntfs_error(sb, "$MFT must be uncompressed, "
+ "non-sparse, and unencrypted but a "
+ "compressed/sparse/encrypted extent "
+ "was found. $MFT is corrupt. Run "
+ "chkdsk.");
+ goto put_err_out;
+ }
+ /*
+ * Decompress the mapping pairs array of this extent and merge
+ * the result into the existing runlist. No need for locking
+ * as we have exclusive access to the inode at this time and we
+ * are a mount in progress task, too.
+ */
+ nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
+ if (IS_ERR(nrl)) {
+ ntfs_error(sb, "ntfs_mapping_pairs_decompress() "
+ "failed with error code %ld. $MFT is "
+ "corrupt.", PTR_ERR(nrl));
+ goto put_err_out;
+ }
+ ni->runlist.rl = nrl;
+
+ /* Are we in the first extent? */
+ if (!next_vcn) {
+ if (a->data.non_resident.lowest_vcn) {
+ ntfs_error(sb, "First extent of $DATA "
+ "attribute has non zero "
+ "lowest_vcn. $MFT is corrupt. "
+ "You should run chkdsk.");
+ goto put_err_out;
+ }
+ /* Get the last vcn in the $DATA attribute. */
+ last_vcn = sle64_to_cpu(
+ a->data.non_resident.allocated_size)
+ >> vol->cluster_size_bits;
+ /* Fill in the inode size. */
+ vi->i_size = sle64_to_cpu(
+ a->data.non_resident.data_size);
+ ni->initialized_size = sle64_to_cpu(
+ a->data.non_resident.initialized_size);
+ ni->allocated_size = sle64_to_cpu(
+ a->data.non_resident.allocated_size);
+ /*
+ * Verify the number of mft records does not exceed
+ * 2^32 - 1.
+ */
+ if ((vi->i_size >> vol->mft_record_size_bits) >=
+ (1ULL << 32)) {
+ ntfs_error(sb, "$MFT is too big! Aborting.");
+ goto put_err_out;
+ }
+ /*
+ * We have got the first extent of the runlist for
+ * $MFT which means it is now relatively safe to call
+ * the normal ntfs_read_inode() function.
+ * Complete reading the inode, this will actually
+ * re-read the mft record for $MFT, this time entering
+ * it into the page cache with which we complete the
+ * kick start of the volume. It should be safe to do
+ * this now as the first extent of $MFT/$DATA is
+ * already known and we would hope that we don't need
+ * further extents in order to find the other
+ * attributes belonging to $MFT. Only time will tell if
+ * this is really the case. If not we will have to play
+ * magic at this point, possibly duplicating a lot of
+ * ntfs_read_inode() at this point. We will need to
+ * ensure we do enough of its work to be able to call
+ * ntfs_read_inode() on extents of $MFT/$DATA. But lets
+ * hope this never happens...
+ */
+ ntfs_read_locked_inode(vi);
+ if (is_bad_inode(vi)) {
+ ntfs_error(sb, "ntfs_read_inode() of $MFT "
+ "failed. BUG or corrupt $MFT. "
+ "Run chkdsk and if no errors "
+ "are found, please report you "
+ "saw this message to "
+ "linux-ntfs-dev@lists."
+ "sourceforge.net");
+ ntfs_attr_put_search_ctx(ctx);
+ /* Revert to the safe super operations. */
+ ntfs_free(m);
+ return -1;
+ }
+ /*
+ * Re-initialize some specifics about $MFT's inode as
+ * ntfs_read_inode() will have set up the default ones.
+ */
+ /* Set uid and gid to root. */
+ vi->i_uid = GLOBAL_ROOT_UID;
+ vi->i_gid = GLOBAL_ROOT_GID;
+ /* Regular file. No access for anyone. */
+ vi->i_mode = S_IFREG;
+ /* No VFS initiated operations allowed for $MFT. */
+ vi->i_op = &ntfs_empty_inode_ops;
+ vi->i_fop = &ntfs_empty_file_ops;
+ }
+
+ /* Get the lowest vcn for the next extent. */
+ highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+ next_vcn = highest_vcn + 1;
+
+ /* Only one extent or error, which we catch below. */
+ if (next_vcn <= 0)
+ break;
+
+ /* Avoid endless loops due to corruption. */
+ if (next_vcn < sle64_to_cpu(
+ a->data.non_resident.lowest_vcn)) {
+ ntfs_error(sb, "$MFT has corrupt attribute list "
+ "attribute. Run chkdsk.");
+ goto put_err_out;
+ }
+ }
+ if (err != -ENOENT) {
+ ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
+ "$MFT is corrupt. Run chkdsk.");
+ goto put_err_out;
+ }
+ if (!a) {
+ ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
+ "corrupt. Run chkdsk.");
+ goto put_err_out;
+ }
+ if (highest_vcn && highest_vcn != last_vcn - 1) {
+ ntfs_error(sb, "Failed to load the complete runlist for "
+ "$MFT/$DATA. Driver bug or corrupt $MFT. "
+ "Run chkdsk.");
+ ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
+ (unsigned long long)highest_vcn,
+ (unsigned long long)last_vcn - 1);
+ goto put_err_out;
+ }
+ ntfs_attr_put_search_ctx(ctx);
+ ntfs_debug("Done.");
+ ntfs_free(m);
+
+ /*
+ * Split the locking rules of the MFT inode from the
+ * locking rules of other inodes:
+ */
+ lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
+ lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
+
+ return 0;
+
+em_put_err_out:
+ ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
+ "attribute list. $MFT is corrupt. Run chkdsk.");
+put_err_out:
+ ntfs_attr_put_search_ctx(ctx);
+err_out:
+ ntfs_error(sb, "Failed. Marking inode as bad.");
+ make_bad_inode(vi);
+ ntfs_free(m);
+ return -1;
+}
+
+static void __ntfs_clear_inode(ntfs_inode *ni)
+{
+ /* Free all alocated memory. */
+ down_write(&ni->runlist.lock);
+ if (ni->runlist.rl) {
+ ntfs_free(ni->runlist.rl);
+ ni->runlist.rl = NULL;
+ }
+ up_write(&ni->runlist.lock);
+
+ if (ni->attr_list) {
+ ntfs_free(ni->attr_list);
+ ni->attr_list = NULL;
+ }
+
+ down_write(&ni->attr_list_rl.lock);
+ if (ni->attr_list_rl.rl) {
+ ntfs_free(ni->attr_list_rl.rl);
+ ni->attr_list_rl.rl = NULL;
+ }
+ up_write(&ni->attr_list_rl.lock);
+
+ if (ni->name_len && ni->name != I30) {
+ /* Catch bugs... */
+ BUG_ON(!ni->name);
+ kfree(ni->name);
+ }
+}
+
+void ntfs_clear_extent_inode(ntfs_inode *ni)
+{
+ ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+
+ BUG_ON(NInoAttr(ni));
+ BUG_ON(ni->nr_extents != -1);
+
+#ifdef NTFS_RW
+ if (NInoDirty(ni)) {
+ if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
+ ntfs_error(ni->vol->sb, "Clearing dirty extent inode! "
+ "Losing data! This is a BUG!!!");
+ // FIXME: Do something!!!
+ }
+#endif /* NTFS_RW */
+
+ __ntfs_clear_inode(ni);
+
+ /* Bye, bye... */
+ ntfs_destroy_extent_inode(ni);
+}
+
+/**
+ * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
+ * @vi: vfs inode pending annihilation
+ *
+ * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
+ * is called, which deallocates all memory belonging to the NTFS specific part
+ * of the inode and returns.
+ *
+ * If the MFT record is dirty, we commit it before doing anything else.
+ */
+void ntfs_evict_big_inode(struct inode *vi)
+{
+ ntfs_inode *ni = NTFS_I(vi);
+
+ truncate_inode_pages_final(&vi->i_data);
+ clear_inode(vi);
+
+#ifdef NTFS_RW
+ if (NInoDirty(ni)) {
+ bool was_bad = (is_bad_inode(vi));
+
+ /* Committing the inode also commits all extent inodes. */
+ ntfs_commit_inode(vi);
+
+ if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) {
+ ntfs_error(vi->i_sb, "Failed to commit dirty inode "
+ "0x%lx. Losing data!", vi->i_ino);
+ // FIXME: Do something!!!
+ }
+ }
+#endif /* NTFS_RW */
+
+ /* No need to lock at this stage as no one else has a reference. */
+ if (ni->nr_extents > 0) {
+ int i;
+
+ for (i = 0; i < ni->nr_extents; i++)
+ ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]);
+ kfree(ni->ext.extent_ntfs_inos);
+ }
+
+ __ntfs_clear_inode(ni);
+
+ if (NInoAttr(ni)) {
+ /* Release the base inode if we are holding it. */
+ if (ni->nr_extents == -1) {
+ iput(VFS_I(ni->ext.base_ntfs_ino));
+ ni->nr_extents = 0;
+ ni->ext.base_ntfs_ino = NULL;
+ }
+ }
+ BUG_ON(ni->page);
+ if (!atomic_dec_and_test(&ni->count))
+ BUG();
+ return;
+}
+
+/**
+ * ntfs_show_options - show mount options in /proc/mounts
+ * @sf: seq_file in which to write our mount options
+ * @root: root of the mounted tree whose mount options to display
+ *
+ * Called by the VFS once for each mounted ntfs volume when someone reads
+ * /proc/mounts in order to display the NTFS specific mount options of each
+ * mount. The mount options of fs specified by @root are written to the seq file
+ * @sf and success is returned.
+ */
+int ntfs_show_options(struct seq_file *sf, struct dentry *root)
+{
+ ntfs_volume *vol = NTFS_SB(root->d_sb);
+ int i;
+
+ seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid));
+ seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid));
+ if (vol->fmask == vol->dmask)
+ seq_printf(sf, ",umask=0%o", vol->fmask);
+ else {
+ seq_printf(sf, ",fmask=0%o", vol->fmask);
+ seq_printf(sf, ",dmask=0%o", vol->dmask);
+ }
+ seq_printf(sf, ",nls=%s", vol->nls_map->charset);
+ if (NVolCaseSensitive(vol))
+ seq_printf(sf, ",case_sensitive");
+ if (NVolShowSystemFiles(vol))
+ seq_printf(sf, ",show_sys_files");
+ if (!NVolSparseEnabled(vol))
+ seq_printf(sf, ",disable_sparse");
+ for (i = 0; on_errors_arr[i].val; i++) {
+ if (on_errors_arr[i].val & vol->on_errors)
+ seq_printf(sf, ",errors=%s", on_errors_arr[i].str);
+ }
+ seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier);
+ return 0;
+}
+
+#ifdef NTFS_RW
+
+static const char *es = " Leaving inconsistent metadata. Unmount and run "
+ "chkdsk.";
+
+/**
+ * ntfs_truncate - called when the i_size of an ntfs inode is changed
+ * @vi: inode for which the i_size was changed
+ *
+ * We only support i_size changes for normal files at present, i.e. not
+ * compressed and not encrypted. This is enforced in ntfs_setattr(), see
+ * below.
+ *
+ * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
+ * that the change is allowed.
+ *
+ * This implies for us that @vi is a file inode rather than a directory, index,
+ * or attribute inode as well as that @vi is a base inode.
+ *
+ * Returns 0 on success or -errno on error.
+ *
+ * Called with ->i_mutex held.
+ */
+int ntfs_truncate(struct inode *vi)
+{
+ s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
+ VCN highest_vcn;
+ unsigned long flags;
+ ntfs_inode *base_ni, *ni = NTFS_I(vi);
+ ntfs_volume *vol = ni->vol;
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ const char *te = " Leaving file length out of sync with i_size.";
+ int err, mp_size, size_change, alloc_change;
+
+ ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+ BUG_ON(NInoAttr(ni));
+ BUG_ON(S_ISDIR(vi->i_mode));
+ BUG_ON(NInoMstProtected(ni));
+ BUG_ON(ni->nr_extents < 0);
+retry_truncate:
+ /*
+ * Lock the runlist for writing and map the mft record to ensure it is
+ * safe to mess with the attribute runlist and sizes.
+ */
+ down_write(&ni->runlist.lock);
+ if (!NInoAttr(ni))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ m = map_mft_record(base_ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
+ "(error code %d).%s", vi->i_ino, err, te);
+ ctx = NULL;
+ m = NULL;
+ goto old_bad_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(base_ni, m);
+ if (unlikely(!ctx)) {
+ ntfs_error(vi->i_sb, "Failed to allocate a search context for "
+ "inode 0x%lx (not enough memory).%s",
+ vi->i_ino, te);
+ err = -ENOMEM;
+ goto old_bad_out;
+ }
+ err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ if (err == -ENOENT) {
+ ntfs_error(vi->i_sb, "Open attribute is missing from "
+ "mft record. Inode 0x%lx is corrupt. "
+ "Run chkdsk.%s", vi->i_ino, te);
+ err = -EIO;
+ } else
+ ntfs_error(vi->i_sb, "Failed to lookup attribute in "
+ "inode 0x%lx (error code %d).%s",
+ vi->i_ino, err, te);
+ goto old_bad_out;
+ }
+ m = ctx->mrec;
+ a = ctx->attr;
+ /*
+ * The i_size of the vfs inode is the new size for the attribute value.
+ */
+ new_size = i_size_read(vi);
+ /* The current size of the attribute value is the old size. */
+ old_size = ntfs_attr_size(a);
+ /* Calculate the new allocated size. */
+ if (NInoNonResident(ni))
+ new_alloc_size = (new_size + vol->cluster_size - 1) &
+ ~(s64)vol->cluster_size_mask;
+ else
+ new_alloc_size = (new_size + 7) & ~7;
+ /* The current allocated size is the old allocated size. */
+ read_lock_irqsave(&ni->size_lock, flags);
+ old_alloc_size = ni->allocated_size;
+ read_unlock_irqrestore(&ni->size_lock, flags);
+ /*
+ * The change in the file size. This will be 0 if no change, >0 if the
+ * size is growing, and <0 if the size is shrinking.
+ */
+ size_change = -1;
+ if (new_size - old_size >= 0) {
+ size_change = 1;
+ if (new_size == old_size)
+ size_change = 0;
+ }
+ /* As above for the allocated size. */
+ alloc_change = -1;
+ if (new_alloc_size - old_alloc_size >= 0) {
+ alloc_change = 1;
+ if (new_alloc_size == old_alloc_size)
+ alloc_change = 0;
+ }
+ /*
+ * If neither the size nor the allocation are being changed there is
+ * nothing to do.
+ */
+ if (!size_change && !alloc_change)
+ goto unm_done;
+ /* If the size is changing, check if new size is allowed in $AttrDef. */
+ if (size_change) {
+ err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
+ if (unlikely(err)) {
+ if (err == -ERANGE) {
+ ntfs_error(vol->sb, "Truncate would cause the "
+ "inode 0x%lx to %simum size "
+ "for its attribute type "
+ "(0x%x). Aborting truncate.",
+ vi->i_ino,
+ new_size > old_size ? "exceed "
+ "the max" : "go under the min",
+ le32_to_cpu(ni->type));
+ err = -EFBIG;
+ } else {
+ ntfs_error(vol->sb, "Inode 0x%lx has unknown "
+ "attribute type 0x%x. "
+ "Aborting truncate.",
+ vi->i_ino,
+ le32_to_cpu(ni->type));
+ err = -EIO;
+ }
+ /* Reset the vfs inode size to the old size. */
+ i_size_write(vi, old_size);
+ goto err_out;
+ }
+ }
+ if (NInoCompressed(ni) || NInoEncrypted(ni)) {
+ ntfs_warning(vi->i_sb, "Changes in inode size are not "
+ "supported yet for %s files, ignoring.",
+ NInoCompressed(ni) ? "compressed" :
+ "encrypted");
+ err = -EOPNOTSUPP;
+ goto bad_out;
+ }
+ if (a->non_resident)
+ goto do_non_resident_truncate;
+ BUG_ON(NInoNonResident(ni));
+ /* Resize the attribute record to best fit the new attribute size. */
+ if (new_size < vol->mft_record_size &&
+ !ntfs_resident_attr_value_resize(m, a, new_size)) {
+ /* The resize succeeded! */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ write_lock_irqsave(&ni->size_lock, flags);
+ /* Update the sizes in the ntfs inode and all is done. */
+ ni->allocated_size = le32_to_cpu(a->length) -
+ le16_to_cpu(a->data.resident.value_offset);
+ /*
+ * Note ntfs_resident_attr_value_resize() has already done any
+ * necessary data clearing in the attribute record. When the
+ * file is being shrunk vmtruncate() will already have cleared
+ * the top part of the last partial page, i.e. since this is
+ * the resident case this is the page with index 0. However,
+ * when the file is being expanded, the page cache page data
+ * between the old data_size, i.e. old_size, and the new_size
+ * has not been zeroed. Fortunately, we do not need to zero it
+ * either since on one hand it will either already be zero due
+ * to both read_folio and writepage clearing partial page data
+ * beyond i_size in which case there is nothing to do or in the
+ * case of the file being mmap()ped at the same time, POSIX
+ * specifies that the behaviour is unspecified thus we do not
+ * have to do anything. This means that in our implementation
+ * in the rare case that the file is mmap()ped and a write
+ * occurred into the mmap()ped region just beyond the file size
+ * and writepage has not yet been called to write out the page
+ * (which would clear the area beyond the file size) and we now
+ * extend the file size to incorporate this dirty region
+ * outside the file size, a write of the page would result in
+ * this data being written to disk instead of being cleared.
+ * Given both POSIX and the Linux mmap(2) man page specify that
+ * this corner case is undefined, we choose to leave it like
+ * that as this is much simpler for us as we cannot lock the
+ * relevant page now since we are holding too many ntfs locks
+ * which would result in a lock reversal deadlock.
+ */
+ ni->initialized_size = new_size;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ goto unm_done;
+ }
+ /* If the above resize failed, this must be an attribute extension. */
+ BUG_ON(size_change < 0);
+ /*
+ * We have to drop all the locks so we can call
+ * ntfs_attr_make_non_resident(). This could be optimised by try-
+ * locking the first page cache page and only if that fails dropping
+ * the locks, locking the page, and redoing all the locking and
+ * lookups. While this would be a huge optimisation, it is not worth
+ * it as this is definitely a slow code path as it only ever can happen
+ * once for any given file.
+ */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ /*
+ * Not enough space in the mft record, try to make the attribute
+ * non-resident and if successful restart the truncation process.
+ */
+ err = ntfs_attr_make_non_resident(ni, old_size);
+ if (likely(!err))
+ goto retry_truncate;
+ /*
+ * Could not make non-resident. If this is due to this not being
+ * permitted for this attribute type or there not being enough space,
+ * try to make other attributes non-resident. Otherwise fail.
+ */
+ if (unlikely(err != -EPERM && err != -ENOSPC)) {
+ ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
+ "type 0x%x, because the conversion from "
+ "resident to non-resident attribute failed "
+ "with error code %i.", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), err);
+ if (err != -ENOMEM)
+ err = -EIO;
+ goto conv_err_out;
+ }
+ /* TODO: Not implemented from here, abort. */
+ if (err == -ENOSPC)
+ ntfs_error(vol->sb, "Not enough space in the mft record/on "
+ "disk for the non-resident attribute value. "
+ "This case is not implemented yet.");
+ else /* if (err == -EPERM) */
+ ntfs_error(vol->sb, "This attribute type may not be "
+ "non-resident. This case is not implemented "
+ "yet.");
+ err = -EOPNOTSUPP;
+ goto conv_err_out;
+#if 0
+ // TODO: Attempt to make other attributes non-resident.
+ if (!err)
+ goto do_resident_extend;
+ /*
+ * Both the attribute list attribute and the standard information
+ * attribute must remain in the base inode. Thus, if this is one of
+ * these attributes, we have to try to move other attributes out into
+ * extent mft records instead.
+ */
+ if (ni->type == AT_ATTRIBUTE_LIST ||
+ ni->type == AT_STANDARD_INFORMATION) {
+ // TODO: Attempt to move other attributes into extent mft
+ // records.
+ err = -EOPNOTSUPP;
+ if (!err)
+ goto do_resident_extend;
+ goto err_out;
+ }
+ // TODO: Attempt to move this attribute to an extent mft record, but
+ // only if it is not already the only attribute in an mft record in
+ // which case there would be nothing to gain.
+ err = -EOPNOTSUPP;
+ if (!err)
+ goto do_resident_extend;
+ /* There is nothing we can do to make enough space. )-: */
+ goto err_out;
+#endif
+do_non_resident_truncate:
+ BUG_ON(!NInoNonResident(ni));
+ if (alloc_change < 0) {
+ highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
+ if (highest_vcn > 0 &&
+ old_alloc_size >> vol->cluster_size_bits >
+ highest_vcn + 1) {
+ /*
+ * This attribute has multiple extents. Not yet
+ * supported.
+ */
+ ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
+ "attribute type 0x%x, because the "
+ "attribute is highly fragmented (it "
+ "consists of multiple extents) and "
+ "this case is not implemented yet.",
+ vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type));
+ err = -EOPNOTSUPP;
+ goto bad_out;
+ }
+ }
+ /*
+ * If the size is shrinking, need to reduce the initialized_size and
+ * the data_size before reducing the allocation.
+ */
+ if (size_change < 0) {
+ /*
+ * Make the valid size smaller (i_size is already up-to-date).
+ */
+ write_lock_irqsave(&ni->size_lock, flags);
+ if (new_size < ni->initialized_size) {
+ ni->initialized_size = new_size;
+ a->data.non_resident.initialized_size =
+ cpu_to_sle64(new_size);
+ }
+ a->data.non_resident.data_size = cpu_to_sle64(new_size);
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ /* If the allocated size is not changing, we are done. */
+ if (!alloc_change)
+ goto unm_done;
+ /*
+ * If the size is shrinking it makes no sense for the
+ * allocation to be growing.
+ */
+ BUG_ON(alloc_change > 0);
+ } else /* if (size_change >= 0) */ {
+ /*
+ * The file size is growing or staying the same but the
+ * allocation can be shrinking, growing or staying the same.
+ */
+ if (alloc_change > 0) {
+ /*
+ * We need to extend the allocation and possibly update
+ * the data size. If we are updating the data size,
+ * since we are not touching the initialized_size we do
+ * not need to worry about the actual data on disk.
+ * And as far as the page cache is concerned, there
+ * will be no pages beyond the old data size and any
+ * partial region in the last page between the old and
+ * new data size (or the end of the page if the new
+ * data size is outside the page) does not need to be
+ * modified as explained above for the resident
+ * attribute truncate case. To do this, we simply drop
+ * the locks we hold and leave all the work to our
+ * friendly helper ntfs_attr_extend_allocation().
+ */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+ err = ntfs_attr_extend_allocation(ni, new_size,
+ size_change > 0 ? new_size : -1, -1);
+ /*
+ * ntfs_attr_extend_allocation() will have done error
+ * output already.
+ */
+ goto done;
+ }
+ if (!alloc_change)
+ goto alloc_done;
+ }
+ /* alloc_change < 0 */
+ /* Free the clusters. */
+ nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
+ vol->cluster_size_bits, -1, ctx);
+ m = ctx->mrec;
+ a = ctx->attr;
+ if (unlikely(nr_freed < 0)) {
+ ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
+ "%lli). Unmount and run chkdsk to recover "
+ "the lost cluster(s).", (long long)nr_freed);
+ NVolSetErrors(vol);
+ nr_freed = 0;
+ }
+ /* Truncate the runlist. */
+ err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
+ new_alloc_size >> vol->cluster_size_bits);
+ /*
+ * If the runlist truncation failed and/or the search context is no
+ * longer valid, we cannot resize the attribute record or build the
+ * mapping pairs array thus we mark the inode bad so that no access to
+ * the freed clusters can happen.
+ */
+ if (unlikely(err || IS_ERR(m))) {
+ ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
+ IS_ERR(m) ?
+ "restore attribute search context" :
+ "truncate attribute runlist",
+ IS_ERR(m) ? PTR_ERR(m) : err, es);
+ err = -EIO;
+ goto bad_out;
+ }
+ /* Get the size for the shrunk mapping pairs array for the runlist. */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
+ if (unlikely(mp_size <= 0)) {
+ ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
+ "attribute type 0x%x, because determining the "
+ "size for the mapping pairs failed with error "
+ "code %i.%s", vi->i_ino,
+ (unsigned)le32_to_cpu(ni->type), mp_size, es);
+ err = -EIO;
+ goto bad_out;
+ }
+ /*
+ * Shrink the attribute record for the new mapping pairs array. Note,
+ * this cannot fail since we are making the attribute smaller thus by
+ * definition there is enough space to do so.
+ */
+ err = ntfs_attr_record_resize(m, a, mp_size +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+ BUG_ON(err);
+ /*
+ * Generate the mapping pairs array directly into the attribute record.
+ */
+ err = ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+ mp_size, ni->runlist.rl, 0, -1, NULL);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
+ "attribute type 0x%x, because building the "
+ "mapping pairs failed with error code %i.%s",
+ vi->i_ino, (unsigned)le32_to_cpu(ni->type),
+ err, es);
+ err = -EIO;
+ goto bad_out;
+ }
+ /* Update the allocated/compressed size as well as the highest vcn. */
+ a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
+ vol->cluster_size_bits) - 1);
+ write_lock_irqsave(&ni->size_lock, flags);
+ ni->allocated_size = new_alloc_size;
+ a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
+ if (NInoSparse(ni) || NInoCompressed(ni)) {
+ if (nr_freed) {
+ ni->itype.compressed.size -= nr_freed <<
+ vol->cluster_size_bits;
+ BUG_ON(ni->itype.compressed.size < 0);
+ a->data.non_resident.compressed_size = cpu_to_sle64(
+ ni->itype.compressed.size);
+ vi->i_blocks = ni->itype.compressed.size >> 9;
+ }
+ } else
+ vi->i_blocks = new_alloc_size >> 9;
+ write_unlock_irqrestore(&ni->size_lock, flags);
+ /*
+ * We have shrunk the allocation. If this is a shrinking truncate we
+ * have already dealt with the initialized_size and the data_size above
+ * and we are done. If the truncate is only changing the allocation
+ * and not the data_size, we are also done. If this is an extending
+ * truncate, need to extend the data_size now which is ensured by the
+ * fact that @size_change is positive.
+ */
+alloc_done:
+ /*
+ * If the size is growing, need to update it now. If it is shrinking,
+ * we have already updated it above (before the allocation change).
+ */
+ if (size_change > 0)
+ a->data.non_resident.data_size = cpu_to_sle64(new_size);
+ /* Ensure the modified mft record is written out. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+unm_done:
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+done:
+ /* Update the mtime and ctime on the base inode. */
+ /* normally ->truncate shouldn't update ctime or mtime,
+ * but ntfs did before so it got a copy & paste version
+ * of file_update_time. one day someone should fix this
+ * for real.
+ */
+ if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
+ struct timespec64 now = current_time(VFS_I(base_ni));
+ int sync_it = 0;
+
+ if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) ||
+ !timespec64_equal(&VFS_I(base_ni)->i_ctime, &now))
+ sync_it = 1;
+ VFS_I(base_ni)->i_mtime = now;
+ VFS_I(base_ni)->i_ctime = now;
+
+ if (sync_it)
+ mark_inode_dirty_sync(VFS_I(base_ni));
+ }
+
+ if (likely(!err)) {
+ NInoClearTruncateFailed(ni);
+ ntfs_debug("Done.");
+ }
+ return err;
+old_bad_out:
+ old_size = -1;
+bad_out:
+ if (err != -ENOMEM && err != -EOPNOTSUPP)
+ NVolSetErrors(vol);
+ if (err != -EOPNOTSUPP)
+ NInoSetTruncateFailed(ni);
+ else if (old_size >= 0)
+ i_size_write(vi, old_size);
+err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(base_ni);
+ up_write(&ni->runlist.lock);
+out:
+ ntfs_debug("Failed. Returning error code %i.", err);
+ return err;
+conv_err_out:
+ if (err != -ENOMEM && err != -EOPNOTSUPP)
+ NVolSetErrors(vol);
+ if (err != -EOPNOTSUPP)
+ NInoSetTruncateFailed(ni);
+ else
+ i_size_write(vi, old_size);
+ goto out;
+}
+
+/**
+ * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value
+ * @vi: inode for which the i_size was changed
+ *
+ * Wrapper for ntfs_truncate() that has no return value.
+ *
+ * See ntfs_truncate() description above for details.
+ */
+#ifdef NTFS_RW
+void ntfs_truncate_vfs(struct inode *vi) {
+ ntfs_truncate(vi);
+}
+#endif
+
+/**
+ * ntfs_setattr - called from notify_change() when an attribute is being changed
+ * @mnt_userns: user namespace of the mount the inode was found from
+ * @dentry: dentry whose attributes to change
+ * @attr: structure describing the attributes and the changes
+ *
+ * We have to trap VFS attempts to truncate the file described by @dentry as
+ * soon as possible, because we do not implement changes in i_size yet. So we
+ * abort all i_size changes here.
+ *
+ * We also abort all changes of user, group, and mode as we do not implement
+ * the NTFS ACLs yet.
+ *
+ * Called with ->i_mutex held.
+ */
+int ntfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
+{
+ struct inode *vi = d_inode(dentry);
+ int err;
+ unsigned int ia_valid = attr->ia_valid;
+
+ err = setattr_prepare(&init_user_ns, dentry, attr);
+ if (err)
+ goto out;
+ /* We do not support NTFS ACLs yet. */
+ if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
+ ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
+ "supported yet, ignoring.");
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ if (ia_valid & ATTR_SIZE) {
+ if (attr->ia_size != i_size_read(vi)) {
+ ntfs_inode *ni = NTFS_I(vi);
+ /*
+ * FIXME: For now we do not support resizing of
+ * compressed or encrypted files yet.
+ */
+ if (NInoCompressed(ni) || NInoEncrypted(ni)) {
+ ntfs_warning(vi->i_sb, "Changes in inode size "
+ "are not supported yet for "
+ "%s files, ignoring.",
+ NInoCompressed(ni) ?
+ "compressed" : "encrypted");
+ err = -EOPNOTSUPP;
+ } else {
+ truncate_setsize(vi, attr->ia_size);
+ ntfs_truncate_vfs(vi);
+ }
+ if (err || ia_valid == ATTR_SIZE)
+ goto out;
+ } else {
+ /*
+ * We skipped the truncate but must still update
+ * timestamps.
+ */
+ ia_valid |= ATTR_MTIME | ATTR_CTIME;
+ }
+ }
+ if (ia_valid & ATTR_ATIME)
+ vi->i_atime = attr->ia_atime;
+ if (ia_valid & ATTR_MTIME)
+ vi->i_mtime = attr->ia_mtime;
+ if (ia_valid & ATTR_CTIME)
+ vi->i_ctime = attr->ia_ctime;
+ mark_inode_dirty(vi);
+out:
+ return err;
+}
+
+/**
+ * ntfs_write_inode - write out a dirty inode
+ * @vi: inode to write out
+ * @sync: if true, write out synchronously
+ *
+ * Write out a dirty inode to disk including any extent inodes if present.
+ *
+ * If @sync is true, commit the inode to disk and wait for io completion. This
+ * is done using write_mft_record().
+ *
+ * If @sync is false, just schedule the write to happen but do not wait for i/o
+ * completion. In 2.6 kernels, scheduling usually happens just by virtue of
+ * marking the page (and in this case mft record) dirty but we do not implement
+ * this yet as write_mft_record() largely ignores the @sync parameter and
+ * always performs synchronous writes.
+ *
+ * Return 0 on success and -errno on error.
+ */
+int __ntfs_write_inode(struct inode *vi, int sync)
+{
+ sle64 nt;
+ ntfs_inode *ni = NTFS_I(vi);
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *m;
+ STANDARD_INFORMATION *si;
+ int err = 0;
+ bool modified = false;
+
+ ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
+ vi->i_ino);
+ /*
+ * Dirty attribute inodes are written via their real inodes so just
+ * clean them here. Access time updates are taken care off when the
+ * real inode is written.
+ */
+ if (NInoAttr(ni)) {
+ NInoClearDirty(ni);
+ ntfs_debug("Done.");
+ return 0;
+ }
+ /* Map, pin, and lock the mft record belonging to the inode. */
+ m = map_mft_record(ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ goto err_out;
+ }
+ /* Update the access times in the standard information attribute. */
+ ctx = ntfs_attr_get_search_ctx(ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto unm_err_out;
+ }
+ err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ ntfs_attr_put_search_ctx(ctx);
+ goto unm_err_out;
+ }
+ si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
+ le16_to_cpu(ctx->attr->data.resident.value_offset));
+ /* Update the access times if they have changed. */
+ nt = utc2ntfs(vi->i_mtime);
+ if (si->last_data_change_time != nt) {
+ ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
+ "new = 0x%llx", vi->i_ino, (long long)
+ sle64_to_cpu(si->last_data_change_time),
+ (long long)sle64_to_cpu(nt));
+ si->last_data_change_time = nt;
+ modified = true;
+ }
+ nt = utc2ntfs(vi->i_ctime);
+ if (si->last_mft_change_time != nt) {
+ ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
+ "new = 0x%llx", vi->i_ino, (long long)
+ sle64_to_cpu(si->last_mft_change_time),
+ (long long)sle64_to_cpu(nt));
+ si->last_mft_change_time = nt;
+ modified = true;
+ }
+ nt = utc2ntfs(vi->i_atime);
+ if (si->last_access_time != nt) {
+ ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
+ "new = 0x%llx", vi->i_ino,
+ (long long)sle64_to_cpu(si->last_access_time),
+ (long long)sle64_to_cpu(nt));
+ si->last_access_time = nt;
+ modified = true;
+ }
+ /*
+ * If we just modified the standard information attribute we need to
+ * mark the mft record it is in dirty. We do this manually so that
+ * mark_inode_dirty() is not called which would redirty the inode and
+ * hence result in an infinite loop of trying to write the inode.
+ * There is no need to mark the base inode nor the base mft record
+ * dirty, since we are going to write this mft record below in any case
+ * and the base mft record may actually not have been modified so it
+ * might not need to be written out.
+ * NOTE: It is not a problem when the inode for $MFT itself is being
+ * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES
+ * on the $MFT inode and hence ntfs_write_inode() will not be
+ * re-invoked because of it which in turn is ok since the dirtied mft
+ * record will be cleaned and written out to disk below, i.e. before
+ * this function returns.
+ */
+ if (modified) {
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ if (!NInoTestSetDirty(ctx->ntfs_ino))
+ mark_ntfs_record_dirty(ctx->ntfs_ino->page,
+ ctx->ntfs_ino->page_ofs);
+ }
+ ntfs_attr_put_search_ctx(ctx);
+ /* Now the access times are updated, write the base mft record. */
+ if (NInoDirty(ni))
+ err = write_mft_record(ni, m, sync);
+ /* Write all attached extent mft records. */
+ mutex_lock(&ni->extent_lock);
+ if (ni->nr_extents > 0) {
+ ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos;
+ int i;
+
+ ntfs_debug("Writing %i extent inodes.", ni->nr_extents);
+ for (i = 0; i < ni->nr_extents; i++) {
+ ntfs_inode *tni = extent_nis[i];
+
+ if (NInoDirty(tni)) {
+ MFT_RECORD *tm = map_mft_record(tni);
+ int ret;
+
+ if (IS_ERR(tm)) {
+ if (!err || err == -ENOMEM)
+ err = PTR_ERR(tm);
+ continue;
+ }
+ ret = write_mft_record(tni, tm, sync);
+ unmap_mft_record(tni);
+ if (unlikely(ret)) {
+ if (!err || err == -ENOMEM)
+ err = ret;
+ }
+ }
+ }
+ }
+ mutex_unlock(&ni->extent_lock);
+ unmap_mft_record(ni);
+ if (unlikely(err))
+ goto err_out;
+ ntfs_debug("Done.");
+ return 0;
+unm_err_out:
+ unmap_mft_record(ni);
+err_out:
+ if (err == -ENOMEM) {
+ ntfs_warning(vi->i_sb, "Not enough memory to write inode. "
+ "Marking the inode dirty again, so the VFS "
+ "retries later.");
+ mark_inode_dirty(vi);
+ } else {
+ ntfs_error(vi->i_sb, "Failed (error %i): Run chkdsk.", -err);
+ NVolSetErrors(ni->vol);
+ }
+ return err;
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
new file mode 100644
index 000000000..6f78ee00f
--- /dev/null
+++ b/fs/ntfs/inode.h
@@ -0,0 +1,310 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of
+ * the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#ifndef _LINUX_NTFS_INODE_H
+#define _LINUX_NTFS_INODE_H
+
+#include <linux/atomic.h>
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/seq_file.h>
+
+#include "layout.h"
+#include "volume.h"
+#include "types.h"
+#include "runlist.h"
+#include "debug.h"
+
+typedef struct _ntfs_inode ntfs_inode;
+
+/*
+ * The NTFS in-memory inode structure. It is just used as an extension to the
+ * fields already provided in the VFS inode.
+ */
+struct _ntfs_inode {
+ rwlock_t size_lock; /* Lock serializing access to inode sizes. */
+ s64 initialized_size; /* Copy from the attribute record. */
+ s64 allocated_size; /* Copy from the attribute record. */
+ unsigned long state; /* NTFS specific flags describing this inode.
+ See ntfs_inode_state_bits below. */
+ unsigned long mft_no; /* Number of the mft record / inode. */
+ u16 seq_no; /* Sequence number of the mft record. */
+ atomic_t count; /* Inode reference count for book keeping. */
+ ntfs_volume *vol; /* Pointer to the ntfs volume of this inode. */
+ /*
+ * If NInoAttr() is true, the below fields describe the attribute which
+ * this fake inode belongs to. The actual inode of this attribute is
+ * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see
+ * below). For real inodes, we also set the type (AT_DATA for files and
+ * AT_INDEX_ALLOCATION for directories), with the name = NULL and
+ * name_len = 0 for files and name = I30 (global constant) and
+ * name_len = 4 for directories.
+ */
+ ATTR_TYPE type; /* Attribute type of this fake inode. */
+ ntfschar *name; /* Attribute name of this fake inode. */
+ u32 name_len; /* Attribute name length of this fake inode. */
+ runlist runlist; /* If state has the NI_NonResident bit set,
+ the runlist of the unnamed data attribute
+ (if a file) or of the index allocation
+ attribute (directory) or of the attribute
+ described by the fake inode (if NInoAttr()).
+ If runlist.rl is NULL, the runlist has not
+ been read in yet or has been unmapped. If
+ NI_NonResident is clear, the attribute is
+ resident (file and fake inode) or there is
+ no $I30 index allocation attribute
+ (small directory). In the latter case
+ runlist.rl is always NULL.*/
+ /*
+ * The following fields are only valid for real inodes and extent
+ * inodes.
+ */
+ struct mutex mrec_lock; /* Lock for serializing access to the
+ mft record belonging to this inode. */
+ struct page *page; /* The page containing the mft record of the
+ inode. This should only be touched by the
+ (un)map_mft_record*() functions. */
+ int page_ofs; /* Offset into the page at which the mft record
+ begins. This should only be touched by the
+ (un)map_mft_record*() functions. */
+ /*
+ * Attribute list support (only for use by the attribute lookup
+ * functions). Setup during read_inode for all inodes with attribute
+ * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is
+ * further only valid if NI_AttrListNonResident is set.
+ */
+ u32 attr_list_size; /* Length of attribute list value in bytes. */
+ u8 *attr_list; /* Attribute list value itself. */
+ runlist attr_list_rl; /* Run list for the attribute list value. */
+ union {
+ struct { /* It is a directory, $MFT, or an index inode. */
+ u32 block_size; /* Size of an index block. */
+ u32 vcn_size; /* Size of a vcn in this
+ index. */
+ COLLATION_RULE collation_rule; /* The collation rule
+ for the index. */
+ u8 block_size_bits; /* Log2 of the above. */
+ u8 vcn_size_bits; /* Log2 of the above. */
+ } index;
+ struct { /* It is a compressed/sparse file/attribute inode. */
+ s64 size; /* Copy of compressed_size from
+ $DATA. */
+ u32 block_size; /* Size of a compression block
+ (cb). */
+ u8 block_size_bits; /* Log2 of the size of a cb. */
+ u8 block_clusters; /* Number of clusters per cb. */
+ } compressed;
+ } itype;
+ struct mutex extent_lock; /* Lock for accessing/modifying the
+ below . */
+ s32 nr_extents; /* For a base mft record, the number of attached extent
+ inodes (0 if none), for extent records and for fake
+ inodes describing an attribute this is -1. */
+ union { /* This union is only used if nr_extents != 0. */
+ ntfs_inode **extent_ntfs_inos; /* For nr_extents > 0, array of
+ the ntfs inodes of the extent
+ mft records belonging to
+ this base inode which have
+ been loaded. */
+ ntfs_inode *base_ntfs_ino; /* For nr_extents == -1, the
+ ntfs inode of the base mft
+ record. For fake inodes, the
+ real (base) inode to which
+ the attribute belongs. */
+ } ext;
+};
+
+/*
+ * Defined bits for the state field in the ntfs_inode structure.
+ * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only
+ */
+typedef enum {
+ NI_Dirty, /* 1: Mft record needs to be written to disk. */
+ NI_AttrList, /* 1: Mft record contains an attribute list. */
+ NI_AttrListNonResident, /* 1: Attribute list is non-resident. Implies
+ NI_AttrList is set. */
+
+ NI_Attr, /* 1: Fake inode for attribute i/o.
+ 0: Real inode or extent inode. */
+
+ NI_MstProtected, /* 1: Attribute is protected by MST fixups.
+ 0: Attribute is not protected by fixups. */
+ NI_NonResident, /* 1: Unnamed data attr is non-resident (f).
+ 1: Attribute is non-resident (a). */
+ NI_IndexAllocPresent = NI_NonResident, /* 1: $I30 index alloc attr is
+ present (d). */
+ NI_Compressed, /* 1: Unnamed data attr is compressed (f).
+ 1: Create compressed files by default (d).
+ 1: Attribute is compressed (a). */
+ NI_Encrypted, /* 1: Unnamed data attr is encrypted (f).
+ 1: Create encrypted files by default (d).
+ 1: Attribute is encrypted (a). */
+ NI_Sparse, /* 1: Unnamed data attr is sparse (f).
+ 1: Create sparse files by default (d).
+ 1: Attribute is sparse (a). */
+ NI_SparseDisabled, /* 1: May not create sparse regions. */
+ NI_TruncateFailed, /* 1: Last ntfs_truncate() call failed. */
+} ntfs_inode_state_bits;
+
+/*
+ * NOTE: We should be adding dirty mft records to a list somewhere and they
+ * should be independent of the (ntfs/vfs) inode structure so that an inode can
+ * be removed but the record can be left dirty for syncing later.
+ */
+
+/*
+ * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo()
+ * functions.
+ */
+#define NINO_FNS(flag) \
+static inline int NIno##flag(ntfs_inode *ni) \
+{ \
+ return test_bit(NI_##flag, &(ni)->state); \
+} \
+static inline void NInoSet##flag(ntfs_inode *ni) \
+{ \
+ set_bit(NI_##flag, &(ni)->state); \
+} \
+static inline void NInoClear##flag(ntfs_inode *ni) \
+{ \
+ clear_bit(NI_##flag, &(ni)->state); \
+}
+
+/*
+ * As above for NInoTestSetFoo() and NInoTestClearFoo().
+ */
+#define TAS_NINO_FNS(flag) \
+static inline int NInoTestSet##flag(ntfs_inode *ni) \
+{ \
+ return test_and_set_bit(NI_##flag, &(ni)->state); \
+} \
+static inline int NInoTestClear##flag(ntfs_inode *ni) \
+{ \
+ return test_and_clear_bit(NI_##flag, &(ni)->state); \
+}
+
+/* Emit the ntfs inode bitops functions. */
+NINO_FNS(Dirty)
+TAS_NINO_FNS(Dirty)
+NINO_FNS(AttrList)
+NINO_FNS(AttrListNonResident)
+NINO_FNS(Attr)
+NINO_FNS(MstProtected)
+NINO_FNS(NonResident)
+NINO_FNS(IndexAllocPresent)
+NINO_FNS(Compressed)
+NINO_FNS(Encrypted)
+NINO_FNS(Sparse)
+NINO_FNS(SparseDisabled)
+NINO_FNS(TruncateFailed)
+
+/*
+ * The full structure containing a ntfs_inode and a vfs struct inode. Used for
+ * all real and fake inodes but not for extent inodes which lack the vfs struct
+ * inode.
+ */
+typedef struct {
+ ntfs_inode ntfs_inode;
+ struct inode vfs_inode; /* The vfs inode structure. */
+} big_ntfs_inode;
+
+/**
+ * NTFS_I - return the ntfs inode given a vfs inode
+ * @inode: VFS inode
+ *
+ * NTFS_I() returns the ntfs inode associated with the VFS @inode.
+ */
+static inline ntfs_inode *NTFS_I(struct inode *inode)
+{
+ return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode);
+}
+
+static inline struct inode *VFS_I(ntfs_inode *ni)
+{
+ return &((big_ntfs_inode *)ni)->vfs_inode;
+}
+
+/**
+ * ntfs_attr - ntfs in memory attribute structure
+ * @mft_no: mft record number of the base mft record of this attribute
+ * @name: Unicode name of the attribute (NULL if unnamed)
+ * @name_len: length of @name in Unicode characters (0 if unnamed)
+ * @type: attribute type (see layout.h)
+ *
+ * This structure exists only to provide a small structure for the
+ * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism.
+ *
+ * NOTE: Elements are ordered by size to make the structure as compact as
+ * possible on all architectures.
+ */
+typedef struct {
+ unsigned long mft_no;
+ ntfschar *name;
+ u32 name_len;
+ ATTR_TYPE type;
+} ntfs_attr;
+
+extern int ntfs_test_inode(struct inode *vi, void *data);
+
+extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
+extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
+ ntfschar *name, u32 name_len);
+extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
+ u32 name_len);
+
+extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
+extern void ntfs_free_big_inode(struct inode *inode);
+extern void ntfs_evict_big_inode(struct inode *vi);
+
+extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
+
+static inline void ntfs_init_big_inode(struct inode *vi)
+{
+ ntfs_inode *ni = NTFS_I(vi);
+
+ ntfs_debug("Entering.");
+ __ntfs_init_inode(vi->i_sb, ni);
+ ni->mft_no = vi->i_ino;
+}
+
+extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
+ unsigned long mft_no);
+extern void ntfs_clear_extent_inode(ntfs_inode *ni);
+
+extern int ntfs_read_inode_mount(struct inode *vi);
+
+extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
+
+#ifdef NTFS_RW
+
+extern int ntfs_truncate(struct inode *vi);
+extern void ntfs_truncate_vfs(struct inode *vi);
+
+extern int ntfs_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr);
+
+extern int __ntfs_write_inode(struct inode *vi, int sync);
+
+static inline void ntfs_commit_inode(struct inode *vi)
+{
+ if (!is_bad_inode(vi))
+ __ntfs_write_inode(vi, 1);
+ return;
+}
+
+#else
+
+static inline void ntfs_truncate_vfs(struct inode *vi) {}
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
new file mode 100644
index 000000000..5d4bf7a32
--- /dev/null
+++ b/fs/ntfs/layout.h
@@ -0,0 +1,2421 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
+ * project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#ifndef _LINUX_NTFS_LAYOUT_H
+#define _LINUX_NTFS_LAYOUT_H
+
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/list.h>
+#include <asm/byteorder.h>
+
+#include "types.h"
+
+/* The NTFS oem_id "NTFS " */
+#define magicNTFS cpu_to_le64(0x202020205346544eULL)
+
+/*
+ * Location of bootsector on partition:
+ * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition.
+ * On NT4 and above there is one backup copy of the boot sector to
+ * be found on the last sector of the partition (not normally accessible
+ * from within Windows as the bootsector contained number of sectors
+ * value is one less than the actual value!).
+ * On versions of NT 3.51 and earlier, the backup copy was located at
+ * number of sectors/2 (integer divide), i.e. in the middle of the volume.
+ */
+
+/*
+ * BIOS parameter block (bpb) structure.
+ */
+typedef struct {
+ le16 bytes_per_sector; /* Size of a sector in bytes. */
+ u8 sectors_per_cluster; /* Size of a cluster in sectors. */
+ le16 reserved_sectors; /* zero */
+ u8 fats; /* zero */
+ le16 root_entries; /* zero */
+ le16 sectors; /* zero */
+ u8 media_type; /* 0xf8 = hard disk */
+ le16 sectors_per_fat; /* zero */
+ le16 sectors_per_track; /* irrelevant */
+ le16 heads; /* irrelevant */
+ le32 hidden_sectors; /* zero */
+ le32 large_sectors; /* zero */
+} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK;
+
+/*
+ * NTFS boot sector structure.
+ */
+typedef struct {
+ u8 jump[3]; /* Irrelevant (jump to boot up code).*/
+ le64 oem_id; /* Magic "NTFS ". */
+ BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */
+ u8 unused[4]; /* zero, NTFS diskedit.exe states that
+ this is actually:
+ __u8 physical_drive; // 0x80
+ __u8 current_head; // zero
+ __u8 extended_boot_signature;
+ // 0x80
+ __u8 unused; // zero
+ */
+/*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives
+ maximum volume size of 2^63 sectors.
+ Assuming standard sector size of 512
+ bytes, the maximum byte size is
+ approx. 4.7x10^21 bytes. (-; */
+ sle64 mft_lcn; /* Cluster location of mft data. */
+ sle64 mftmirr_lcn; /* Cluster location of copy of mft. */
+ s8 clusters_per_mft_record; /* Mft record size in clusters. */
+ u8 reserved0[3]; /* zero */
+ s8 clusters_per_index_record; /* Index block size in clusters. */
+ u8 reserved1[3]; /* zero */
+ le64 volume_serial_number; /* Irrelevant (serial number). */
+ le32 checksum; /* Boot sector checksum. */
+/*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */
+ le16 end_of_sector_marker; /* End of bootsector magic. Always is
+ 0xaa55 in little endian. */
+/* sizeof() = 512 (0x200) bytes */
+} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR;
+
+/*
+ * Magic identifiers present at the beginning of all ntfs record containing
+ * records (like mft records for example).
+ */
+enum {
+ /* Found in $MFT/$DATA. */
+ magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
+ magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
+ magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
+
+ /* Found in $LogFile/$DATA. */
+ magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
+ magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
+
+ /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */
+ magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
+
+ /* Found in all ntfs record containing records. */
+ magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
+ transfer was detected. */
+ /*
+ * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
+ * thus not initialized. Page must be initialized before using it.
+ */
+ magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
+};
+
+typedef le32 NTFS_RECORD_TYPE;
+
+/*
+ * Generic magic comparison macros. Finally found a use for the ## preprocessor
+ * operator! (-8
+ */
+
+static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
+{
+ return (x == r);
+}
+#define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m)
+
+static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
+{
+ return (*p == r);
+}
+#define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m)
+
+/*
+ * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above.
+ */
+#define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) )
+#define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) )
+#define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) )
+#define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) )
+#define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) )
+#define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) )
+#define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) )
+#define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) )
+
+#define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) )
+#define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) )
+#define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) )
+#define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) )
+
+#define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) )
+#define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) )
+
+#define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) )
+#define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) )
+
+#define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) )
+#define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) )
+
+/*
+ * The Update Sequence Array (usa) is an array of the le16 values which belong
+ * to the end of each sector protected by the update sequence record in which
+ * this array is contained. Note that the first entry is the Update Sequence
+ * Number (usn), a cyclic counter of how many times the protected record has
+ * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All
+ * last le16's of each sector have to be equal to the usn (during reading) or
+ * are set to it (during writing). If they are not, an incomplete multi sector
+ * transfer has occurred when the data was written.
+ * The maximum size for the update sequence array is fixed to:
+ * maximum size = usa_ofs + (usa_count * 2) = 510 bytes
+ * The 510 bytes comes from the fact that the last le16 in the array has to
+ * (obviously) finish before the last le16 of the first 512-byte sector.
+ * This formula can be used as a consistency check in that usa_ofs +
+ * (usa_count * 2) has to be less than or equal to 510.
+ */
+typedef struct {
+ NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record
+ type and/or status. */
+ le16 usa_ofs; /* Offset to the Update Sequence Array (usa)
+ from the start of the ntfs record. */
+ le16 usa_count; /* Number of le16 sized entries in the usa
+ including the Update Sequence Number (usn),
+ thus the number of fixups is the usa_count
+ minus 1. */
+} __attribute__ ((__packed__)) NTFS_RECORD;
+
+/*
+ * System files mft record numbers. All these files are always marked as used
+ * in the bitmap attribute of the mft; presumably in order to avoid accidental
+ * allocation for random other mft records. Also, the sequence number for each
+ * of the system files is always equal to their mft record number and it is
+ * never modified.
+ */
+typedef enum {
+ FILE_MFT = 0, /* Master file table (mft). Data attribute
+ contains the entries and bitmap attribute
+ records which ones are in use (bit==1). */
+ FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records
+ in data attribute. If cluster size > 4kiB,
+ copy of first N mft records, with
+ N = cluster_size / mft_record_size. */
+ FILE_LogFile = 2, /* Journalling log in data attribute. */
+ FILE_Volume = 3, /* Volume name attribute and volume information
+ attribute (flags and ntfs version). Windows
+ refers to this file as volume DASD (Direct
+ Access Storage Device). */
+ FILE_AttrDef = 4, /* Array of attribute definitions in data
+ attribute. */
+ FILE_root = 5, /* Root directory. */
+ FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in
+ data attribute. */
+ FILE_Boot = 7, /* Boot sector (always at cluster 0) in data
+ attribute. */
+ FILE_BadClus = 8, /* Contains all bad clusters in the non-resident
+ data attribute. */
+ FILE_Secure = 9, /* Shared security descriptors in data attribute
+ and two indexes into the descriptors.
+ Appeared in Windows 2000. Before that, this
+ file was named $Quota but was unused. */
+ FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode
+ characters in data attribute. */
+ FILE_Extend = 11, /* Directory containing other system files (eg.
+ $ObjId, $Quota, $Reparse and $UsnJrnl). This
+ is new to NTFS3.0. */
+ FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */
+ FILE_reserved13 = 13,
+ FILE_reserved14 = 14,
+ FILE_reserved15 = 15,
+ FILE_first_user = 16, /* First user file, used as test limit for
+ whether to allow opening a file or not. */
+} NTFS_SYSTEM_FILES;
+
+/*
+ * These are the so far known MFT_RECORD_* flags (16-bit) which contain
+ * information about the mft record in which they are present.
+ */
+enum {
+ MFT_RECORD_IN_USE = cpu_to_le16(0x0001),
+ MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
+} __attribute__ ((__packed__));
+
+typedef le16 MFT_RECORD_FLAGS;
+
+/*
+ * mft references (aka file references or file record segment references) are
+ * used whenever a structure needs to refer to a record in the mft.
+ *
+ * A reference consists of a 48-bit index into the mft and a 16-bit sequence
+ * number used to detect stale references.
+ *
+ * For error reporting purposes we treat the 48-bit index as a signed quantity.
+ *
+ * The sequence number is a circular counter (skipping 0) describing how many
+ * times the referenced mft record has been (re)used. This has to match the
+ * sequence number of the mft record being referenced, otherwise the reference
+ * is considered stale and removed (FIXME: only ntfsck or the driver itself?).
+ *
+ * If the sequence number is zero it is assumed that no sequence number
+ * consistency checking should be performed.
+ *
+ * FIXME: Since inodes are 32-bit as of now, the driver needs to always check
+ * for high_part being 0 and if not either BUG(), cause a panic() or handle
+ * the situation in some other way. This shouldn't be a problem as a volume has
+ * to become HUGE in order to need more than 32-bits worth of mft records.
+ * Assuming the standard mft record size of 1kb only the records (never mind
+ * the non-resident attributes, etc.) would require 4Tb of space on their own
+ * for the first 32 bits worth of records. This is only if some strange person
+ * doesn't decide to foul play and make the mft sparse which would be a really
+ * horrible thing to do as it would trash our current driver implementation. )-:
+ * Do I hear screams "we want 64-bit inodes!" ?!? (-;
+ *
+ * FIXME: The mft zone is defined as the first 12% of the volume. This space is
+ * reserved so that the mft can grow contiguously and hence doesn't become
+ * fragmented. Volume free space includes the empty part of the mft zone and
+ * when the volume's free 88% are used up, the mft zone is shrunk by a factor
+ * of 2, thus making more space available for more files/data. This process is
+ * repeated every time there is no more free space except for the mft zone until
+ * there really is no more free space.
+ */
+
+/*
+ * Typedef the MFT_REF as a 64-bit value for easier handling.
+ * Also define two unpacking macros to get to the reference (MREF) and
+ * sequence number (MSEQNO) respectively.
+ * The _LE versions are to be applied on little endian MFT_REFs.
+ * Note: The _LE versions will return a CPU endian formatted value!
+ */
+#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
+#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
+
+typedef u64 MFT_REF;
+typedef le64 leMFT_REF;
+
+#define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \
+ ((MFT_REF)(m) & MFT_REF_MASK_CPU)))
+#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s))
+
+#define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU))
+#define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff))
+#define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU))
+#define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff))
+
+#define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false)
+#define ERR_MREF(x) ((u64)((s64)(x)))
+#define MREF_ERR(x) ((int)((s64)(x)))
+
+/*
+ * The mft record header present at the beginning of every record in the mft.
+ * This is followed by a sequence of variable length attribute records which
+ * is terminated by an attribute of type AT_END which is a truncated attribute
+ * in that it only consists of the attribute type code AT_END and none of the
+ * other members of the attribute structure are present.
+ */
+typedef struct {
+/*Ofs*/
+/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+ NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
+ le16 usa_ofs; /* See NTFS_RECORD definition above. */
+ le16 usa_count; /* See NTFS_RECORD definition above. */
+
+/* 8*/ le64 lsn; /* $LogFile sequence number for this record.
+ Changed every time the record is modified. */
+/* 16*/ le16 sequence_number; /* Number of times this mft record has been
+ reused. (See description for MFT_REF
+ above.) NOTE: The increment (skipping zero)
+ is done when the file is deleted. NOTE: If
+ this is zero it is left zero. */
+/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of
+ directory entries referencing this record.
+ NOTE: Only used in mft base records.
+ NOTE: When deleting a directory entry we
+ check the link_count and if it is 1 we
+ delete the file. Otherwise we delete the
+ FILE_NAME_ATTR being referenced by the
+ directory entry from the mft record and
+ decrement the link_count.
+ FIXME: Careful with Win32 + DOS names! */
+/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this
+ mft record from the start of the mft record.
+ NOTE: Must be aligned to 8-byte boundary. */
+/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
+ is deleted, the MFT_RECORD_IN_USE flag is
+ set to zero. */
+/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record.
+ NOTE: Must be aligned to 8-byte boundary. */
+/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft
+ record. This should be equal to the mft
+ record size. */
+/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
+ When it is not zero it is a mft reference
+ pointing to the base mft record to which
+ this record belongs (this is then used to
+ locate the attribute list attribute present
+ in the base record which describes this
+ extension record and hence might need
+ modification when the extension record
+ itself is modified, also locating the
+ attribute list also means finding the other
+ potential extents, belonging to the non-base
+ mft record). */
+/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
+ the next attribute added to this mft record.
+ NOTE: Incremented each time after it is used.
+ NOTE: Every time the mft record is reused
+ this number is set to zero. NOTE: The first
+ instance number is always 0. */
+/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */
+/* 42*/ le16 reserved; /* Reserved/alignment. */
+/* 44*/ le32 mft_record_number; /* Number of this mft record. */
+/* sizeof() = 48 bytes */
+/*
+ * When (re)using the mft record, we place the update sequence array at this
+ * offset, i.e. before we start with the attributes. This also makes sense,
+ * otherwise we could run into problems with the update sequence array
+ * containing in itself the last two bytes of a sector which would mean that
+ * multi sector transfer protection wouldn't work. As you can't protect data
+ * by overwriting it since you then can't get it back...
+ * When reading we obviously use the data from the ntfs record header.
+ */
+} __attribute__ ((__packed__)) MFT_RECORD;
+
+/* This is the version without the NTFS 3.1+ specific fields. */
+typedef struct {
+/*Ofs*/
+/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+ NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */
+ le16 usa_ofs; /* See NTFS_RECORD definition above. */
+ le16 usa_count; /* See NTFS_RECORD definition above. */
+
+/* 8*/ le64 lsn; /* $LogFile sequence number for this record.
+ Changed every time the record is modified. */
+/* 16*/ le16 sequence_number; /* Number of times this mft record has been
+ reused. (See description for MFT_REF
+ above.) NOTE: The increment (skipping zero)
+ is done when the file is deleted. NOTE: If
+ this is zero it is left zero. */
+/* 18*/ le16 link_count; /* Number of hard links, i.e. the number of
+ directory entries referencing this record.
+ NOTE: Only used in mft base records.
+ NOTE: When deleting a directory entry we
+ check the link_count and if it is 1 we
+ delete the file. Otherwise we delete the
+ FILE_NAME_ATTR being referenced by the
+ directory entry from the mft record and
+ decrement the link_count.
+ FIXME: Careful with Win32 + DOS names! */
+/* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this
+ mft record from the start of the mft record.
+ NOTE: Must be aligned to 8-byte boundary. */
+/* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file
+ is deleted, the MFT_RECORD_IN_USE flag is
+ set to zero. */
+/* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record.
+ NOTE: Must be aligned to 8-byte boundary. */
+/* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft
+ record. This should be equal to the mft
+ record size. */
+/* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records.
+ When it is not zero it is a mft reference
+ pointing to the base mft record to which
+ this record belongs (this is then used to
+ locate the attribute list attribute present
+ in the base record which describes this
+ extension record and hence might need
+ modification when the extension record
+ itself is modified, also locating the
+ attribute list also means finding the other
+ potential extents, belonging to the non-base
+ mft record). */
+/* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to
+ the next attribute added to this mft record.
+ NOTE: Incremented each time after it is used.
+ NOTE: Every time the mft record is reused
+ this number is set to zero. NOTE: The first
+ instance number is always 0. */
+/* sizeof() = 42 bytes */
+/*
+ * When (re)using the mft record, we place the update sequence array at this
+ * offset, i.e. before we start with the attributes. This also makes sense,
+ * otherwise we could run into problems with the update sequence array
+ * containing in itself the last two bytes of a sector which would mean that
+ * multi sector transfer protection wouldn't work. As you can't protect data
+ * by overwriting it since you then can't get it back...
+ * When reading we obviously use the data from the ntfs record header.
+ */
+} __attribute__ ((__packed__)) MFT_RECORD_OLD;
+
+/*
+ * System defined attributes (32-bit). Each attribute type has a corresponding
+ * attribute name (Unicode string of maximum 64 character length) as described
+ * by the attribute definitions present in the data attribute of the $AttrDef
+ * system file. On NTFS 3.0 volumes the names are just as the types are named
+ * in the below defines exchanging AT_ for the dollar sign ($). If that is not
+ * a revealing choice of symbol I do not know what is... (-;
+ */
+enum {
+ AT_UNUSED = cpu_to_le32( 0),
+ AT_STANDARD_INFORMATION = cpu_to_le32( 0x10),
+ AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20),
+ AT_FILE_NAME = cpu_to_le32( 0x30),
+ AT_OBJECT_ID = cpu_to_le32( 0x40),
+ AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50),
+ AT_VOLUME_NAME = cpu_to_le32( 0x60),
+ AT_VOLUME_INFORMATION = cpu_to_le32( 0x70),
+ AT_DATA = cpu_to_le32( 0x80),
+ AT_INDEX_ROOT = cpu_to_le32( 0x90),
+ AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0),
+ AT_BITMAP = cpu_to_le32( 0xb0),
+ AT_REPARSE_POINT = cpu_to_le32( 0xc0),
+ AT_EA_INFORMATION = cpu_to_le32( 0xd0),
+ AT_EA = cpu_to_le32( 0xe0),
+ AT_PROPERTY_SET = cpu_to_le32( 0xf0),
+ AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100),
+ AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000),
+ AT_END = cpu_to_le32(0xffffffff)
+};
+
+typedef le32 ATTR_TYPE;
+
+/*
+ * The collation rules for sorting views/indexes/etc (32-bit).
+ *
+ * COLLATION_BINARY - Collate by binary compare where the first byte is most
+ * significant.
+ * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary
+ * Unicode values, except that when a character can be uppercased, the
+ * upper case value collates before the lower case one.
+ * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation
+ * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea
+ * what the difference is. Perhaps the difference is that file names
+ * would treat some special characters in an odd way (see
+ * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[]
+ * for what I mean but COLLATION_UNICODE_STRING would not give any special
+ * treatment to any characters at all, but this is speculation.
+ * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key
+ * values. E.g. used for $SII index in FILE_Secure, which sorts by
+ * security_id (le32).
+ * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values.
+ * E.g. used for $O index in FILE_Extend/$Quota.
+ * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash
+ * values and second by ascending security_id values. E.g. used for $SDH
+ * index in FILE_Secure.
+ * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending
+ * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which
+ * sorts by object_id (16-byte), by splitting up the object_id in four
+ * le32 values and using them as individual keys. E.g. take the following
+ * two security_ids, stored as follows on disk:
+ * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59
+ * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45
+ * To compare them, they are split into four le32 values each, like so:
+ * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081
+ * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179
+ * Now, it is apparent why the 2nd object_id collates after the 1st: the
+ * first le32 value of the 1st object_id is less than the first le32 of
+ * the 2nd object_id. If the first le32 values of both object_ids were
+ * equal then the second le32 values would be compared, etc.
+ */
+enum {
+ COLLATION_BINARY = cpu_to_le32(0x00),
+ COLLATION_FILE_NAME = cpu_to_le32(0x01),
+ COLLATION_UNICODE_STRING = cpu_to_le32(0x02),
+ COLLATION_NTOFS_ULONG = cpu_to_le32(0x10),
+ COLLATION_NTOFS_SID = cpu_to_le32(0x11),
+ COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12),
+ COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13),
+};
+
+typedef le32 COLLATION_RULE;
+
+/*
+ * The flags (32-bit) describing attribute properties in the attribute
+ * definition structure. FIXME: This information is based on Regis's
+ * information and, according to him, it is not certain and probably
+ * incomplete. The INDEXABLE flag is fairly certainly correct as only the file
+ * name attribute has this flag set and this is the only attribute indexed in
+ * NT4.
+ */
+enum {
+ ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be
+ indexed. */
+ ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type
+ can be present multiple times in the
+ mft records of an inode. */
+ ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value
+ must contain at least one non-zero
+ byte. */
+ ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be
+ indexed and the attribute value must be
+ unique for the attribute type in all of
+ the mft records of an inode. */
+ ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be
+ named and the name must be unique for
+ the attribute type in all of the mft
+ records of an inode. */
+ ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be
+ resident. */
+ ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log
+ modifications to this attribute,
+ regardless of whether it is resident or
+ non-resident. Without this, only log
+ modifications if the attribute is
+ resident. */
+};
+
+typedef le32 ATTR_DEF_FLAGS;
+
+/*
+ * The data attribute of FILE_AttrDef contains a sequence of attribute
+ * definitions for the NTFS volume. With this, it is supposed to be safe for an
+ * older NTFS driver to mount a volume containing a newer NTFS version without
+ * damaging it (that's the theory. In practice it's: not damaging it too much).
+ * Entries are sorted by attribute type. The flags describe whether the
+ * attribute can be resident/non-resident and possibly other things, but the
+ * actual bits are unknown.
+ */
+typedef struct {
+/*hex ofs*/
+/* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero
+ terminated. */
+/* 80*/ ATTR_TYPE type; /* Type of the attribute. */
+/* 84*/ le32 display_rule; /* Default display rule.
+ FIXME: What does it mean? (AIA) */
+/* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */
+/* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */
+/* 90*/ sle64 min_size; /* Optional minimum attribute size. */
+/* 98*/ sle64 max_size; /* Maximum size of attribute. */
+/* sizeof() = 0xa0 or 160 bytes */
+} __attribute__ ((__packed__)) ATTR_DEF;
+
+/*
+ * Attribute flags (16-bit).
+ */
+enum {
+ ATTR_IS_COMPRESSED = cpu_to_le16(0x0001),
+ ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
+ mask. Also, first
+ illegal value. */
+ ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000),
+ ATTR_IS_SPARSE = cpu_to_le16(0x8000),
+} __attribute__ ((__packed__));
+
+typedef le16 ATTR_FLAGS;
+
+/*
+ * Attribute compression.
+ *
+ * Only the data attribute is ever compressed in the current ntfs driver in
+ * Windows. Further, compression is only applied when the data attribute is
+ * non-resident. Finally, to use compression, the maximum allowed cluster size
+ * on a volume is 4kib.
+ *
+ * The compression method is based on independently compressing blocks of X
+ * clusters, where X is determined from the compression_unit value found in the
+ * non-resident attribute record header (more precisely: X = 2^compression_unit
+ * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4).
+ *
+ * There are three different cases of how a compression block of X clusters
+ * can be stored:
+ *
+ * 1) The data in the block is all zero (a sparse block):
+ * This is stored as a sparse block in the runlist, i.e. the runlist
+ * entry has length = X and lcn = -1. The mapping pairs array actually
+ * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at
+ * all, which is then interpreted by the driver as lcn = -1.
+ * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then
+ * the same principles apply as above, except that the length is not
+ * restricted to being any particular value.
+ *
+ * 2) The data in the block is not compressed:
+ * This happens when compression doesn't reduce the size of the block
+ * in clusters. I.e. if compression has a small effect so that the
+ * compressed data still occupies X clusters, then the uncompressed data
+ * is stored in the block.
+ * This case is recognised by the fact that the runlist entry has
+ * length = X and lcn >= 0. The mapping pairs array stores this as
+ * normal with a run length of X and some specific delta_lcn, i.e.
+ * delta_lcn has to be present.
+ *
+ * 3) The data in the block is compressed:
+ * The common case. This case is recognised by the fact that the run
+ * list entry has length L < X and lcn >= 0. The mapping pairs array
+ * stores this as normal with a run length of X and some specific
+ * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is
+ * immediately followed by a sparse entry with length = X - L and
+ * lcn = -1. The latter entry is to make up the vcn counting to the
+ * full compression block size X.
+ *
+ * In fact, life is more complicated because adjacent entries of the same type
+ * can be coalesced. This means that one has to keep track of the number of
+ * clusters handled and work on a basis of X clusters at a time being one
+ * block. An example: if length L > X this means that this particular runlist
+ * entry contains a block of length X and part of one or more blocks of length
+ * L - X. Another example: if length L < X, this does not necessarily mean that
+ * the block is compressed as it might be that the lcn changes inside the block
+ * and hence the following runlist entry describes the continuation of the
+ * potentially compressed block. The block would be compressed if the
+ * following runlist entry describes at least X - L sparse clusters, thus
+ * making up the compression block length as described in point 3 above. (Of
+ * course, there can be several runlist entries with small lengths so that the
+ * sparse entry does not follow the first data containing entry with
+ * length < X.)
+ *
+ * NOTE: At the end of the compressed attribute value, there most likely is not
+ * just the right amount of data to make up a compression block, thus this data
+ * is not even attempted to be compressed. It is just stored as is, unless
+ * the number of clusters it occupies is reduced when compressed in which case
+ * it is stored as a compressed compression block, complete with sparse
+ * clusters at the end.
+ */
+
+/*
+ * Flags of resident attributes (8-bit).
+ */
+enum {
+ RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index
+ (has implications for deleting and
+ modifying the attribute). */
+} __attribute__ ((__packed__));
+
+typedef u8 RESIDENT_ATTR_FLAGS;
+
+/*
+ * Attribute record header. Always aligned to 8-byte boundary.
+ */
+typedef struct {
+/*Ofs*/
+/* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */
+/* 4*/ le32 length; /* Byte size of the resident part of the
+ attribute (aligned to 8-byte boundary).
+ Used to get to the next attribute. */
+/* 8*/ u8 non_resident; /* If 0, attribute is resident.
+ If 1, attribute is non-resident. */
+/* 9*/ u8 name_length; /* Unicode character size of name of attribute.
+ 0 if unnamed. */
+/* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the
+ beginning of the name from the attribute
+ record. Note that the name is stored as a
+ Unicode string. When creating, place offset
+ just at the end of the record header. Then,
+ follow with attribute value or mapping pairs
+ array, resident and non-resident attributes
+ respectively, aligning to an 8-byte
+ boundary. */
+/* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */
+/* 14*/ le16 instance; /* The instance of this attribute record. This
+ number is unique within this mft record (see
+ MFT_RECORD/next_attribute_instance notes in
+ mft.h for more details). */
+/* 16*/ union {
+ /* Resident attributes. */
+ struct {
+/* 16 */ le32 value_length;/* Byte size of attribute value. */
+/* 20 */ le16 value_offset;/* Byte offset of the attribute
+ value from the start of the
+ attribute record. When creating,
+ align to 8-byte boundary if we
+ have a name present as this might
+ not have a length of a multiple
+ of 8-bytes. */
+/* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */
+/* 23 */ s8 reserved; /* Reserved/alignment to 8-byte
+ boundary. */
+ } __attribute__ ((__packed__)) resident;
+ /* Non-resident attributes. */
+ struct {
+/* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number
+ for this portion of the attribute value or
+ 0 if this is the only extent (usually the
+ case). - Only when an attribute list is used
+ does lowest_vcn != 0 ever occur. */
+/* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of
+ the attribute value. - Usually there is only one
+ portion, so this usually equals the attribute
+ value size in clusters minus 1. Can be -1 for
+ zero length files. Can be 0 for "single extent"
+ attributes. */
+/* 32*/ le16 mapping_pairs_offset; /* Byte offset from the
+ beginning of the structure to the mapping pairs
+ array which contains the mappings between the
+ vcns and the logical cluster numbers (lcns).
+ When creating, place this at the end of this
+ record header aligned to 8-byte boundary. */
+/* 34*/ u8 compression_unit; /* The compression unit expressed
+ as the log to the base 2 of the number of
+ clusters in a compression unit. 0 means not
+ compressed. (This effectively limits the
+ compression unit size to be a power of two
+ clusters.) WinNT4 only uses a value of 4.
+ Sparse files have this set to 0 on XPSP2. */
+/* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */
+/* The sizes below are only used when lowest_vcn is zero, as otherwise it would
+ be difficult to keep them up-to-date.*/
+/* 40*/ sle64 allocated_size; /* Byte size of disk space
+ allocated to hold the attribute value. Always
+ is a multiple of the cluster size. When a file
+ is compressed, this field is a multiple of the
+ compression block size (2^compression_unit) and
+ it represents the logically allocated space
+ rather than the actual on disk usage. For this
+ use the compressed_size (see below). */
+/* 48*/ sle64 data_size; /* Byte size of the attribute
+ value. Can be larger than allocated_size if
+ attribute value is compressed or sparse. */
+/* 56*/ sle64 initialized_size; /* Byte size of initialized
+ portion of the attribute value. Usually equals
+ data_size. */
+/* sizeof(uncompressed attr) = 64*/
+/* 64*/ sle64 compressed_size; /* Byte size of the attribute
+ value after compression. Only present when
+ compressed or sparse. Always is a multiple of
+ the cluster size. Represents the actual amount
+ of disk space being used on the disk. */
+/* sizeof(compressed attr) = 72*/
+ } __attribute__ ((__packed__)) non_resident;
+ } __attribute__ ((__packed__)) data;
+} __attribute__ ((__packed__)) ATTR_RECORD;
+
+typedef ATTR_RECORD ATTR_REC;
+
+/*
+ * File attribute flags (32-bit) appearing in the file_attributes fields of the
+ * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR
+ * attributes of MFT_RECORDs and directory index entries.
+ *
+ * All of the below flags appear in the directory index entries but only some
+ * appear in the STANDARD_INFORMATION attribute whilst only some others appear
+ * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the
+ * flags appear in all of the above.
+ */
+enum {
+ FILE_ATTR_READONLY = cpu_to_le32(0x00000001),
+ FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002),
+ FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004),
+ /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */
+
+ FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010),
+ /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is
+ reserved for the DOS SUBDIRECTORY flag. */
+ FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020),
+ FILE_ATTR_DEVICE = cpu_to_le32(0x00000040),
+ FILE_ATTR_NORMAL = cpu_to_le32(0x00000080),
+
+ FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100),
+ FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200),
+ FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400),
+ FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800),
+
+ FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000),
+ FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000),
+ FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000),
+
+ FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7),
+ /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
+ FILE_ATTR_DEVICE and preserves everything else. This mask is used
+ to obtain all flags that are valid for reading. */
+ FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7),
+ /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
+ F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
+ F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask
+ is used to obtain all flags that are valid for setting. */
+ /*
+ * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
+ * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
+ * attribute of an mft record.
+ */
+ FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000),
+ /* Note, this is a copy of the corresponding bit from the mft record,
+ telling us whether this is a directory or not, i.e. whether it has
+ an index root attribute or not. */
+ FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000),
+ /* Note, this is a copy of the corresponding bit from the mft record,
+ telling us whether this file has a view index present (eg. object id
+ index, quota index, one of the security indexes or the encrypting
+ filesystem related indexes). */
+};
+
+typedef le32 FILE_ATTR_FLAGS;
+
+/*
+ * NOTE on times in NTFS: All times are in MS standard time format, i.e. they
+ * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00
+ * universal coordinated time (UTC). (In Linux time starts 1st January 1970,
+ * 00:00:00 UTC and is stored as the number of 1-second intervals since then.)
+ */
+
+/*
+ * Attribute: Standard information (0x10).
+ *
+ * NOTE: Always resident.
+ * NOTE: Present in all base file records on a volume.
+ * NOTE: There is conflicting information about the meaning of each of the time
+ * fields but the meaning as defined below has been verified to be
+ * correct by practical experimentation on Windows NT4 SP6a and is hence
+ * assumed to be the one and only correct interpretation.
+ */
+typedef struct {
+/*Ofs*/
+/* 0*/ sle64 creation_time; /* Time file was created. Updated when
+ a filename is changed(?). */
+/* 8*/ sle64 last_data_change_time; /* Time the data attribute was last
+ modified. */
+/* 16*/ sle64 last_mft_change_time; /* Time this mft record was last
+ modified. */
+/* 24*/ sle64 last_access_time; /* Approximate time when the file was
+ last accessed (obviously this is not
+ updated on read-only volumes). In
+ Windows this is only updated when
+ accessed if some time delta has
+ passed since the last update. Also,
+ last access time updates can be
+ disabled altogether for speed. */
+/* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
+/* 36*/ union {
+ /* NTFS 1.2 */
+ struct {
+ /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte
+ boundary. */
+ } __attribute__ ((__packed__)) v1;
+ /* sizeof() = 48 bytes */
+ /* NTFS 3.x */
+ struct {
+/*
+ * If a volume has been upgraded from a previous NTFS version, then these
+ * fields are present only if the file has been accessed since the upgrade.
+ * Recognize the difference by comparing the length of the resident attribute
+ * value. If it is 48, then the following fields are missing. If it is 72 then
+ * the fields are present. Maybe just check like this:
+ * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) {
+ * Assume NTFS 1.2- format.
+ * If (volume version is 3.x)
+ * Upgrade attribute to NTFS 3.x format.
+ * else
+ * Use NTFS 1.2- format for access.
+ * } else
+ * Use NTFS 3.x format for access.
+ * Only problem is that it might be legal to set the length of the value to
+ * arbitrarily large values thus spoiling this check. - But chkdsk probably
+ * views that as a corruption, assuming that it behaves like this for all
+ * attributes.
+ */
+ /* 36*/ le32 maximum_versions; /* Maximum allowed versions for
+ file. Zero if version numbering is disabled. */
+ /* 40*/ le32 version_number; /* This file's version (if any).
+ Set to zero if maximum_versions is zero. */
+ /* 44*/ le32 class_id; /* Class id from bidirectional
+ class id index (?). */
+ /* 48*/ le32 owner_id; /* Owner_id of the user owning
+ the file. Translate via $Q index in FILE_Extend
+ /$Quota to the quota control entry for the user
+ owning the file. Zero if quotas are disabled. */
+ /* 52*/ le32 security_id; /* Security_id for the file.
+ Translate via $SII index and $SDS data stream
+ in FILE_Secure to the security descriptor. */
+ /* 56*/ le64 quota_charged; /* Byte size of the charge to
+ the quota for all streams of the file. Note: Is
+ zero if quotas are disabled. */
+ /* 64*/ leUSN usn; /* Last update sequence number
+ of the file. This is a direct index into the
+ transaction log file ($UsnJrnl). It is zero if
+ the usn journal is disabled or this file has
+ not been subject to logging yet. See usnjrnl.h
+ for details. */
+ } __attribute__ ((__packed__)) v3;
+ /* sizeof() = 72 bytes (NTFS 3.x) */
+ } __attribute__ ((__packed__)) ver;
+} __attribute__ ((__packed__)) STANDARD_INFORMATION;
+
+/*
+ * Attribute: Attribute list (0x20).
+ *
+ * - Can be either resident or non-resident.
+ * - Value consists of a sequence of variable length, 8-byte aligned,
+ * ATTR_LIST_ENTRY records.
+ * - The list is not terminated by anything at all! The only way to know when
+ * the end is reached is to keep track of the current offset and compare it to
+ * the attribute value size.
+ * - The attribute list attribute contains one entry for each attribute of
+ * the file in which the list is located, except for the list attribute
+ * itself. The list is sorted: first by attribute type, second by attribute
+ * name (if present), third by instance number. The extents of one
+ * non-resident attribute (if present) immediately follow after the initial
+ * extent. They are ordered by lowest_vcn and have their instace set to zero.
+ * It is not allowed to have two attributes with all sorting keys equal.
+ * - Further restrictions:
+ * - If not resident, the vcn to lcn mapping array has to fit inside the
+ * base mft record.
+ * - The attribute list attribute value has a maximum size of 256kb. This
+ * is imposed by the Windows cache manager.
+ * - Attribute lists are only used when the attributes of mft record do not
+ * fit inside the mft record despite all attributes (that can be made
+ * non-resident) having been made non-resident. This can happen e.g. when:
+ * - File has a large number of hard links (lots of file name
+ * attributes present).
+ * - The mapping pairs array of some non-resident attribute becomes so
+ * large due to fragmentation that it overflows the mft record.
+ * - The security descriptor is very complex (not applicable to
+ * NTFS 3.0 volumes).
+ * - There are many named streams.
+ */
+typedef struct {
+/*Ofs*/
+/* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */
+/* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */
+/* 6*/ u8 name_length; /* Size in Unicode chars of the name of the
+ attribute or 0 if unnamed. */
+/* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name
+ (always set this to where the name would
+ start even if unnamed). */
+/* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion
+ of the attribute value. This is usually 0. It
+ is non-zero for the case where one attribute
+ does not fit into one mft record and thus
+ several mft records are allocated to hold
+ this attribute. In the latter case, each mft
+ record holds one extent of the attribute and
+ there is one attribute list entry for each
+ extent. NOTE: This is DEFINITELY a signed
+ value! The windows driver uses cmp, followed
+ by jg when comparing this, thus it treats it
+ as signed. */
+/* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding
+ the ATTR_RECORD for this portion of the
+ attribute value. */
+/* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the
+ attribute being referenced; otherwise 0. */
+/* 26*/ ntfschar name[0]; /* Use when creating only. When reading use
+ name_offset to determine the location of the
+ name. */
+/* sizeof() = 26 + (attribute_name_length * 2) bytes */
+} __attribute__ ((__packed__)) ATTR_LIST_ENTRY;
+
+/*
+ * The maximum allowed length for a file name.
+ */
+#define MAXIMUM_FILE_NAME_LENGTH 255
+
+/*
+ * Possible namespaces for filenames in ntfs (8-bit).
+ */
+enum {
+ FILE_NAME_POSIX = 0x00,
+ /* This is the largest namespace. It is case sensitive and allows all
+ Unicode characters except for: '\0' and '/'. Beware that in
+ WinNT/2k/2003 by default files which eg have the same name except
+ for their case will not be distinguished by the standard utilities
+ and thus a "del filename" will delete both "filename" and "fileName"
+ without warning. However if for example Services For Unix (SFU) are
+ installed and the case sensitive option was enabled at installation
+ time, then you can create/access/delete such files.
+ Note that even SFU places restrictions on the filenames beyond the
+ '\0' and '/' and in particular the following set of characters is
+ not allowed: '"', '/', '<', '>', '\'. All other characters,
+ including the ones no allowed in WIN32 namespace are allowed.
+ Tested with SFU 3.5 (this is now free) running on Windows XP. */
+ FILE_NAME_WIN32 = 0x01,
+ /* The standard WinNT/2k NTFS long filenames. Case insensitive. All
+ Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
+ and '|'. Further, names cannot end with a '.' or a space. */
+ FILE_NAME_DOS = 0x02,
+ /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit
+ characters greater space, except: '"', '*', '+', ',', '/', ':', ';',
+ '<', '=', '>', '?', and '\'. */
+ FILE_NAME_WIN32_AND_DOS = 0x03,
+ /* 3 means that both the Win32 and the DOS filenames are identical and
+ hence have been saved in this single filename record. */
+} __attribute__ ((__packed__));
+
+typedef u8 FILE_NAME_TYPE_FLAGS;
+
+/*
+ * Attribute: Filename (0x30).
+ *
+ * NOTE: Always resident.
+ * NOTE: All fields, except the parent_directory, are only updated when the
+ * filename is changed. Until then, they just become out of sync with
+ * reality and the more up to date values are present in the standard
+ * information attribute.
+ * NOTE: There is conflicting information about the meaning of each of the time
+ * fields but the meaning as defined below has been verified to be
+ * correct by practical experimentation on Windows NT4 SP6a and is hence
+ * assumed to be the one and only correct interpretation.
+ */
+typedef struct {
+/*hex ofs*/
+/* 0*/ leMFT_REF parent_directory; /* Directory this filename is
+ referenced from. */
+/* 8*/ sle64 creation_time; /* Time file was created. */
+/* 10*/ sle64 last_data_change_time; /* Time the data attribute was last
+ modified. */
+/* 18*/ sle64 last_mft_change_time; /* Time this mft record was last
+ modified. */
+/* 20*/ sle64 last_access_time; /* Time this mft record was last
+ accessed. */
+/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space
+ for the unnamed data attribute. So
+ for normal $DATA, this is the
+ allocated_size from the unnamed
+ $DATA attribute and for compressed
+ and/or sparse $DATA, this is the
+ compressed_size from the unnamed
+ $DATA attribute. For a directory or
+ other inode without an unnamed $DATA
+ attribute, this is always 0. NOTE:
+ This is a multiple of the cluster
+ size. */
+/* 30*/ sle64 data_size; /* Byte size of actual data in unnamed
+ data attribute. For a directory or
+ other inode without an unnamed $DATA
+ attribute, this is always 0. */
+/* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
+/* 3c*/ union {
+ /* 3c*/ struct {
+ /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to
+ pack the extended attributes
+ (EAs), if such are present.*/
+ /* 3e*/ le16 reserved; /* Reserved for alignment. */
+ } __attribute__ ((__packed__)) ea;
+ /* 3c*/ struct {
+ /* 3c*/ le32 reparse_point_tag; /* Type of reparse point,
+ present only in reparse
+ points and only if there are
+ no EAs. */
+ } __attribute__ ((__packed__)) rp;
+ } __attribute__ ((__packed__)) type;
+/* 40*/ u8 file_name_length; /* Length of file name in
+ (Unicode) characters. */
+/* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/
+/* 42*/ ntfschar file_name[0]; /* File name in Unicode. */
+} __attribute__ ((__packed__)) FILE_NAME_ATTR;
+
+/*
+ * GUID structures store globally unique identifiers (GUID). A GUID is a
+ * 128-bit value consisting of one group of eight hexadecimal digits, followed
+ * by three groups of four hexadecimal digits each, followed by one group of
+ * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the
+ * distributed computing environment (DCE) universally unique identifier (UUID).
+ * Example of a GUID:
+ * 1F010768-5A73-BC91-0010A52216A7
+ */
+typedef struct {
+ le32 data1; /* The first eight hexadecimal digits of the GUID. */
+ le16 data2; /* The first group of four hexadecimal digits. */
+ le16 data3; /* The second group of four hexadecimal digits. */
+ u8 data4[8]; /* The first two bytes are the third group of four
+ hexadecimal digits. The remaining six bytes are the
+ final 12 hexadecimal digits. */
+} __attribute__ ((__packed__)) GUID;
+
+/*
+ * FILE_Extend/$ObjId contains an index named $O. This index contains all
+ * object_ids present on the volume as the index keys and the corresponding
+ * mft_record numbers as the index entry data parts. The data part (defined
+ * below) also contains three other object_ids:
+ * birth_volume_id - object_id of FILE_Volume on which the file was first
+ * created. Optional (i.e. can be zero).
+ * birth_object_id - object_id of file when it was first created. Usually
+ * equals the object_id. Optional (i.e. can be zero).
+ * domain_id - Reserved (always zero).
+ */
+typedef struct {
+ leMFT_REF mft_reference;/* Mft record containing the object_id in
+ the index entry key. */
+ union {
+ struct {
+ GUID birth_volume_id;
+ GUID birth_object_id;
+ GUID domain_id;
+ } __attribute__ ((__packed__)) origin;
+ u8 extended_info[48];
+ } __attribute__ ((__packed__)) opt;
+} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA;
+
+/*
+ * Attribute: Object id (NTFS 3.0+) (0x40).
+ *
+ * NOTE: Always resident.
+ */
+typedef struct {
+ GUID object_id; /* Unique id assigned to the
+ file.*/
+ /* The following fields are optional. The attribute value size is 16
+ bytes, i.e. sizeof(GUID), if these are not present at all. Note,
+ the entries can be present but one or more (or all) can be zero
+ meaning that that particular value(s) is(are) not defined. */
+ union {
+ struct {
+ GUID birth_volume_id; /* Unique id of volume on which
+ the file was first created.*/
+ GUID birth_object_id; /* Unique id of file when it was
+ first created. */
+ GUID domain_id; /* Reserved, zero. */
+ } __attribute__ ((__packed__)) origin;
+ u8 extended_info[48];
+ } __attribute__ ((__packed__)) opt;
+} __attribute__ ((__packed__)) OBJECT_ID_ATTR;
+
+/*
+ * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in
+ * the SID structure (see below).
+ */
+//typedef enum { /* SID string prefix. */
+// SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */
+// SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */
+// SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */
+// SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */
+// SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */
+// SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */
+//} IDENTIFIER_AUTHORITIES;
+
+/*
+ * These relative identifiers (RIDs) are used with the above identifier
+ * authorities to make up universal well-known SIDs.
+ *
+ * Note: The relative identifier (RID) refers to the portion of a SID, which
+ * identifies a user or group in relation to the authority that issued the SID.
+ * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is
+ * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and
+ * the relative identifier SECURITY_CREATOR_OWNER_RID (0).
+ */
+typedef enum { /* Identifier authority. */
+ SECURITY_NULL_RID = 0, /* S-1-0 */
+ SECURITY_WORLD_RID = 0, /* S-1-1 */
+ SECURITY_LOCAL_RID = 0, /* S-1-2 */
+
+ SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */
+ SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */
+
+ SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */
+ SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */
+
+ SECURITY_DIALUP_RID = 1,
+ SECURITY_NETWORK_RID = 2,
+ SECURITY_BATCH_RID = 3,
+ SECURITY_INTERACTIVE_RID = 4,
+ SECURITY_SERVICE_RID = 6,
+ SECURITY_ANONYMOUS_LOGON_RID = 7,
+ SECURITY_PROXY_RID = 8,
+ SECURITY_ENTERPRISE_CONTROLLERS_RID=9,
+ SECURITY_SERVER_LOGON_RID = 9,
+ SECURITY_PRINCIPAL_SELF_RID = 0xa,
+ SECURITY_AUTHENTICATED_USER_RID = 0xb,
+ SECURITY_RESTRICTED_CODE_RID = 0xc,
+ SECURITY_TERMINAL_SERVER_RID = 0xd,
+
+ SECURITY_LOGON_IDS_RID = 5,
+ SECURITY_LOGON_IDS_RID_COUNT = 3,
+
+ SECURITY_LOCAL_SYSTEM_RID = 0x12,
+
+ SECURITY_NT_NON_UNIQUE = 0x15,
+
+ SECURITY_BUILTIN_DOMAIN_RID = 0x20,
+
+ /*
+ * Well-known domain relative sub-authority values (RIDs).
+ */
+
+ /* Users. */
+ DOMAIN_USER_RID_ADMIN = 0x1f4,
+ DOMAIN_USER_RID_GUEST = 0x1f5,
+ DOMAIN_USER_RID_KRBTGT = 0x1f6,
+
+ /* Groups. */
+ DOMAIN_GROUP_RID_ADMINS = 0x200,
+ DOMAIN_GROUP_RID_USERS = 0x201,
+ DOMAIN_GROUP_RID_GUESTS = 0x202,
+ DOMAIN_GROUP_RID_COMPUTERS = 0x203,
+ DOMAIN_GROUP_RID_CONTROLLERS = 0x204,
+ DOMAIN_GROUP_RID_CERT_ADMINS = 0x205,
+ DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206,
+ DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207,
+ DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208,
+
+ /* Aliases. */
+ DOMAIN_ALIAS_RID_ADMINS = 0x220,
+ DOMAIN_ALIAS_RID_USERS = 0x221,
+ DOMAIN_ALIAS_RID_GUESTS = 0x222,
+ DOMAIN_ALIAS_RID_POWER_USERS = 0x223,
+
+ DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224,
+ DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225,
+ DOMAIN_ALIAS_RID_PRINT_OPS = 0x226,
+ DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227,
+
+ DOMAIN_ALIAS_RID_REPLICATOR = 0x228,
+ DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229,
+ DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a,
+} RELATIVE_IDENTIFIERS;
+
+/*
+ * The universal well-known SIDs:
+ *
+ * NULL_SID S-1-0-0
+ * WORLD_SID S-1-1-0
+ * LOCAL_SID S-1-2-0
+ * CREATOR_OWNER_SID S-1-3-0
+ * CREATOR_GROUP_SID S-1-3-1
+ * CREATOR_OWNER_SERVER_SID S-1-3-2
+ * CREATOR_GROUP_SERVER_SID S-1-3-3
+ *
+ * (Non-unique IDs) S-1-4
+ *
+ * NT well-known SIDs:
+ *
+ * NT_AUTHORITY_SID S-1-5
+ * DIALUP_SID S-1-5-1
+ *
+ * NETWORD_SID S-1-5-2
+ * BATCH_SID S-1-5-3
+ * INTERACTIVE_SID S-1-5-4
+ * SERVICE_SID S-1-5-6
+ * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session)
+ * PROXY_SID S-1-5-8
+ * SERVER_LOGON_SID S-1-5-9 (aka domain controller account)
+ * SELF_SID S-1-5-10 (self RID)
+ * AUTHENTICATED_USER_SID S-1-5-11
+ * RESTRICTED_CODE_SID S-1-5-12 (running restricted code)
+ * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server)
+ *
+ * (Logon IDs) S-1-5-5-X-Y
+ *
+ * (NT non-unique IDs) S-1-5-0x15-...
+ *
+ * (Built-in domain) S-1-5-0x20
+ */
+
+/*
+ * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure.
+ *
+ * NOTE: This is stored as a big endian number, hence the high_part comes
+ * before the low_part.
+ */
+typedef union {
+ struct {
+ u16 high_part; /* High 16-bits. */
+ u32 low_part; /* Low 32-bits. */
+ } __attribute__ ((__packed__)) parts;
+ u8 value[6]; /* Value as individual bytes. */
+} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY;
+
+/*
+ * The SID structure is a variable-length structure used to uniquely identify
+ * users or groups. SID stands for security identifier.
+ *
+ * The standard textual representation of the SID is of the form:
+ * S-R-I-S-S...
+ * Where:
+ * - The first "S" is the literal character 'S' identifying the following
+ * digits as a SID.
+ * - R is the revision level of the SID expressed as a sequence of digits
+ * either in decimal or hexadecimal (if the later, prefixed by "0x").
+ * - I is the 48-bit identifier_authority, expressed as digits as R above.
+ * - S... is one or more sub_authority values, expressed as digits as above.
+ *
+ * Example SID; the domain-relative SID of the local Administrators group on
+ * Windows NT/2k:
+ * S-1-5-32-544
+ * This translates to a SID with:
+ * revision = 1,
+ * sub_authority_count = 2,
+ * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY
+ * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID
+ * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS
+ */
+typedef struct {
+ u8 revision;
+ u8 sub_authority_count;
+ SID_IDENTIFIER_AUTHORITY identifier_authority;
+ le32 sub_authority[1]; /* At least one sub_authority. */
+} __attribute__ ((__packed__)) SID;
+
+/*
+ * Current constants for SIDs.
+ */
+typedef enum {
+ SID_REVISION = 1, /* Current revision level. */
+ SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */
+ SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in
+ a future revision. */
+} SID_CONSTANTS;
+
+/*
+ * The predefined ACE types (8-bit, see below).
+ */
+enum {
+ ACCESS_MIN_MS_ACE_TYPE = 0,
+ ACCESS_ALLOWED_ACE_TYPE = 0,
+ ACCESS_DENIED_ACE_TYPE = 1,
+ SYSTEM_AUDIT_ACE_TYPE = 2,
+ SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */
+ ACCESS_MAX_MS_V2_ACE_TYPE = 3,
+
+ ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4,
+ ACCESS_MAX_MS_V3_ACE_TYPE = 4,
+
+ /* The following are Win2k only. */
+ ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5,
+ ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5,
+ ACCESS_DENIED_OBJECT_ACE_TYPE = 6,
+ SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7,
+ SYSTEM_ALARM_OBJECT_ACE_TYPE = 8,
+ ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8,
+
+ ACCESS_MAX_MS_V4_ACE_TYPE = 8,
+
+ /* This one is for WinNT/2k. */
+ ACCESS_MAX_MS_ACE_TYPE = 8,
+} __attribute__ ((__packed__));
+
+typedef u8 ACE_TYPES;
+
+/*
+ * The ACE flags (8-bit) for audit and inheritance (see below).
+ *
+ * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE
+ * types to indicate that a message is generated (in Windows!) for successful
+ * accesses.
+ *
+ * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types
+ * to indicate that a message is generated (in Windows!) for failed accesses.
+ */
+enum {
+ /* The inheritance flags. */
+ OBJECT_INHERIT_ACE = 0x01,
+ CONTAINER_INHERIT_ACE = 0x02,
+ NO_PROPAGATE_INHERIT_ACE = 0x04,
+ INHERIT_ONLY_ACE = 0x08,
+ INHERITED_ACE = 0x10, /* Win2k only. */
+ VALID_INHERIT_FLAGS = 0x1f,
+
+ /* The audit flags. */
+ SUCCESSFUL_ACCESS_ACE_FLAG = 0x40,
+ FAILED_ACCESS_ACE_FLAG = 0x80,
+} __attribute__ ((__packed__));
+
+typedef u8 ACE_FLAGS;
+
+/*
+ * An ACE is an access-control entry in an access-control list (ACL).
+ * An ACE defines access to an object for a specific user or group or defines
+ * the types of access that generate system-administration messages or alarms
+ * for a specific user or group. The user or group is identified by a security
+ * identifier (SID).
+ *
+ * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary),
+ * which specifies the type and size of the ACE. The format of the subsequent
+ * data depends on the ACE type.
+ */
+typedef struct {
+/*Ofs*/
+/* 0*/ ACE_TYPES type; /* Type of the ACE. */
+/* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. */
+/* 2*/ le16 size; /* Size in bytes of the ACE. */
+} __attribute__ ((__packed__)) ACE_HEADER;
+
+/*
+ * The access mask (32-bit). Defines the access rights.
+ *
+ * The specific rights (bits 0 to 15). These depend on the type of the object
+ * being secured by the ACE.
+ */
+enum {
+ /* Specific rights for files and directories are as follows: */
+
+ /* Right to read data from the file. (FILE) */
+ FILE_READ_DATA = cpu_to_le32(0x00000001),
+ /* Right to list contents of a directory. (DIRECTORY) */
+ FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001),
+
+ /* Right to write data to the file. (FILE) */
+ FILE_WRITE_DATA = cpu_to_le32(0x00000002),
+ /* Right to create a file in the directory. (DIRECTORY) */
+ FILE_ADD_FILE = cpu_to_le32(0x00000002),
+
+ /* Right to append data to the file. (FILE) */
+ FILE_APPEND_DATA = cpu_to_le32(0x00000004),
+ /* Right to create a subdirectory. (DIRECTORY) */
+ FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004),
+
+ /* Right to read extended attributes. (FILE/DIRECTORY) */
+ FILE_READ_EA = cpu_to_le32(0x00000008),
+
+ /* Right to write extended attributes. (FILE/DIRECTORY) */
+ FILE_WRITE_EA = cpu_to_le32(0x00000010),
+
+ /* Right to execute a file. (FILE) */
+ FILE_EXECUTE = cpu_to_le32(0x00000020),
+ /* Right to traverse the directory. (DIRECTORY) */
+ FILE_TRAVERSE = cpu_to_le32(0x00000020),
+
+ /*
+ * Right to delete a directory and all the files it contains (its
+ * children), even if the files are read-only. (DIRECTORY)
+ */
+ FILE_DELETE_CHILD = cpu_to_le32(0x00000040),
+
+ /* Right to read file attributes. (FILE/DIRECTORY) */
+ FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080),
+
+ /* Right to change file attributes. (FILE/DIRECTORY) */
+ FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100),
+
+ /*
+ * The standard rights (bits 16 to 23). These are independent of the
+ * type of object being secured.
+ */
+
+ /* Right to delete the object. */
+ DELETE = cpu_to_le32(0x00010000),
+
+ /*
+ * Right to read the information in the object's security descriptor,
+ * not including the information in the SACL, i.e. right to read the
+ * security descriptor and owner.
+ */
+ READ_CONTROL = cpu_to_le32(0x00020000),
+
+ /* Right to modify the DACL in the object's security descriptor. */
+ WRITE_DAC = cpu_to_le32(0x00040000),
+
+ /* Right to change the owner in the object's security descriptor. */
+ WRITE_OWNER = cpu_to_le32(0x00080000),
+
+ /*
+ * Right to use the object for synchronization. Enables a process to
+ * wait until the object is in the signalled state. Some object types
+ * do not support this access right.
+ */
+ SYNCHRONIZE = cpu_to_le32(0x00100000),
+
+ /*
+ * The following STANDARD_RIGHTS_* are combinations of the above for
+ * convenience and are defined by the Win32 API.
+ */
+
+ /* These are currently defined to READ_CONTROL. */
+ STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000),
+ STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000),
+ STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000),
+
+ /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
+ STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000),
+
+ /*
+ * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
+ * SYNCHRONIZE access.
+ */
+ STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000),
+
+ /*
+ * The access system ACL and maximum allowed access types (bits 24 to
+ * 25, bits 26 to 27 are reserved).
+ */
+ ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000),
+ MAXIMUM_ALLOWED = cpu_to_le32(0x02000000),
+
+ /*
+ * The generic rights (bits 28 to 31). These map onto the standard and
+ * specific rights.
+ */
+
+ /* Read, write, and execute access. */
+ GENERIC_ALL = cpu_to_le32(0x10000000),
+
+ /* Execute access. */
+ GENERIC_EXECUTE = cpu_to_le32(0x20000000),
+
+ /*
+ * Write access. For files, this maps onto:
+ * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA |
+ * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE
+ * For directories, the mapping has the same numerical value. See
+ * above for the descriptions of the rights granted.
+ */
+ GENERIC_WRITE = cpu_to_le32(0x40000000),
+
+ /*
+ * Read access. For files, this maps onto:
+ * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA |
+ * STANDARD_RIGHTS_READ | SYNCHRONIZE
+ * For directories, the mapping has the same numberical value. See
+ * above for the descriptions of the rights granted.
+ */
+ GENERIC_READ = cpu_to_le32(0x80000000),
+};
+
+typedef le32 ACCESS_MASK;
+
+/*
+ * The generic mapping array. Used to denote the mapping of each generic
+ * access right to a specific access mask.
+ *
+ * FIXME: What exactly is this and what is it for? (AIA)
+ */
+typedef struct {
+ ACCESS_MASK generic_read;
+ ACCESS_MASK generic_write;
+ ACCESS_MASK generic_execute;
+ ACCESS_MASK generic_all;
+} __attribute__ ((__packed__)) GENERIC_MAPPING;
+
+/*
+ * The predefined ACE type structures are as defined below.
+ */
+
+/*
+ * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE
+ */
+typedef struct {
+/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
+ ACE_TYPES type; /* Type of the ACE. */
+ ACE_FLAGS flags; /* Flags describing the ACE. */
+ le16 size; /* Size in bytes of the ACE. */
+/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */
+
+/* 8*/ SID sid; /* The SID associated with the ACE. */
+} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE,
+ SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE;
+
+/*
+ * The object ACE flags (32-bit).
+ */
+enum {
+ ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1),
+ ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2),
+};
+
+typedef le32 OBJECT_ACE_FLAGS;
+
+typedef struct {
+/* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
+ ACE_TYPES type; /* Type of the ACE. */
+ ACE_FLAGS flags; /* Flags describing the ACE. */
+ le16 size; /* Size in bytes of the ACE. */
+/* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */
+
+/* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */
+/* 12*/ GUID object_type;
+/* 28*/ GUID inherited_object_type;
+
+/* 44*/ SID sid; /* The SID associated with the ACE. */
+} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE,
+ ACCESS_DENIED_OBJECT_ACE,
+ SYSTEM_AUDIT_OBJECT_ACE,
+ SYSTEM_ALARM_OBJECT_ACE;
+
+/*
+ * An ACL is an access-control list (ACL).
+ * An ACL starts with an ACL header structure, which specifies the size of
+ * the ACL and the number of ACEs it contains. The ACL header is followed by
+ * zero or more access control entries (ACEs). The ACL as well as each ACE
+ * are aligned on 4-byte boundaries.
+ */
+typedef struct {
+ u8 revision; /* Revision of this ACL. */
+ u8 alignment1;
+ le16 size; /* Allocated space in bytes for ACL. Includes this
+ header, the ACEs and the remaining free space. */
+ le16 ace_count; /* Number of ACEs in the ACL. */
+ le16 alignment2;
+/* sizeof() = 8 bytes */
+} __attribute__ ((__packed__)) ACL;
+
+/*
+ * Current constants for ACLs.
+ */
+typedef enum {
+ /* Current revision. */
+ ACL_REVISION = 2,
+ ACL_REVISION_DS = 4,
+
+ /* History of revisions. */
+ ACL_REVISION1 = 1,
+ MIN_ACL_REVISION = 2,
+ ACL_REVISION2 = 2,
+ ACL_REVISION3 = 3,
+ ACL_REVISION4 = 4,
+ MAX_ACL_REVISION = 4,
+} ACL_CONSTANTS;
+
+/*
+ * The security descriptor control flags (16-bit).
+ *
+ * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID
+ * pointed to by the Owner field was provided by a defaulting mechanism
+ * rather than explicitly provided by the original provider of the
+ * security descriptor. This may affect the treatment of the SID with
+ * respect to inheritance of an owner.
+ *
+ * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
+ * the Group field was provided by a defaulting mechanism rather than
+ * explicitly provided by the original provider of the security
+ * descriptor. This may affect the treatment of the SID with respect to
+ * inheritance of a primary group.
+ *
+ * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
+ * descriptor contains a discretionary ACL. If this flag is set and the
+ * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is
+ * explicitly being specified.
+ *
+ * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
+ * pointed to by the Dacl field was provided by a defaulting mechanism
+ * rather than explicitly provided by the original provider of the
+ * security descriptor. This may affect the treatment of the ACL with
+ * respect to inheritance of an ACL. This flag is ignored if the
+ * DaclPresent flag is not set.
+ *
+ * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security
+ * descriptor contains a system ACL pointed to by the Sacl field. If this
+ * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then
+ * an empty (but present) ACL is being specified.
+ *
+ * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
+ * pointed to by the Sacl field was provided by a defaulting mechanism
+ * rather than explicitly provided by the original provider of the
+ * security descriptor. This may affect the treatment of the ACL with
+ * respect to inheritance of an ACL. This flag is ignored if the
+ * SaclPresent flag is not set.
+ *
+ * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
+ * descriptor is in self-relative form. In this form, all fields of the
+ * security descriptor are contiguous in memory and all pointer fields are
+ * expressed as offsets from the beginning of the security descriptor.
+ */
+enum {
+ SE_OWNER_DEFAULTED = cpu_to_le16(0x0001),
+ SE_GROUP_DEFAULTED = cpu_to_le16(0x0002),
+ SE_DACL_PRESENT = cpu_to_le16(0x0004),
+ SE_DACL_DEFAULTED = cpu_to_le16(0x0008),
+
+ SE_SACL_PRESENT = cpu_to_le16(0x0010),
+ SE_SACL_DEFAULTED = cpu_to_le16(0x0020),
+
+ SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100),
+ SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200),
+ SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400),
+ SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800),
+
+ SE_DACL_PROTECTED = cpu_to_le16(0x1000),
+ SE_SACL_PROTECTED = cpu_to_le16(0x2000),
+ SE_RM_CONTROL_VALID = cpu_to_le16(0x4000),
+ SE_SELF_RELATIVE = cpu_to_le16(0x8000)
+} __attribute__ ((__packed__));
+
+typedef le16 SECURITY_DESCRIPTOR_CONTROL;
+
+/*
+ * Self-relative security descriptor. Contains the owner and group SIDs as well
+ * as the sacl and dacl ACLs inside the security descriptor itself.
+ */
+typedef struct {
+ u8 revision; /* Revision level of the security descriptor. */
+ u8 alignment;
+ SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
+ the descriptor as well as the following fields. */
+ le32 owner; /* Byte offset to a SID representing an object's
+ owner. If this is NULL, no owner SID is present in
+ the descriptor. */
+ le32 group; /* Byte offset to a SID representing an object's
+ primary group. If this is NULL, no primary group
+ SID is present in the descriptor. */
+ le32 sacl; /* Byte offset to a system ACL. Only valid, if
+ SE_SACL_PRESENT is set in the control field. If
+ SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
+ is specified. */
+ le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if
+ SE_DACL_PRESENT is set in the control field. If
+ SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
+ (unconditionally granting access) is specified. */
+/* sizeof() = 0x14 bytes */
+} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE;
+
+/*
+ * Absolute security descriptor. Does not contain the owner and group SIDs, nor
+ * the sacl and dacl ACLs inside the security descriptor. Instead, it contains
+ * pointers to these structures in memory. Obviously, absolute security
+ * descriptors are only useful for in memory representations of security
+ * descriptors. On disk, a self-relative security descriptor is used.
+ */
+typedef struct {
+ u8 revision; /* Revision level of the security descriptor. */
+ u8 alignment;
+ SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
+ the descriptor as well as the following fields. */
+ SID *owner; /* Points to a SID representing an object's owner. If
+ this is NULL, no owner SID is present in the
+ descriptor. */
+ SID *group; /* Points to a SID representing an object's primary
+ group. If this is NULL, no primary group SID is
+ present in the descriptor. */
+ ACL *sacl; /* Points to a system ACL. Only valid, if
+ SE_SACL_PRESENT is set in the control field. If
+ SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
+ is specified. */
+ ACL *dacl; /* Points to a discretionary ACL. Only valid, if
+ SE_DACL_PRESENT is set in the control field. If
+ SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
+ (unconditionally granting access) is specified. */
+} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR;
+
+/*
+ * Current constants for security descriptors.
+ */
+typedef enum {
+ /* Current revision. */
+ SECURITY_DESCRIPTOR_REVISION = 1,
+ SECURITY_DESCRIPTOR_REVISION1 = 1,
+
+ /* The sizes of both the absolute and relative security descriptors is
+ the same as pointers, at least on ia32 architecture are 32-bit. */
+ SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR),
+} SECURITY_DESCRIPTOR_CONSTANTS;
+
+/*
+ * Attribute: Security descriptor (0x50). A standard self-relative security
+ * descriptor.
+ *
+ * NOTE: Can be resident or non-resident.
+ * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally
+ * in FILE_Secure and the correct descriptor is found using the security_id
+ * from the standard information attribute.
+ */
+typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR;
+
+/*
+ * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one
+ * referenced instance of each unique security descriptor is stored.
+ *
+ * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It
+ * does, however, contain two indexes ($SDH and $SII) as well as a named data
+ * stream ($SDS).
+ *
+ * Every unique security descriptor is assigned a unique security identifier
+ * (security_id, not to be confused with a SID). The security_id is unique for
+ * the NTFS volume and is used as an index into the $SII index, which maps
+ * security_ids to the security descriptor's storage location within the $SDS
+ * data attribute. The $SII index is sorted by ascending security_id.
+ *
+ * A simple hash is computed from each security descriptor. This hash is used
+ * as an index into the $SDH index, which maps security descriptor hashes to
+ * the security descriptor's storage location within the $SDS data attribute.
+ * The $SDH index is sorted by security descriptor hash and is stored in a B+
+ * tree. When searching $SDH (with the intent of determining whether or not a
+ * new security descriptor is already present in the $SDS data stream), if a
+ * matching hash is found, but the security descriptors do not match, the
+ * search in the $SDH index is continued, searching for a next matching hash.
+ *
+ * When a precise match is found, the security_id coresponding to the security
+ * descriptor in the $SDS attribute is read from the found $SDH index entry and
+ * is stored in the $STANDARD_INFORMATION attribute of the file/directory to
+ * which the security descriptor is being applied. The $STANDARD_INFORMATION
+ * attribute is present in all base mft records (i.e. in all files and
+ * directories).
+ *
+ * If a match is not found, the security descriptor is assigned a new unique
+ * security_id and is added to the $SDS data attribute. Then, entries
+ * referencing the this security descriptor in the $SDS data attribute are
+ * added to the $SDH and $SII indexes.
+ *
+ * Note: Entries are never deleted from FILE_Secure, even if nothing
+ * references an entry any more.
+ */
+
+/*
+ * This header precedes each security descriptor in the $SDS data stream.
+ * This is also the index entry data part of both the $SII and $SDH indexes.
+ */
+typedef struct {
+ le32 hash; /* Hash of the security descriptor. */
+ le32 security_id; /* The security_id assigned to the descriptor. */
+ le64 offset; /* Byte offset of this entry in the $SDS stream. */
+ le32 length; /* Size in bytes of this entry in $SDS stream. */
+} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER;
+
+/*
+ * The $SDS data stream contains the security descriptors, aligned on 16-byte
+ * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot
+ * cross 256kib boundaries (this restriction is imposed by the Windows cache
+ * manager). Each security descriptor is contained in a SDS_ENTRY structure.
+ * Also, each security descriptor is stored twice in the $SDS stream with a
+ * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
+ * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
+ * first copy of the security descriptor will be at offset 0x51d0 in the
+ * $SDS data stream and the second copy will be at offset 0x451d0.
+ */
+typedef struct {
+/*Ofs*/
+/* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like
+ unnamed structs. */
+ le32 hash; /* Hash of the security descriptor. */
+ le32 security_id; /* The security_id assigned to the descriptor. */
+ le64 offset; /* Byte offset of this entry in the $SDS stream. */
+ le32 length; /* Size in bytes of this entry in $SDS stream. */
+/* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security
+ descriptor. */
+} __attribute__ ((__packed__)) SDS_ENTRY;
+
+/*
+ * The index entry key used in the $SII index. The collation type is
+ * COLLATION_NTOFS_ULONG.
+ */
+typedef struct {
+ le32 security_id; /* The security_id assigned to the descriptor. */
+} __attribute__ ((__packed__)) SII_INDEX_KEY;
+
+/*
+ * The index entry key used in the $SDH index. The keys are sorted first by
+ * hash and then by security_id. The collation rule is
+ * COLLATION_NTOFS_SECURITY_HASH.
+ */
+typedef struct {
+ le32 hash; /* Hash of the security descriptor. */
+ le32 security_id; /* The security_id assigned to the descriptor. */
+} __attribute__ ((__packed__)) SDH_INDEX_KEY;
+
+/*
+ * Attribute: Volume name (0x60).
+ *
+ * NOTE: Always resident.
+ * NOTE: Present only in FILE_Volume.
+ */
+typedef struct {
+ ntfschar name[0]; /* The name of the volume in Unicode. */
+} __attribute__ ((__packed__)) VOLUME_NAME;
+
+/*
+ * Possible flags for the volume (16-bit).
+ */
+enum {
+ VOLUME_IS_DIRTY = cpu_to_le16(0x0001),
+ VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002),
+ VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004),
+ VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008),
+
+ VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010),
+ VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020),
+
+ VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000),
+ VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000),
+
+ VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f),
+
+ /* To make our life easier when checking if we must mount read-only. */
+ VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027),
+} __attribute__ ((__packed__));
+
+typedef le16 VOLUME_FLAGS;
+
+/*
+ * Attribute: Volume information (0x70).
+ *
+ * NOTE: Always resident.
+ * NOTE: Present only in FILE_Volume.
+ * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses
+ * NTFS 1.2. I haven't personally seen other values yet.
+ */
+typedef struct {
+ le64 reserved; /* Not used (yet?). */
+ u8 major_ver; /* Major version of the ntfs format. */
+ u8 minor_ver; /* Minor version of the ntfs format. */
+ VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */
+} __attribute__ ((__packed__)) VOLUME_INFORMATION;
+
+/*
+ * Attribute: Data attribute (0x80).
+ *
+ * NOTE: Can be resident or non-resident.
+ *
+ * Data contents of a file (i.e. the unnamed stream) or of a named stream.
+ */
+typedef struct {
+ u8 data[0]; /* The file's data contents. */
+} __attribute__ ((__packed__)) DATA_ATTR;
+
+/*
+ * Index header flags (8-bit).
+ */
+enum {
+ /*
+ * When index header is in an index root attribute:
+ */
+ SMALL_INDEX = 0, /* The index is small enough to fit inside the index
+ root attribute and there is no index allocation
+ attribute present. */
+ LARGE_INDEX = 1, /* The index is too large to fit in the index root
+ attribute and/or an index allocation attribute is
+ present. */
+ /*
+ * When index header is in an index block, i.e. is part of index
+ * allocation attribute:
+ */
+ LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes
+ branching off it. */
+ INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf
+ node. */
+ NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */
+} __attribute__ ((__packed__));
+
+typedef u8 INDEX_HEADER_FLAGS;
+
+/*
+ * This is the header for indexes, describing the INDEX_ENTRY records, which
+ * follow the INDEX_HEADER. Together the index header and the index entries
+ * make up a complete index.
+ *
+ * IMPORTANT NOTE: The offset, length and size structure members are counted
+ * relative to the start of the index header structure and not relative to the
+ * start of the index root or index allocation structures themselves.
+ */
+typedef struct {
+ le32 entries_offset; /* Byte offset to first INDEX_ENTRY
+ aligned to 8-byte boundary. */
+ le32 index_length; /* Data size of the index in bytes,
+ i.e. bytes used from allocated
+ size, aligned to 8-byte boundary. */
+ le32 allocated_size; /* Byte size of this index (block),
+ multiple of 8 bytes. */
+ /* NOTE: For the index root attribute, the above two numbers are always
+ equal, as the attribute is resident and it is resized as needed. In
+ the case of the index allocation attribute the attribute is not
+ resident and hence the allocated_size is a fixed value and must
+ equal the index_block_size specified by the INDEX_ROOT attribute
+ corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK
+ belongs to. */
+ INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */
+ u8 reserved[3]; /* Reserved/align to 8-byte boundary. */
+} __attribute__ ((__packed__)) INDEX_HEADER;
+
+/*
+ * Attribute: Index root (0x90).
+ *
+ * NOTE: Always resident.
+ *
+ * This is followed by a sequence of index entries (INDEX_ENTRY structures)
+ * as described by the index header.
+ *
+ * When a directory is small enough to fit inside the index root then this
+ * is the only attribute describing the directory. When the directory is too
+ * large to fit in the index root, on the other hand, two additional attributes
+ * are present: an index allocation attribute, containing sub-nodes of the B+
+ * directory tree (see below), and a bitmap attribute, describing which virtual
+ * cluster numbers (vcns) in the index allocation attribute are in use by an
+ * index block.
+ *
+ * NOTE: The root directory (FILE_root) contains an entry for itself. Other
+ * directories do not contain entries for themselves, though.
+ */
+typedef struct {
+ ATTR_TYPE type; /* Type of the indexed attribute. Is
+ $FILE_NAME for directories, zero
+ for view indexes. No other values
+ allowed. */
+ COLLATION_RULE collation_rule; /* Collation rule used to sort the
+ index entries. If type is $FILE_NAME,
+ this must be COLLATION_FILE_NAME. */
+ le32 index_block_size; /* Size of each index block in bytes (in
+ the index allocation attribute). */
+ u8 clusters_per_index_block; /* Cluster size of each index block (in
+ the index allocation attribute), when
+ an index block is >= than a cluster,
+ otherwise this will be the log of
+ the size (like how the encoding of
+ the mft record size and the index
+ record size found in the boot sector
+ work). Has to be a power of 2. */
+ u8 reserved[3]; /* Reserved/align to 8-byte boundary. */
+ INDEX_HEADER index; /* Index header describing the
+ following index entries. */
+} __attribute__ ((__packed__)) INDEX_ROOT;
+
+/*
+ * Attribute: Index allocation (0xa0).
+ *
+ * NOTE: Always non-resident (doesn't make sense to be resident anyway!).
+ *
+ * This is an array of index blocks. Each index block starts with an
+ * INDEX_BLOCK structure containing an index header, followed by a sequence of
+ * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER.
+ */
+typedef struct {
+/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+ NTFS_RECORD_TYPE magic; /* Magic is "INDX". */
+ le16 usa_ofs; /* See NTFS_RECORD definition. */
+ le16 usa_count; /* See NTFS_RECORD definition. */
+
+/* 8*/ sle64 lsn; /* $LogFile sequence number of the last
+ modification of this index block. */
+/* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block.
+ If the cluster_size on the volume is <= the
+ index_block_size of the directory,
+ index_block_vcn counts in units of clusters,
+ and in units of sectors otherwise. */
+/* 24*/ INDEX_HEADER index; /* Describes the following index entries. */
+/* sizeof()= 40 (0x28) bytes */
+/*
+ * When creating the index block, we place the update sequence array at this
+ * offset, i.e. before we start with the index entries. This also makes sense,
+ * otherwise we could run into problems with the update sequence array
+ * containing in itself the last two bytes of a sector which would mean that
+ * multi sector transfer protection wouldn't work. As you can't protect data
+ * by overwriting it since you then can't get it back...
+ * When reading use the data from the ntfs record header.
+ */
+} __attribute__ ((__packed__)) INDEX_BLOCK;
+
+typedef INDEX_BLOCK INDEX_ALLOCATION;
+
+/*
+ * The system file FILE_Extend/$Reparse contains an index named $R listing
+ * all reparse points on the volume. The index entry keys are as defined
+ * below. Note, that there is no index data associated with the index entries.
+ *
+ * The index entries are sorted by the index key file_id. The collation rule is
+ * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the
+ * primary key / is not a key at all. (AIA)
+ */
+typedef struct {
+ le32 reparse_tag; /* Reparse point type (inc. flags). */
+ leMFT_REF file_id; /* Mft record of the file containing the
+ reparse point attribute. */
+} __attribute__ ((__packed__)) REPARSE_INDEX_KEY;
+
+/*
+ * Quota flags (32-bit).
+ *
+ * The user quota flags. Names explain meaning.
+ */
+enum {
+ QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001),
+ QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002),
+ QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004),
+
+ QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007),
+ /* This is a bit mask for the user quota flags. */
+
+ /*
+ * These flags are only present in the quota defaults index entry, i.e.
+ * in the entry where owner_id = QUOTA_DEFAULTS_ID.
+ */
+ QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010),
+ QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020),
+ QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040),
+ QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080),
+
+ QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100),
+ QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200),
+ QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400),
+ QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800),
+};
+
+typedef le32 QUOTA_FLAGS;
+
+/*
+ * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas
+ * are on a per volume and per user basis.
+ *
+ * The $Q index contains one entry for each existing user_id on the volume. The
+ * index key is the user_id of the user/group owning this quota control entry,
+ * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the
+ * owner_id, is found in the standard information attribute. The collation rule
+ * for $Q is COLLATION_NTOFS_ULONG.
+ *
+ * The $O index contains one entry for each user/group who has been assigned
+ * a quota on that volume. The index key holds the SID of the user_id the
+ * entry belongs to, i.e. the owner_id. The collation rule for $O is
+ * COLLATION_NTOFS_SID.
+ *
+ * The $O index entry data is the user_id of the user corresponding to the SID.
+ * This user_id is used as an index into $Q to find the quota control entry
+ * associated with the SID.
+ *
+ * The $Q index entry data is the quota control entry and is defined below.
+ */
+typedef struct {
+ le32 version; /* Currently equals 2. */
+ QUOTA_FLAGS flags; /* Flags describing this quota entry. */
+ le64 bytes_used; /* How many bytes of the quota are in use. */
+ sle64 change_time; /* Last time this quota entry was changed. */
+ sle64 threshold; /* Soft quota (-1 if not limited). */
+ sle64 limit; /* Hard quota (-1 if not limited). */
+ sle64 exceeded_time; /* How long the soft quota has been exceeded. */
+ SID sid; /* The SID of the user/object associated with
+ this quota entry. Equals zero for the quota
+ defaults entry (and in fact on a WinXP
+ volume, it is not present at all). */
+} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY;
+
+/*
+ * Predefined owner_id values (32-bit).
+ */
+enum {
+ QUOTA_INVALID_ID = cpu_to_le32(0x00000000),
+ QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001),
+ QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100),
+};
+
+/*
+ * Current constants for quota control entries.
+ */
+typedef enum {
+ /* Current version. */
+ QUOTA_VERSION = 2,
+} QUOTA_CONTROL_ENTRY_CONSTANTS;
+
+/*
+ * Index entry flags (16-bit).
+ */
+enum {
+ INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
+ sub-node, i.e. a reference to an index block in form of
+ a virtual cluster number (see below). */
+ INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last
+ entry in an index block. The index entry does not
+ represent a file but it can point to a sub-node. */
+
+ INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
+ enum bit width to 16-bit. */
+} __attribute__ ((__packed__));
+
+typedef le16 INDEX_ENTRY_FLAGS;
+
+/*
+ * This the index entry header (see below).
+ */
+typedef struct {
+/* 0*/ union {
+ struct { /* Only valid when INDEX_ENTRY_END is not set. */
+ leMFT_REF indexed_file; /* The mft reference of the file
+ described by this index
+ entry. Used for directory
+ indexes. */
+ } __attribute__ ((__packed__)) dir;
+ struct { /* Used for views/indexes to find the entry's data. */
+ le16 data_offset; /* Data byte offset from this
+ INDEX_ENTRY. Follows the
+ index key. */
+ le16 data_length; /* Data length in bytes. */
+ le32 reservedV; /* Reserved (zero). */
+ } __attribute__ ((__packed__)) vi;
+ } __attribute__ ((__packed__)) data;
+/* 8*/ le16 length; /* Byte size of this index entry, multiple of
+ 8-bytes. */
+/* 10*/ le16 key_length; /* Byte size of the key value, which is in the
+ index entry. It follows field reserved. Not
+ multiple of 8-bytes. */
+/* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
+/* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */
+/* sizeof() = 16 bytes */
+} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER;
+
+/*
+ * This is an index entry. A sequence of such entries follows each INDEX_HEADER
+ * structure. Together they make up a complete index. The index follows either
+ * an index root attribute or an index allocation attribute.
+ *
+ * NOTE: Before NTFS 3.0 only filename attributes were indexed.
+ */
+typedef struct {
+/*Ofs*/
+/* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */
+ union {
+ struct { /* Only valid when INDEX_ENTRY_END is not set. */
+ leMFT_REF indexed_file; /* The mft reference of the file
+ described by this index
+ entry. Used for directory
+ indexes. */
+ } __attribute__ ((__packed__)) dir;
+ struct { /* Used for views/indexes to find the entry's data. */
+ le16 data_offset; /* Data byte offset from this
+ INDEX_ENTRY. Follows the
+ index key. */
+ le16 data_length; /* Data length in bytes. */
+ le32 reservedV; /* Reserved (zero). */
+ } __attribute__ ((__packed__)) vi;
+ } __attribute__ ((__packed__)) data;
+ le16 length; /* Byte size of this index entry, multiple of
+ 8-bytes. */
+ le16 key_length; /* Byte size of the key value, which is in the
+ index entry. It follows field reserved. Not
+ multiple of 8-bytes. */
+ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
+ le16 reserved; /* Reserved/align to 8-byte boundary. */
+
+/* 16*/ union { /* The key of the indexed attribute. NOTE: Only present
+ if INDEX_ENTRY_END bit in flags is not set. NOTE: On
+ NTFS versions before 3.0 the only valid key is the
+ FILE_NAME_ATTR. On NTFS 3.0+ the following
+ additional index keys are defined: */
+ FILE_NAME_ATTR file_name;/* $I30 index in directories. */
+ SII_INDEX_KEY sii; /* $SII index in $Secure. */
+ SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */
+ GUID object_id; /* $O index in FILE_Extend/$ObjId: The
+ object_id of the mft record found in
+ the data part of the index. */
+ REPARSE_INDEX_KEY reparse; /* $R index in
+ FILE_Extend/$Reparse. */
+ SID sid; /* $O index in FILE_Extend/$Quota:
+ SID of the owner of the user_id. */
+ le32 owner_id; /* $Q index in FILE_Extend/$Quota:
+ user_id of the owner of the quota
+ control entry in the data part of
+ the index. */
+ } __attribute__ ((__packed__)) key;
+ /* The (optional) index data is inserted here when creating. */
+ // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last
+ // eight bytes of this index entry contain the virtual
+ // cluster number of the index block that holds the
+ // entries immediately preceding the current entry (the
+ // vcn references the corresponding cluster in the data
+ // of the non-resident index allocation attribute). If
+ // the key_length is zero, then the vcn immediately
+ // follows the INDEX_ENTRY_HEADER. Regardless of
+ // key_length, the address of the 8-byte boundary
+ // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
+ // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN),
+ // where sizeof(VCN) can be hardcoded as 8 if wanted. */
+} __attribute__ ((__packed__)) INDEX_ENTRY;
+
+/*
+ * Attribute: Bitmap (0xb0).
+ *
+ * Contains an array of bits (aka a bitfield).
+ *
+ * When used in conjunction with the index allocation attribute, each bit
+ * corresponds to one index block within the index allocation attribute. Thus
+ * the number of bits in the bitmap * index block size / cluster size is the
+ * number of clusters in the index allocation attribute.
+ */
+typedef struct {
+ u8 bitmap[0]; /* Array of bits. */
+} __attribute__ ((__packed__)) BITMAP_ATTR;
+
+/*
+ * The reparse point tag defines the type of the reparse point. It also
+ * includes several flags, which further describe the reparse point.
+ *
+ * The reparse point tag is an unsigned 32-bit value divided in three parts:
+ *
+ * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of
+ * the reparse point.
+ * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use.
+ * 3. The most significant three bits are flags describing the reparse point.
+ * They are defined as follows:
+ * bit 29: Name surrogate bit. If set, the filename is an alias for
+ * another object in the system.
+ * bit 30: High-latency bit. If set, accessing the first byte of data will
+ * be slow. (E.g. the data is stored on a tape drive.)
+ * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User
+ * defined tags have to use zero here.
+ *
+ * These are the predefined reparse point tags:
+ */
+enum {
+ IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000),
+ IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000),
+ IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000),
+
+ IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000),
+ IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001),
+ IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001),
+
+ IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005),
+ IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006),
+ IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007),
+ IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008),
+
+ IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003),
+
+ IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004),
+
+ IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000),
+
+ IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff),
+};
+
+/*
+ * Attribute: Reparse point (0xc0).
+ *
+ * NOTE: Can be resident or non-resident.
+ */
+typedef struct {
+ le32 reparse_tag; /* Reparse point type (inc. flags). */
+ le16 reparse_data_length; /* Byte size of reparse data. */
+ le16 reserved; /* Align to 8-byte boundary. */
+ u8 reparse_data[0]; /* Meaning depends on reparse_tag. */
+} __attribute__ ((__packed__)) REPARSE_POINT;
+
+/*
+ * Attribute: Extended attribute (EA) information (0xd0).
+ *
+ * NOTE: Always resident. (Is this true???)
+ */
+typedef struct {
+ le16 ea_length; /* Byte size of the packed extended
+ attributes. */
+ le16 need_ea_count; /* The number of extended attributes which have
+ the NEED_EA bit set. */
+ le32 ea_query_length; /* Byte size of the buffer required to query
+ the extended attributes when calling
+ ZwQueryEaFile() in Windows NT/2k. I.e. the
+ byte size of the unpacked extended
+ attributes. */
+} __attribute__ ((__packed__)) EA_INFORMATION;
+
+/*
+ * Extended attribute flags (8-bit).
+ */
+enum {
+ NEED_EA = 0x80 /* If set the file to which the EA belongs
+ cannot be interpreted without understanding
+ the associates extended attributes. */
+} __attribute__ ((__packed__));
+
+typedef u8 EA_FLAGS;
+
+/*
+ * Attribute: Extended attribute (EA) (0xe0).
+ *
+ * NOTE: Can be resident or non-resident.
+ *
+ * Like the attribute list and the index buffer list, the EA attribute value is
+ * a sequence of EA_ATTR variable length records.
+ */
+typedef struct {
+ le32 next_entry_offset; /* Offset to the next EA_ATTR. */
+ EA_FLAGS flags; /* Flags describing the EA. */
+ u8 ea_name_length; /* Length of the name of the EA in bytes
+ excluding the '\0' byte terminator. */
+ le16 ea_value_length; /* Byte size of the EA's value. */
+ u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not
+ Unicode and it is zero terminated. */
+ u8 ea_value[0]; /* The value of the EA. Immediately follows
+ the name. */
+} __attribute__ ((__packed__)) EA_ATTR;
+
+/*
+ * Attribute: Property set (0xf0).
+ *
+ * Intended to support Native Structure Storage (NSS) - a feature removed from
+ * NTFS 3.0 during beta testing.
+ */
+typedef struct {
+ /* Irrelevant as feature unused. */
+} __attribute__ ((__packed__)) PROPERTY_SET;
+
+/*
+ * Attribute: Logged utility stream (0x100).
+ *
+ * NOTE: Can be resident or non-resident.
+ *
+ * Operations on this attribute are logged to the journal ($LogFile) like
+ * normal metadata changes.
+ *
+ * Used by the Encrypting File System (EFS). All encrypted files have this
+ * attribute with the name $EFS.
+ */
+typedef struct {
+ /* Can be anything the creator chooses. */
+ /* EFS uses it as follows: */
+ // FIXME: Type this info, verifying it along the way. (AIA)
+} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR;
+
+#endif /* _LINUX_NTFS_LAYOUT_H */
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
new file mode 100644
index 000000000..eda9972e6
--- /dev/null
+++ b/fs/ntfs/lcnalloc.c
@@ -0,0 +1,1000 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * lcnalloc.c - Cluster (de)allocation code. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2004-2005 Anton Altaparmakov
+ */
+
+#ifdef NTFS_RW
+
+#include <linux/pagemap.h>
+
+#include "lcnalloc.h"
+#include "debug.h"
+#include "bitmap.h"
+#include "inode.h"
+#include "volume.h"
+#include "attrib.h"
+#include "malloc.h"
+#include "aops.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_cluster_free_from_rl_nolock - free clusters from runlist
+ * @vol: mounted ntfs volume on which to free the clusters
+ * @rl: runlist describing the clusters to free
+ *
+ * Free all the clusters described by the runlist @rl on the volume @vol. In
+ * the case of an error being returned, at least some of the clusters were not
+ * freed.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - The volume lcn bitmap must be locked for writing on entry and is
+ * left locked on return.
+ */
+int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
+ const runlist_element *rl)
+{
+ struct inode *lcnbmp_vi = vol->lcnbmp_ino;
+ int ret = 0;
+
+ ntfs_debug("Entering.");
+ if (!rl)
+ return 0;
+ for (; rl->length; rl++) {
+ int err;
+
+ if (rl->lcn < 0)
+ continue;
+ err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length);
+ if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err))
+ ret = err;
+ }
+ ntfs_debug("Done.");
+ return ret;
+}
+
+/**
+ * ntfs_cluster_alloc - allocate clusters on an ntfs volume
+ * @vol: mounted ntfs volume on which to allocate the clusters
+ * @start_vcn: vcn to use for the first allocated cluster
+ * @count: number of clusters to allocate
+ * @start_lcn: starting lcn at which to allocate the clusters (or -1 if none)
+ * @zone: zone from which to allocate the clusters
+ * @is_extension: if 'true', this is an attribute extension
+ *
+ * Allocate @count clusters preferably starting at cluster @start_lcn or at the
+ * current allocator position if @start_lcn is -1, on the mounted ntfs volume
+ * @vol. @zone is either DATA_ZONE for allocation of normal clusters or
+ * MFT_ZONE for allocation of clusters for the master file table, i.e. the
+ * $MFT/$DATA attribute.
+ *
+ * @start_vcn specifies the vcn of the first allocated cluster. This makes
+ * merging the resulting runlist with the old runlist easier.
+ *
+ * If @is_extension is 'true', the caller is allocating clusters to extend an
+ * attribute and if it is 'false', the caller is allocating clusters to fill a
+ * hole in an attribute. Practically the difference is that if @is_extension
+ * is 'true' the returned runlist will be terminated with LCN_ENOENT and if
+ * @is_extension is 'false' the runlist will be terminated with
+ * LCN_RL_NOT_MAPPED.
+ *
+ * You need to check the return value with IS_ERR(). If this is false, the
+ * function was successful and the return value is a runlist describing the
+ * allocated cluster(s). If IS_ERR() is true, the function failed and
+ * PTR_ERR() gives you the error code.
+ *
+ * Notes on the allocation algorithm
+ * =================================
+ *
+ * There are two data zones. First is the area between the end of the mft zone
+ * and the end of the volume, and second is the area between the start of the
+ * volume and the start of the mft zone. On unmodified/standard NTFS 1.x
+ * volumes, the second data zone does not exist due to the mft zone being
+ * expanded to cover the start of the volume in order to reserve space for the
+ * mft bitmap attribute.
+ *
+ * This is not the prettiest function but the complexity stems from the need of
+ * implementing the mft vs data zoned approach and from the fact that we have
+ * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we
+ * need to cope with crossing over boundaries of two buffers. Further, the
+ * fact that the allocator allows for caller supplied hints as to the location
+ * of where allocation should begin and the fact that the allocator keeps track
+ * of where in the data zones the next natural allocation should occur,
+ * contribute to the complexity of the function. But it should all be
+ * worthwhile, because this allocator should: 1) be a full implementation of
+ * the MFT zone approach used by Windows NT, 2) cause reduction in
+ * fragmentation, and 3) be speedy in allocations (the code is not optimized
+ * for speed, but the algorithm is, so further speed improvements are probably
+ * possible).
+ *
+ * FIXME: We should be monitoring cluster allocation and increment the MFT zone
+ * size dynamically but this is something for the future. We will just cause
+ * heavier fragmentation by not doing it and I am not even sure Windows would
+ * grow the MFT zone dynamically, so it might even be correct not to do this.
+ * The overhead in doing dynamic MFT zone expansion would be very large and
+ * unlikely worth the effort. (AIA)
+ *
+ * TODO: I have added in double the required zone position pointer wrap around
+ * logic which can be optimized to having only one of the two logic sets.
+ * However, having the double logic will work fine, but if we have only one of
+ * the sets and we get it wrong somewhere, then we get into trouble, so
+ * removing the duplicate logic requires _very_ careful consideration of _all_
+ * possible code paths. So at least for now, I am leaving the double logic -
+ * better safe than sorry... (AIA)
+ *
+ * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked
+ * on return.
+ * - This function takes the volume lcn bitmap lock for writing and
+ * modifies the bitmap contents.
+ */
+runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
+ const s64 count, const LCN start_lcn,
+ const NTFS_CLUSTER_ALLOCATION_ZONES zone,
+ const bool is_extension)
+{
+ LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
+ LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
+ s64 clusters;
+ loff_t i_size;
+ struct inode *lcnbmp_vi;
+ runlist_element *rl = NULL;
+ struct address_space *mapping;
+ struct page *page = NULL;
+ u8 *buf, *byte;
+ int err = 0, rlpos, rlsize, buf_size;
+ u8 pass, done_zones, search_zone, need_writeback = 0, bit;
+
+ ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn "
+ "0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn,
+ (unsigned long long)count,
+ (unsigned long long)start_lcn,
+ zone == MFT_ZONE ? "MFT" : "DATA");
+ BUG_ON(!vol);
+ lcnbmp_vi = vol->lcnbmp_ino;
+ BUG_ON(!lcnbmp_vi);
+ BUG_ON(start_vcn < 0);
+ BUG_ON(count < 0);
+ BUG_ON(start_lcn < -1);
+ BUG_ON(zone < FIRST_ZONE);
+ BUG_ON(zone > LAST_ZONE);
+
+ /* Return NULL if @count is zero. */
+ if (!count)
+ return NULL;
+ /* Take the lcnbmp lock for writing. */
+ down_write(&vol->lcnbmp_lock);
+ /*
+ * If no specific @start_lcn was requested, use the current data zone
+ * position, otherwise use the requested @start_lcn but make sure it
+ * lies outside the mft zone. Also set done_zones to 0 (no zones done)
+ * and pass depending on whether we are starting inside a zone (1) or
+ * at the beginning of a zone (2). If requesting from the MFT_ZONE,
+ * we either start at the current position within the mft zone or at
+ * the specified position. If the latter is out of bounds then we start
+ * at the beginning of the MFT_ZONE.
+ */
+ done_zones = 0;
+ pass = 1;
+ /*
+ * zone_start and zone_end are the current search range. search_zone
+ * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of
+ * volume) and 4 for data zone 2 (start of volume till start of mft
+ * zone).
+ */
+ zone_start = start_lcn;
+ if (zone_start < 0) {
+ if (zone == DATA_ZONE)
+ zone_start = vol->data1_zone_pos;
+ else
+ zone_start = vol->mft_zone_pos;
+ if (!zone_start) {
+ /*
+ * Zone starts at beginning of volume which means a
+ * single pass is sufficient.
+ */
+ pass = 2;
+ }
+ } else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start &&
+ zone_start < vol->mft_zone_end) {
+ zone_start = vol->mft_zone_end;
+ /*
+ * Starting at beginning of data1_zone which means a single
+ * pass in this zone is sufficient.
+ */
+ pass = 2;
+ } else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start ||
+ zone_start >= vol->mft_zone_end)) {
+ zone_start = vol->mft_lcn;
+ if (!vol->mft_zone_end)
+ zone_start = 0;
+ /*
+ * Starting at beginning of volume which means a single pass
+ * is sufficient.
+ */
+ pass = 2;
+ }
+ if (zone == MFT_ZONE) {
+ zone_end = vol->mft_zone_end;
+ search_zone = 1;
+ } else /* if (zone == DATA_ZONE) */ {
+ /* Skip searching the mft zone. */
+ done_zones |= 1;
+ if (zone_start >= vol->mft_zone_end) {
+ zone_end = vol->nr_clusters;
+ search_zone = 2;
+ } else {
+ zone_end = vol->mft_zone_start;
+ search_zone = 4;
+ }
+ }
+ /*
+ * bmp_pos is the current bit position inside the bitmap. We use
+ * bmp_initial_pos to determine whether or not to do a zone switch.
+ */
+ bmp_pos = bmp_initial_pos = zone_start;
+
+ /* Loop until all clusters are allocated, i.e. clusters == 0. */
+ clusters = count;
+ rlpos = rlsize = 0;
+ mapping = lcnbmp_vi->i_mapping;
+ i_size = i_size_read(lcnbmp_vi);
+ while (1) {
+ ntfs_debug("Start of outer while loop: done_zones 0x%x, "
+ "search_zone %i, pass %i, zone_start 0x%llx, "
+ "zone_end 0x%llx, bmp_initial_pos 0x%llx, "
+ "bmp_pos 0x%llx, rlpos %i, rlsize %i.",
+ done_zones, search_zone, pass,
+ (unsigned long long)zone_start,
+ (unsigned long long)zone_end,
+ (unsigned long long)bmp_initial_pos,
+ (unsigned long long)bmp_pos, rlpos, rlsize);
+ /* Loop until we run out of free clusters. */
+ last_read_pos = bmp_pos >> 3;
+ ntfs_debug("last_read_pos 0x%llx.",
+ (unsigned long long)last_read_pos);
+ if (last_read_pos > i_size) {
+ ntfs_debug("End of attribute reached. "
+ "Skipping to zone_pass_done.");
+ goto zone_pass_done;
+ }
+ if (likely(page)) {
+ if (need_writeback) {
+ ntfs_debug("Marking page dirty.");
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ need_writeback = 0;
+ }
+ ntfs_unmap_page(page);
+ }
+ page = ntfs_map_page(mapping, last_read_pos >>
+ PAGE_SHIFT);
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ ntfs_error(vol->sb, "Failed to map page.");
+ goto out;
+ }
+ buf_size = last_read_pos & ~PAGE_MASK;
+ buf = page_address(page) + buf_size;
+ buf_size = PAGE_SIZE - buf_size;
+ if (unlikely(last_read_pos + buf_size > i_size))
+ buf_size = i_size - last_read_pos;
+ buf_size <<= 3;
+ lcn = bmp_pos & 7;
+ bmp_pos &= ~(LCN)7;
+ ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, "
+ "bmp_pos 0x%llx, need_writeback %i.", buf_size,
+ (unsigned long long)lcn,
+ (unsigned long long)bmp_pos, need_writeback);
+ while (lcn < buf_size && lcn + bmp_pos < zone_end) {
+ byte = buf + (lcn >> 3);
+ ntfs_debug("In inner while loop: buf_size %i, "
+ "lcn 0x%llx, bmp_pos 0x%llx, "
+ "need_writeback %i, byte ofs 0x%x, "
+ "*byte 0x%x.", buf_size,
+ (unsigned long long)lcn,
+ (unsigned long long)bmp_pos,
+ need_writeback,
+ (unsigned int)(lcn >> 3),
+ (unsigned int)*byte);
+ /* Skip full bytes. */
+ if (*byte == 0xff) {
+ lcn = (lcn + 8) & ~(LCN)7;
+ ntfs_debug("Continuing while loop 1.");
+ continue;
+ }
+ bit = 1 << (lcn & 7);
+ ntfs_debug("bit 0x%x.", bit);
+ /* If the bit is already set, go onto the next one. */
+ if (*byte & bit) {
+ lcn++;
+ ntfs_debug("Continuing while loop 2.");
+ continue;
+ }
+ /*
+ * Allocate more memory if needed, including space for
+ * the terminator element.
+ * ntfs_malloc_nofs() operates on whole pages only.
+ */
+ if ((rlpos + 2) * sizeof(*rl) > rlsize) {
+ runlist_element *rl2;
+
+ ntfs_debug("Reallocating memory.");
+ if (!rl)
+ ntfs_debug("First free bit is at LCN "
+ "0x%llx.",
+ (unsigned long long)
+ (lcn + bmp_pos));
+ rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
+ if (unlikely(!rl2)) {
+ err = -ENOMEM;
+ ntfs_error(vol->sb, "Failed to "
+ "allocate memory.");
+ goto out;
+ }
+ memcpy(rl2, rl, rlsize);
+ ntfs_free(rl);
+ rl = rl2;
+ rlsize += PAGE_SIZE;
+ ntfs_debug("Reallocated memory, rlsize 0x%x.",
+ rlsize);
+ }
+ /* Allocate the bitmap bit. */
+ *byte |= bit;
+ /* We need to write this bitmap page to disk. */
+ need_writeback = 1;
+ ntfs_debug("*byte 0x%x, need_writeback is set.",
+ (unsigned int)*byte);
+ /*
+ * Coalesce with previous run if adjacent LCNs.
+ * Otherwise, append a new run.
+ */
+ ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), "
+ "prev_lcn 0x%llx, lcn 0x%llx, "
+ "bmp_pos 0x%llx, prev_run_len 0x%llx, "
+ "rlpos %i.",
+ (unsigned long long)(lcn + bmp_pos),
+ 1ULL, (unsigned long long)prev_lcn,
+ (unsigned long long)lcn,
+ (unsigned long long)bmp_pos,
+ (unsigned long long)prev_run_len,
+ rlpos);
+ if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) {
+ ntfs_debug("Coalescing to run (lcn 0x%llx, "
+ "len 0x%llx).",
+ (unsigned long long)
+ rl[rlpos - 1].lcn,
+ (unsigned long long)
+ rl[rlpos - 1].length);
+ rl[rlpos - 1].length = ++prev_run_len;
+ ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), "
+ "prev_run_len 0x%llx.",
+ (unsigned long long)
+ rl[rlpos - 1].lcn,
+ (unsigned long long)
+ rl[rlpos - 1].length,
+ (unsigned long long)
+ prev_run_len);
+ } else {
+ if (likely(rlpos)) {
+ ntfs_debug("Adding new run, (previous "
+ "run lcn 0x%llx, "
+ "len 0x%llx).",
+ (unsigned long long)
+ rl[rlpos - 1].lcn,
+ (unsigned long long)
+ rl[rlpos - 1].length);
+ rl[rlpos].vcn = rl[rlpos - 1].vcn +
+ prev_run_len;
+ } else {
+ ntfs_debug("Adding new run, is first "
+ "run.");
+ rl[rlpos].vcn = start_vcn;
+ }
+ rl[rlpos].lcn = prev_lcn = lcn + bmp_pos;
+ rl[rlpos].length = prev_run_len = 1;
+ rlpos++;
+ }
+ /* Done? */
+ if (!--clusters) {
+ LCN tc;
+ /*
+ * Update the current zone position. Positions
+ * of already scanned zones have been updated
+ * during the respective zone switches.
+ */
+ tc = lcn + bmp_pos + 1;
+ ntfs_debug("Done. Updating current zone "
+ "position, tc 0x%llx, "
+ "search_zone %i.",
+ (unsigned long long)tc,
+ search_zone);
+ switch (search_zone) {
+ case 1:
+ ntfs_debug("Before checks, "
+ "vol->mft_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->mft_zone_pos);
+ if (tc >= vol->mft_zone_end) {
+ vol->mft_zone_pos =
+ vol->mft_lcn;
+ if (!vol->mft_zone_end)
+ vol->mft_zone_pos = 0;
+ } else if ((bmp_initial_pos >=
+ vol->mft_zone_pos ||
+ tc > vol->mft_zone_pos)
+ && tc >= vol->mft_lcn)
+ vol->mft_zone_pos = tc;
+ ntfs_debug("After checks, "
+ "vol->mft_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->mft_zone_pos);
+ break;
+ case 2:
+ ntfs_debug("Before checks, "
+ "vol->data1_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->data1_zone_pos);
+ if (tc >= vol->nr_clusters)
+ vol->data1_zone_pos =
+ vol->mft_zone_end;
+ else if ((bmp_initial_pos >=
+ vol->data1_zone_pos ||
+ tc > vol->data1_zone_pos)
+ && tc >= vol->mft_zone_end)
+ vol->data1_zone_pos = tc;
+ ntfs_debug("After checks, "
+ "vol->data1_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->data1_zone_pos);
+ break;
+ case 4:
+ ntfs_debug("Before checks, "
+ "vol->data2_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->data2_zone_pos);
+ if (tc >= vol->mft_zone_start)
+ vol->data2_zone_pos = 0;
+ else if (bmp_initial_pos >=
+ vol->data2_zone_pos ||
+ tc > vol->data2_zone_pos)
+ vol->data2_zone_pos = tc;
+ ntfs_debug("After checks, "
+ "vol->data2_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->data2_zone_pos);
+ break;
+ default:
+ BUG();
+ }
+ ntfs_debug("Finished. Going to out.");
+ goto out;
+ }
+ lcn++;
+ }
+ bmp_pos += buf_size;
+ ntfs_debug("After inner while loop: buf_size 0x%x, lcn "
+ "0x%llx, bmp_pos 0x%llx, need_writeback %i.",
+ buf_size, (unsigned long long)lcn,
+ (unsigned long long)bmp_pos, need_writeback);
+ if (bmp_pos < zone_end) {
+ ntfs_debug("Continuing outer while loop, "
+ "bmp_pos 0x%llx, zone_end 0x%llx.",
+ (unsigned long long)bmp_pos,
+ (unsigned long long)zone_end);
+ continue;
+ }
+zone_pass_done: /* Finished with the current zone pass. */
+ ntfs_debug("At zone_pass_done, pass %i.", pass);
+ if (pass == 1) {
+ /*
+ * Now do pass 2, scanning the first part of the zone
+ * we omitted in pass 1.
+ */
+ pass = 2;
+ zone_end = zone_start;
+ switch (search_zone) {
+ case 1: /* mft_zone */
+ zone_start = vol->mft_zone_start;
+ break;
+ case 2: /* data1_zone */
+ zone_start = vol->mft_zone_end;
+ break;
+ case 4: /* data2_zone */
+ zone_start = 0;
+ break;
+ default:
+ BUG();
+ }
+ /* Sanity check. */
+ if (zone_end < zone_start)
+ zone_end = zone_start;
+ bmp_pos = zone_start;
+ ntfs_debug("Continuing outer while loop, pass 2, "
+ "zone_start 0x%llx, zone_end 0x%llx, "
+ "bmp_pos 0x%llx.",
+ (unsigned long long)zone_start,
+ (unsigned long long)zone_end,
+ (unsigned long long)bmp_pos);
+ continue;
+ } /* pass == 2 */
+done_zones_check:
+ ntfs_debug("At done_zones_check, search_zone %i, done_zones "
+ "before 0x%x, done_zones after 0x%x.",
+ search_zone, done_zones,
+ done_zones | search_zone);
+ done_zones |= search_zone;
+ if (done_zones < 7) {
+ ntfs_debug("Switching zone.");
+ /* Now switch to the next zone we haven't done yet. */
+ pass = 1;
+ switch (search_zone) {
+ case 1:
+ ntfs_debug("Switching from mft zone to data1 "
+ "zone.");
+ /* Update mft zone position. */
+ if (rlpos) {
+ LCN tc;
+
+ ntfs_debug("Before checks, "
+ "vol->mft_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->mft_zone_pos);
+ tc = rl[rlpos - 1].lcn +
+ rl[rlpos - 1].length;
+ if (tc >= vol->mft_zone_end) {
+ vol->mft_zone_pos =
+ vol->mft_lcn;
+ if (!vol->mft_zone_end)
+ vol->mft_zone_pos = 0;
+ } else if ((bmp_initial_pos >=
+ vol->mft_zone_pos ||
+ tc > vol->mft_zone_pos)
+ && tc >= vol->mft_lcn)
+ vol->mft_zone_pos = tc;
+ ntfs_debug("After checks, "
+ "vol->mft_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->mft_zone_pos);
+ }
+ /* Switch from mft zone to data1 zone. */
+switch_to_data1_zone: search_zone = 2;
+ zone_start = bmp_initial_pos =
+ vol->data1_zone_pos;
+ zone_end = vol->nr_clusters;
+ if (zone_start == vol->mft_zone_end)
+ pass = 2;
+ if (zone_start >= zone_end) {
+ vol->data1_zone_pos = zone_start =
+ vol->mft_zone_end;
+ pass = 2;
+ }
+ break;
+ case 2:
+ ntfs_debug("Switching from data1 zone to "
+ "data2 zone.");
+ /* Update data1 zone position. */
+ if (rlpos) {
+ LCN tc;
+
+ ntfs_debug("Before checks, "
+ "vol->data1_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->data1_zone_pos);
+ tc = rl[rlpos - 1].lcn +
+ rl[rlpos - 1].length;
+ if (tc >= vol->nr_clusters)
+ vol->data1_zone_pos =
+ vol->mft_zone_end;
+ else if ((bmp_initial_pos >=
+ vol->data1_zone_pos ||
+ tc > vol->data1_zone_pos)
+ && tc >= vol->mft_zone_end)
+ vol->data1_zone_pos = tc;
+ ntfs_debug("After checks, "
+ "vol->data1_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->data1_zone_pos);
+ }
+ /* Switch from data1 zone to data2 zone. */
+ search_zone = 4;
+ zone_start = bmp_initial_pos =
+ vol->data2_zone_pos;
+ zone_end = vol->mft_zone_start;
+ if (!zone_start)
+ pass = 2;
+ if (zone_start >= zone_end) {
+ vol->data2_zone_pos = zone_start =
+ bmp_initial_pos = 0;
+ pass = 2;
+ }
+ break;
+ case 4:
+ ntfs_debug("Switching from data2 zone to "
+ "data1 zone.");
+ /* Update data2 zone position. */
+ if (rlpos) {
+ LCN tc;
+
+ ntfs_debug("Before checks, "
+ "vol->data2_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->data2_zone_pos);
+ tc = rl[rlpos - 1].lcn +
+ rl[rlpos - 1].length;
+ if (tc >= vol->mft_zone_start)
+ vol->data2_zone_pos = 0;
+ else if (bmp_initial_pos >=
+ vol->data2_zone_pos ||
+ tc > vol->data2_zone_pos)
+ vol->data2_zone_pos = tc;
+ ntfs_debug("After checks, "
+ "vol->data2_zone_pos "
+ "0x%llx.",
+ (unsigned long long)
+ vol->data2_zone_pos);
+ }
+ /* Switch from data2 zone to data1 zone. */
+ goto switch_to_data1_zone;
+ default:
+ BUG();
+ }
+ ntfs_debug("After zone switch, search_zone %i, "
+ "pass %i, bmp_initial_pos 0x%llx, "
+ "zone_start 0x%llx, zone_end 0x%llx.",
+ search_zone, pass,
+ (unsigned long long)bmp_initial_pos,
+ (unsigned long long)zone_start,
+ (unsigned long long)zone_end);
+ bmp_pos = zone_start;
+ if (zone_start == zone_end) {
+ ntfs_debug("Empty zone, going to "
+ "done_zones_check.");
+ /* Empty zone. Don't bother searching it. */
+ goto done_zones_check;
+ }
+ ntfs_debug("Continuing outer while loop.");
+ continue;
+ } /* done_zones == 7 */
+ ntfs_debug("All zones are finished.");
+ /*
+ * All zones are finished! If DATA_ZONE, shrink mft zone. If
+ * MFT_ZONE, we have really run out of space.
+ */
+ mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
+ ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end "
+ "0x%llx, mft_zone_size 0x%llx.",
+ (unsigned long long)vol->mft_zone_start,
+ (unsigned long long)vol->mft_zone_end,
+ (unsigned long long)mft_zone_size);
+ if (zone == MFT_ZONE || mft_zone_size <= 0) {
+ ntfs_debug("No free clusters left, going to out.");
+ /* Really no more space left on device. */
+ err = -ENOSPC;
+ goto out;
+ } /* zone == DATA_ZONE && mft_zone_size > 0 */
+ ntfs_debug("Shrinking mft zone.");
+ zone_end = vol->mft_zone_end;
+ mft_zone_size >>= 1;
+ if (mft_zone_size > 0)
+ vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
+ else /* mft zone and data2 zone no longer exist. */
+ vol->data2_zone_pos = vol->mft_zone_start =
+ vol->mft_zone_end = 0;
+ if (vol->mft_zone_pos >= vol->mft_zone_end) {
+ vol->mft_zone_pos = vol->mft_lcn;
+ if (!vol->mft_zone_end)
+ vol->mft_zone_pos = 0;
+ }
+ bmp_pos = zone_start = bmp_initial_pos =
+ vol->data1_zone_pos = vol->mft_zone_end;
+ search_zone = 2;
+ pass = 2;
+ done_zones &= ~2;
+ ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, "
+ "vol->mft_zone_start 0x%llx, "
+ "vol->mft_zone_end 0x%llx, "
+ "vol->mft_zone_pos 0x%llx, search_zone 2, "
+ "pass 2, dones_zones 0x%x, zone_start 0x%llx, "
+ "zone_end 0x%llx, vol->data1_zone_pos 0x%llx, "
+ "continuing outer while loop.",
+ (unsigned long long)mft_zone_size,
+ (unsigned long long)vol->mft_zone_start,
+ (unsigned long long)vol->mft_zone_end,
+ (unsigned long long)vol->mft_zone_pos,
+ done_zones, (unsigned long long)zone_start,
+ (unsigned long long)zone_end,
+ (unsigned long long)vol->data1_zone_pos);
+ }
+ ntfs_debug("After outer while loop.");
+out:
+ ntfs_debug("At out.");
+ /* Add runlist terminator element. */
+ if (likely(rl)) {
+ rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
+ rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
+ rl[rlpos].length = 0;
+ }
+ if (likely(page && !IS_ERR(page))) {
+ if (need_writeback) {
+ ntfs_debug("Marking page dirty.");
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ need_writeback = 0;
+ }
+ ntfs_unmap_page(page);
+ }
+ if (likely(!err)) {
+ up_write(&vol->lcnbmp_lock);
+ ntfs_debug("Done.");
+ return rl;
+ }
+ ntfs_error(vol->sb, "Failed to allocate clusters, aborting "
+ "(error %i).", err);
+ if (rl) {
+ int err2;
+
+ if (err == -ENOSPC)
+ ntfs_debug("Not enough space to complete allocation, "
+ "err -ENOSPC, first free lcn 0x%llx, "
+ "could allocate up to 0x%llx "
+ "clusters.",
+ (unsigned long long)rl[0].lcn,
+ (unsigned long long)(count - clusters));
+ /* Deallocate all allocated clusters. */
+ ntfs_debug("Attempting rollback...");
+ err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
+ if (err2) {
+ ntfs_error(vol->sb, "Failed to rollback (error %i). "
+ "Leaving inconsistent metadata! "
+ "Unmount and run chkdsk.", err2);
+ NVolSetErrors(vol);
+ }
+ /* Free the runlist. */
+ ntfs_free(rl);
+ } else if (err == -ENOSPC)
+ ntfs_debug("No space left at all, err = -ENOSPC, first free "
+ "lcn = 0x%llx.",
+ (long long)vol->data1_zone_pos);
+ up_write(&vol->lcnbmp_lock);
+ return ERR_PTR(err);
+}
+
+/**
+ * __ntfs_cluster_free - free clusters on an ntfs volume
+ * @ni: ntfs inode whose runlist describes the clusters to free
+ * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
+ * @count: number of clusters to free or -1 for all clusters
+ * @ctx: active attribute search context if present or NULL if not
+ * @is_rollback: true if this is a rollback operation
+ *
+ * Free @count clusters starting at the cluster @start_vcn in the runlist
+ * described by the vfs inode @ni.
+ *
+ * If @count is -1, all clusters from @start_vcn to the end of the runlist are
+ * deallocated. Thus, to completely free all clusters in a runlist, use
+ * @start_vcn = 0 and @count = -1.
+ *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record. This is needed when __ntfs_cluster_free() encounters unmapped
+ * runlist fragments and allows their mapping. If you do not have the mft
+ * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
+ * perform the necessary mapping and unmapping.
+ *
+ * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
+ * before returning. Thus, @ctx will be left pointing to the same attribute on
+ * return as on entry. However, the actual pointers in @ctx may point to
+ * different memory locations on return, so you must remember to reset any
+ * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
+ * you will probably want to do:
+ * m = ctx->mrec;
+ * a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
+ * @is_rollback should always be 'false', it is for internal use to rollback
+ * errors. You probably want to use ntfs_cluster_free() instead.
+ *
+ * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
+ * remove from the runlist or mark sparse the freed runs later.
+ *
+ * Return the number of deallocated clusters (not counting sparse ones) on
+ * success and -errno on error.
+ *
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ * is no longer valid, i.e. you need to either call
+ * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ * why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ * and is locked on return. Note the runlist may be modified when
+ * needed runlist fragments need to be mapped.
+ * - The volume lcn bitmap must be unlocked on entry and is unlocked
+ * on return.
+ * - This function takes the volume lcn bitmap lock for writing and
+ * modifies the bitmap contents.
+ * - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ * entry and it will be left unmapped on return.
+ * - If @ctx is not NULL, the base mft record must be mapped on entry
+ * and it will be left mapped on return.
+ */
+s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
+ ntfs_attr_search_ctx *ctx, const bool is_rollback)
+{
+ s64 delta, to_free, total_freed, real_freed;
+ ntfs_volume *vol;
+ struct inode *lcnbmp_vi;
+ runlist_element *rl;
+ int err;
+
+ BUG_ON(!ni);
+ ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
+ "0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn,
+ (unsigned long long)count,
+ is_rollback ? " (rollback)" : "");
+ vol = ni->vol;
+ lcnbmp_vi = vol->lcnbmp_ino;
+ BUG_ON(!lcnbmp_vi);
+ BUG_ON(start_vcn < 0);
+ BUG_ON(count < -1);
+ /*
+ * Lock the lcn bitmap for writing but only if not rolling back. We
+ * must hold the lock all the way including through rollback otherwise
+ * rollback is not possible because once we have cleared a bit and
+ * dropped the lock, anyone could have set the bit again, thus
+ * allocating the cluster for another use.
+ */
+ if (likely(!is_rollback))
+ down_write(&vol->lcnbmp_lock);
+
+ total_freed = real_freed = 0;
+
+ rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
+ if (IS_ERR(rl)) {
+ if (!is_rollback)
+ ntfs_error(vol->sb, "Failed to find first runlist "
+ "element (error %li), aborting.",
+ PTR_ERR(rl));
+ err = PTR_ERR(rl);
+ goto err_out;
+ }
+ if (unlikely(rl->lcn < LCN_HOLE)) {
+ if (!is_rollback)
+ ntfs_error(vol->sb, "First runlist element has "
+ "invalid lcn, aborting.");
+ err = -EIO;
+ goto err_out;
+ }
+ /* Find the starting cluster inside the run that needs freeing. */
+ delta = start_vcn - rl->vcn;
+
+ /* The number of clusters in this run that need freeing. */
+ to_free = rl->length - delta;
+ if (count >= 0 && to_free > count)
+ to_free = count;
+
+ if (likely(rl->lcn >= 0)) {
+ /* Do the actual freeing of the clusters in this run. */
+ err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta,
+ to_free, likely(!is_rollback) ? 0 : 1);
+ if (unlikely(err)) {
+ if (!is_rollback)
+ ntfs_error(vol->sb, "Failed to clear first run "
+ "(error %i), aborting.", err);
+ goto err_out;
+ }
+ /* We have freed @to_free real clusters. */
+ real_freed = to_free;
+ };
+ /* Go to the next run and adjust the number of clusters left to free. */
+ ++rl;
+ if (count >= 0)
+ count -= to_free;
+
+ /* Keep track of the total "freed" clusters, including sparse ones. */
+ total_freed = to_free;
+ /*
+ * Loop over the remaining runs, using @count as a capping value, and
+ * free them.
+ */
+ for (; rl->length && count != 0; ++rl) {
+ if (unlikely(rl->lcn < LCN_HOLE)) {
+ VCN vcn;
+
+ /* Attempt to map runlist. */
+ vcn = rl->vcn;
+ rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
+ if (IS_ERR(rl)) {
+ err = PTR_ERR(rl);
+ if (!is_rollback)
+ ntfs_error(vol->sb, "Failed to map "
+ "runlist fragment or "
+ "failed to find "
+ "subsequent runlist "
+ "element.");
+ goto err_out;
+ }
+ if (unlikely(rl->lcn < LCN_HOLE)) {
+ if (!is_rollback)
+ ntfs_error(vol->sb, "Runlist element "
+ "has invalid lcn "
+ "(0x%llx).",
+ (unsigned long long)
+ rl->lcn);
+ err = -EIO;
+ goto err_out;
+ }
+ }
+ /* The number of clusters in this run that need freeing. */
+ to_free = rl->length;
+ if (count >= 0 && to_free > count)
+ to_free = count;
+
+ if (likely(rl->lcn >= 0)) {
+ /* Do the actual freeing of the clusters in the run. */
+ err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn,
+ to_free, likely(!is_rollback) ? 0 : 1);
+ if (unlikely(err)) {
+ if (!is_rollback)
+ ntfs_error(vol->sb, "Failed to clear "
+ "subsequent run.");
+ goto err_out;
+ }
+ /* We have freed @to_free real clusters. */
+ real_freed += to_free;
+ }
+ /* Adjust the number of clusters left to free. */
+ if (count >= 0)
+ count -= to_free;
+
+ /* Update the total done clusters. */
+ total_freed += to_free;
+ }
+ if (likely(!is_rollback))
+ up_write(&vol->lcnbmp_lock);
+
+ BUG_ON(count > 0);
+
+ /* We are done. Return the number of actually freed clusters. */
+ ntfs_debug("Done.");
+ return real_freed;
+err_out:
+ if (is_rollback)
+ return err;
+ /* If no real clusters were freed, no need to rollback. */
+ if (!real_freed) {
+ up_write(&vol->lcnbmp_lock);
+ return err;
+ }
+ /*
+ * Attempt to rollback and if that succeeds just return the error code.
+ * If rollback fails, set the volume errors flag, emit an error
+ * message, and return the error code.
+ */
+ delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true);
+ if (delta < 0) {
+ ntfs_error(vol->sb, "Failed to rollback (error %i). Leaving "
+ "inconsistent metadata! Unmount and run "
+ "chkdsk.", (int)delta);
+ NVolSetErrors(vol);
+ }
+ up_write(&vol->lcnbmp_lock);
+ ntfs_error(vol->sb, "Aborting (error %i).", err);
+ return err;
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
new file mode 100644
index 000000000..1589a6d84
--- /dev/null
+++ b/fs/ntfs/lcnalloc.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation. Part of the
+ * Linux-NTFS project.
+ *
+ * Copyright (c) 2004-2005 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_LCNALLOC_H
+#define _LINUX_NTFS_LCNALLOC_H
+
+#ifdef NTFS_RW
+
+#include <linux/fs.h>
+
+#include "attrib.h"
+#include "types.h"
+#include "inode.h"
+#include "runlist.h"
+#include "volume.h"
+
+typedef enum {
+ FIRST_ZONE = 0, /* For sanity checking. */
+ MFT_ZONE = 0, /* Allocate from $MFT zone. */
+ DATA_ZONE = 1, /* Allocate from $DATA zone. */
+ LAST_ZONE = 1, /* For sanity checking. */
+} NTFS_CLUSTER_ALLOCATION_ZONES;
+
+extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
+ const VCN start_vcn, const s64 count, const LCN start_lcn,
+ const NTFS_CLUSTER_ALLOCATION_ZONES zone,
+ const bool is_extension);
+
+extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
+ s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback);
+
+/**
+ * ntfs_cluster_free - free clusters on an ntfs volume
+ * @ni: ntfs inode whose runlist describes the clusters to free
+ * @start_vcn: vcn in the runlist of @ni at which to start freeing clusters
+ * @count: number of clusters to free or -1 for all clusters
+ * @ctx: active attribute search context if present or NULL if not
+ *
+ * Free @count clusters starting at the cluster @start_vcn in the runlist
+ * described by the ntfs inode @ni.
+ *
+ * If @count is -1, all clusters from @start_vcn to the end of the runlist are
+ * deallocated. Thus, to completely free all clusters in a runlist, use
+ * @start_vcn = 0 and @count = -1.
+ *
+ * If @ctx is specified, it is an active search context of @ni and its base mft
+ * record. This is needed when ntfs_cluster_free() encounters unmapped runlist
+ * fragments and allows their mapping. If you do not have the mft record
+ * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
+ * the necessary mapping and unmapping.
+ *
+ * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
+ * before returning. Thus, @ctx will be left pointing to the same attribute on
+ * return as on entry. However, the actual pointers in @ctx may point to
+ * different memory locations on return, so you must remember to reset any
+ * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
+ * you will probably want to do:
+ * m = ctx->mrec;
+ * a = ctx->attr;
+ * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
+ * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
+ *
+ * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove
+ * from the runlist or mark sparse the freed runs later.
+ *
+ * Return the number of deallocated clusters (not counting sparse ones) on
+ * success and -errno on error.
+ *
+ * WARNING: If @ctx is supplied, regardless of whether success or failure is
+ * returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
+ * is no longer valid, i.e. you need to either call
+ * ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
+ * In that case PTR_ERR(@ctx->mrec) will give you the error code for
+ * why the mapping of the old inode failed.
+ *
+ * Locking: - The runlist described by @ni must be locked for writing on entry
+ * and is locked on return. Note the runlist may be modified when
+ * needed runlist fragments need to be mapped.
+ * - The volume lcn bitmap must be unlocked on entry and is unlocked
+ * on return.
+ * - This function takes the volume lcn bitmap lock for writing and
+ * modifies the bitmap contents.
+ * - If @ctx is NULL, the base mft record of @ni must not be mapped on
+ * entry and it will be left unmapped on return.
+ * - If @ctx is not NULL, the base mft record must be mapped on entry
+ * and it will be left mapped on return.
+ */
+static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
+ s64 count, ntfs_attr_search_ctx *ctx)
+{
+ return __ntfs_cluster_free(ni, start_vcn, count, ctx, false);
+}
+
+extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
+ const runlist_element *rl);
+
+/**
+ * ntfs_cluster_free_from_rl - free clusters from runlist
+ * @vol: mounted ntfs volume on which to free the clusters
+ * @rl: runlist describing the clusters to free
+ *
+ * Free all the clusters described by the runlist @rl on the volume @vol. In
+ * the case of an error being returned, at least some of the clusters were not
+ * freed.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - This function takes the volume lcn bitmap lock for writing and
+ * modifies the bitmap contents.
+ * - The caller must have locked the runlist @rl for reading or
+ * writing.
+ */
+static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
+ const runlist_element *rl)
+{
+ int ret;
+
+ down_write(&vol->lcnbmp_lock);
+ ret = ntfs_cluster_free_from_rl_nolock(vol, rl);
+ up_write(&vol->lcnbmp_lock);
+ return ret;
+}
+
+#endif /* NTFS_RW */
+
+#endif /* defined _LINUX_NTFS_LCNALLOC_H */
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
new file mode 100644
index 000000000..6ce60ffc6
--- /dev/null
+++ b/fs/ntfs/logfile.c
@@ -0,0 +1,849 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2002-2007 Anton Altaparmakov
+ */
+
+#ifdef NTFS_RW
+
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <linux/log2.h>
+#include <linux/bio.h>
+
+#include "attrib.h"
+#include "aops.h"
+#include "debug.h"
+#include "logfile.h"
+#include "malloc.h"
+#include "volume.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_check_restart_page_header - check the page header for consistency
+ * @vi: $LogFile inode to which the restart page header belongs
+ * @rp: restart page header to check
+ * @pos: position in @vi at which the restart page header resides
+ *
+ * Check the restart page header @rp for consistency and return 'true' if it is
+ * consistent and 'false' otherwise.
+ *
+ * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
+ * require the full restart page.
+ */
+static bool ntfs_check_restart_page_header(struct inode *vi,
+ RESTART_PAGE_HEADER *rp, s64 pos)
+{
+ u32 logfile_system_page_size, logfile_log_page_size;
+ u16 ra_ofs, usa_count, usa_ofs, usa_end = 0;
+ bool have_usa = true;
+
+ ntfs_debug("Entering.");
+ /*
+ * If the system or log page sizes are smaller than the ntfs block size
+ * or either is not a power of 2 we cannot handle this log file.
+ */
+ logfile_system_page_size = le32_to_cpu(rp->system_page_size);
+ logfile_log_page_size = le32_to_cpu(rp->log_page_size);
+ if (logfile_system_page_size < NTFS_BLOCK_SIZE ||
+ logfile_log_page_size < NTFS_BLOCK_SIZE ||
+ logfile_system_page_size &
+ (logfile_system_page_size - 1) ||
+ !is_power_of_2(logfile_log_page_size)) {
+ ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
+ return false;
+ }
+ /*
+ * We must be either at !pos (1st restart page) or at pos = system page
+ * size (2nd restart page).
+ */
+ if (pos && pos != logfile_system_page_size) {
+ ntfs_error(vi->i_sb, "Found restart area in incorrect "
+ "position in $LogFile.");
+ return false;
+ }
+ /* We only know how to handle version 1.1. */
+ if (sle16_to_cpu(rp->major_ver) != 1 ||
+ sle16_to_cpu(rp->minor_ver) != 1) {
+ ntfs_error(vi->i_sb, "$LogFile version %i.%i is not "
+ "supported. (This driver supports version "
+ "1.1 only.)", (int)sle16_to_cpu(rp->major_ver),
+ (int)sle16_to_cpu(rp->minor_ver));
+ return false;
+ }
+ /*
+ * If chkdsk has been run the restart page may not be protected by an
+ * update sequence array.
+ */
+ if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) {
+ have_usa = false;
+ goto skip_usa_checks;
+ }
+ /* Verify the size of the update sequence array. */
+ usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS);
+ if (usa_count != le16_to_cpu(rp->usa_count)) {
+ ntfs_error(vi->i_sb, "$LogFile restart page specifies "
+ "inconsistent update sequence array count.");
+ return false;
+ }
+ /* Verify the position of the update sequence array. */
+ usa_ofs = le16_to_cpu(rp->usa_ofs);
+ usa_end = usa_ofs + usa_count * sizeof(u16);
+ if (usa_ofs < sizeof(RESTART_PAGE_HEADER) ||
+ usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) {
+ ntfs_error(vi->i_sb, "$LogFile restart page specifies "
+ "inconsistent update sequence array offset.");
+ return false;
+ }
+skip_usa_checks:
+ /*
+ * Verify the position of the restart area. It must be:
+ * - aligned to 8-byte boundary,
+ * - after the update sequence array, and
+ * - within the system page size.
+ */
+ ra_ofs = le16_to_cpu(rp->restart_area_offset);
+ if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end :
+ ra_ofs < sizeof(RESTART_PAGE_HEADER)) ||
+ ra_ofs > logfile_system_page_size) {
+ ntfs_error(vi->i_sb, "$LogFile restart page specifies "
+ "inconsistent restart area offset.");
+ return false;
+ }
+ /*
+ * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn
+ * set.
+ */
+ if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
+ ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
+ "by chkdsk but a chkdsk LSN is specified.");
+ return false;
+ }
+ ntfs_debug("Done.");
+ return true;
+}
+
+/**
+ * ntfs_check_restart_area - check the restart area for consistency
+ * @vi: $LogFile inode to which the restart page belongs
+ * @rp: restart page whose restart area to check
+ *
+ * Check the restart area of the restart page @rp for consistency and return
+ * 'true' if it is consistent and 'false' otherwise.
+ *
+ * This function assumes that the restart page header has already been
+ * consistency checked.
+ *
+ * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
+ * require the full restart page.
+ */
+static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
+{
+ u64 file_size;
+ RESTART_AREA *ra;
+ u16 ra_ofs, ra_len, ca_ofs;
+ u8 fs_bits;
+
+ ntfs_debug("Entering.");
+ ra_ofs = le16_to_cpu(rp->restart_area_offset);
+ ra = (RESTART_AREA*)((u8*)rp + ra_ofs);
+ /*
+ * Everything before ra->file_size must be before the first word
+ * protected by an update sequence number. This ensures that it is
+ * safe to access ra->client_array_offset.
+ */
+ if (ra_ofs + offsetof(RESTART_AREA, file_size) >
+ NTFS_BLOCK_SIZE - sizeof(u16)) {
+ ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+ "inconsistent file offset.");
+ return false;
+ }
+ /*
+ * Now that we can access ra->client_array_offset, make sure everything
+ * up to the log client array is before the first word protected by an
+ * update sequence number. This ensures we can access all of the
+ * restart area elements safely. Also, the client array offset must be
+ * aligned to an 8-byte boundary.
+ */
+ ca_ofs = le16_to_cpu(ra->client_array_offset);
+ if (((ca_ofs + 7) & ~7) != ca_ofs ||
+ ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
+ ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+ "inconsistent client array offset.");
+ return false;
+ }
+ /*
+ * The restart area must end within the system page size both when
+ * calculated manually and as specified by ra->restart_area_length.
+ * Also, the calculated length must not exceed the specified length.
+ */
+ ra_len = ca_ofs + le16_to_cpu(ra->log_clients) *
+ sizeof(LOG_CLIENT_RECORD);
+ if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) ||
+ ra_ofs + le16_to_cpu(ra->restart_area_length) >
+ le32_to_cpu(rp->system_page_size) ||
+ ra_len > le16_to_cpu(ra->restart_area_length)) {
+ ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds "
+ "of the system page size specified by the "
+ "restart page header and/or the specified "
+ "restart area length is inconsistent.");
+ return false;
+ }
+ /*
+ * The ra->client_free_list and ra->client_in_use_list must be either
+ * LOGFILE_NO_CLIENT or less than ra->log_clients or they are
+ * overflowing the client array.
+ */
+ if ((ra->client_free_list != LOGFILE_NO_CLIENT &&
+ le16_to_cpu(ra->client_free_list) >=
+ le16_to_cpu(ra->log_clients)) ||
+ (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
+ le16_to_cpu(ra->client_in_use_list) >=
+ le16_to_cpu(ra->log_clients))) {
+ ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+ "overflowing client free and/or in use lists.");
+ return false;
+ }
+ /*
+ * Check ra->seq_number_bits against ra->file_size for consistency.
+ * We cannot just use ffs() because the file size is not a power of 2.
+ */
+ file_size = (u64)sle64_to_cpu(ra->file_size);
+ fs_bits = 0;
+ while (file_size) {
+ file_size >>= 1;
+ fs_bits++;
+ }
+ if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
+ ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+ "inconsistent sequence number bits.");
+ return false;
+ }
+ /* The log record header length must be a multiple of 8. */
+ if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
+ le16_to_cpu(ra->log_record_header_length)) {
+ ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+ "inconsistent log record header length.");
+ return false;
+ }
+ /* Dito for the log page data offset. */
+ if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
+ le16_to_cpu(ra->log_page_data_offset)) {
+ ntfs_error(vi->i_sb, "$LogFile restart area specifies "
+ "inconsistent log page data offset.");
+ return false;
+ }
+ ntfs_debug("Done.");
+ return true;
+}
+
+/**
+ * ntfs_check_log_client_array - check the log client array for consistency
+ * @vi: $LogFile inode to which the restart page belongs
+ * @rp: restart page whose log client array to check
+ *
+ * Check the log client array of the restart page @rp for consistency and
+ * return 'true' if it is consistent and 'false' otherwise.
+ *
+ * This function assumes that the restart page header and the restart area have
+ * already been consistency checked.
+ *
+ * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this
+ * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full
+ * restart page and the page must be multi sector transfer deprotected.
+ */
+static bool ntfs_check_log_client_array(struct inode *vi,
+ RESTART_PAGE_HEADER *rp)
+{
+ RESTART_AREA *ra;
+ LOG_CLIENT_RECORD *ca, *cr;
+ u16 nr_clients, idx;
+ bool in_free_list, idx_is_first;
+
+ ntfs_debug("Entering.");
+ ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
+ ca = (LOG_CLIENT_RECORD*)((u8*)ra +
+ le16_to_cpu(ra->client_array_offset));
+ /*
+ * Check the ra->client_free_list first and then check the
+ * ra->client_in_use_list. Check each of the log client records in
+ * each of the lists and check that the array does not overflow the
+ * ra->log_clients value. Also keep track of the number of records
+ * visited as there cannot be more than ra->log_clients records and
+ * that way we detect eventual loops in within a list.
+ */
+ nr_clients = le16_to_cpu(ra->log_clients);
+ idx = le16_to_cpu(ra->client_free_list);
+ in_free_list = true;
+check_list:
+ for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
+ idx = le16_to_cpu(cr->next_client)) {
+ if (!nr_clients || idx >= le16_to_cpu(ra->log_clients))
+ goto err_out;
+ /* Set @cr to the current log client record. */
+ cr = ca + idx;
+ /* The first log client record must not have a prev_client. */
+ if (idx_is_first) {
+ if (cr->prev_client != LOGFILE_NO_CLIENT)
+ goto err_out;
+ idx_is_first = false;
+ }
+ }
+ /* Switch to and check the in use list if we just did the free list. */
+ if (in_free_list) {
+ in_free_list = false;
+ idx = le16_to_cpu(ra->client_in_use_list);
+ goto check_list;
+ }
+ ntfs_debug("Done.");
+ return true;
+err_out:
+ ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
+ return false;
+}
+
+/**
+ * ntfs_check_and_load_restart_page - check the restart page for consistency
+ * @vi: $LogFile inode to which the restart page belongs
+ * @rp: restart page to check
+ * @pos: position in @vi at which the restart page resides
+ * @wrp: [OUT] copy of the multi sector transfer deprotected restart page
+ * @lsn: [OUT] set to the current logfile lsn on success
+ *
+ * Check the restart page @rp for consistency and return 0 if it is consistent
+ * and -errno otherwise. The restart page may have been modified by chkdsk in
+ * which case its magic is CHKD instead of RSTR.
+ *
+ * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
+ * require the full restart page.
+ *
+ * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
+ * copy of the complete multi sector transfer deprotected page. On failure,
+ * *@wrp is undefined.
+ *
+ * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
+ * logfile lsn according to this restart page. On failure, *@lsn is undefined.
+ *
+ * The following error codes are defined:
+ * -EINVAL - The restart page is inconsistent.
+ * -ENOMEM - Not enough memory to load the restart page.
+ * -EIO - Failed to reading from $LogFile.
+ */
+static int ntfs_check_and_load_restart_page(struct inode *vi,
+ RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
+ LSN *lsn)
+{
+ RESTART_AREA *ra;
+ RESTART_PAGE_HEADER *trp;
+ int size, err;
+
+ ntfs_debug("Entering.");
+ /* Check the restart page header for consistency. */
+ if (!ntfs_check_restart_page_header(vi, rp, pos)) {
+ /* Error output already done inside the function. */
+ return -EINVAL;
+ }
+ /* Check the restart area for consistency. */
+ if (!ntfs_check_restart_area(vi, rp)) {
+ /* Error output already done inside the function. */
+ return -EINVAL;
+ }
+ ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
+ /*
+ * Allocate a buffer to store the whole restart page so we can multi
+ * sector transfer deprotect it.
+ */
+ trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size));
+ if (!trp) {
+ ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
+ "restart page buffer.");
+ return -ENOMEM;
+ }
+ /*
+ * Read the whole of the restart page into the buffer. If it fits
+ * completely inside @rp, just copy it from there. Otherwise map all
+ * the required pages and copy the data from them.
+ */
+ size = PAGE_SIZE - (pos & ~PAGE_MASK);
+ if (size >= le32_to_cpu(rp->system_page_size)) {
+ memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
+ } else {
+ pgoff_t idx;
+ struct page *page;
+ int have_read, to_read;
+
+ /* First copy what we already have in @rp. */
+ memcpy(trp, rp, size);
+ /* Copy the remaining data one page at a time. */
+ have_read = size;
+ to_read = le32_to_cpu(rp->system_page_size) - size;
+ idx = (pos + size) >> PAGE_SHIFT;
+ BUG_ON((pos + size) & ~PAGE_MASK);
+ do {
+ page = ntfs_map_page(vi->i_mapping, idx);
+ if (IS_ERR(page)) {
+ ntfs_error(vi->i_sb, "Error mapping $LogFile "
+ "page (index %lu).", idx);
+ err = PTR_ERR(page);
+ if (err != -EIO && err != -ENOMEM)
+ err = -EIO;
+ goto err_out;
+ }
+ size = min_t(int, to_read, PAGE_SIZE);
+ memcpy((u8*)trp + have_read, page_address(page), size);
+ ntfs_unmap_page(page);
+ have_read += size;
+ to_read -= size;
+ idx++;
+ } while (to_read > 0);
+ }
+ /*
+ * Perform the multi sector transfer deprotection on the buffer if the
+ * restart page is protected.
+ */
+ if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count))
+ && post_read_mst_fixup((NTFS_RECORD*)trp,
+ le32_to_cpu(rp->system_page_size))) {
+ /*
+ * A multi sector tranfer error was detected. We only need to
+ * abort if the restart page contents exceed the multi sector
+ * transfer fixup of the first sector.
+ */
+ if (le16_to_cpu(rp->restart_area_offset) +
+ le16_to_cpu(ra->restart_area_length) >
+ NTFS_BLOCK_SIZE - sizeof(u16)) {
+ ntfs_error(vi->i_sb, "Multi sector transfer error "
+ "detected in $LogFile restart page.");
+ err = -EINVAL;
+ goto err_out;
+ }
+ }
+ /*
+ * If the restart page is modified by chkdsk or there are no active
+ * logfile clients, the logfile is consistent. Otherwise, need to
+ * check the log client records for consistency, too.
+ */
+ err = 0;
+ if (ntfs_is_rstr_record(rp->magic) &&
+ ra->client_in_use_list != LOGFILE_NO_CLIENT) {
+ if (!ntfs_check_log_client_array(vi, trp)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ }
+ if (lsn) {
+ if (ntfs_is_rstr_record(rp->magic))
+ *lsn = sle64_to_cpu(ra->current_lsn);
+ else /* if (ntfs_is_chkd_record(rp->magic)) */
+ *lsn = sle64_to_cpu(rp->chkdsk_lsn);
+ }
+ ntfs_debug("Done.");
+ if (wrp)
+ *wrp = trp;
+ else {
+err_out:
+ ntfs_free(trp);
+ }
+ return err;
+}
+
+/**
+ * ntfs_check_logfile - check the journal for consistency
+ * @log_vi: struct inode of loaded journal $LogFile to check
+ * @rp: [OUT] on success this is a copy of the current restart page
+ *
+ * Check the $LogFile journal for consistency and return 'true' if it is
+ * consistent and 'false' if not. On success, the current restart page is
+ * returned in *@rp. Caller must call ntfs_free(*@rp) when finished with it.
+ *
+ * At present we only check the two restart pages and ignore the log record
+ * pages.
+ *
+ * Note that the MstProtected flag is not set on the $LogFile inode and hence
+ * when reading pages they are not deprotected. This is because we do not know
+ * if the $LogFile was created on a system with a different page size to ours
+ * yet and mst deprotection would fail if our page size is smaller.
+ */
+bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
+{
+ s64 size, pos;
+ LSN rstr1_lsn, rstr2_lsn;
+ ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
+ struct address_space *mapping = log_vi->i_mapping;
+ struct page *page = NULL;
+ u8 *kaddr = NULL;
+ RESTART_PAGE_HEADER *rstr1_ph = NULL;
+ RESTART_PAGE_HEADER *rstr2_ph = NULL;
+ int log_page_size, err;
+ bool logfile_is_empty = true;
+ u8 log_page_bits;
+
+ ntfs_debug("Entering.");
+ /* An empty $LogFile must have been clean before it got emptied. */
+ if (NVolLogFileEmpty(vol))
+ goto is_empty;
+ size = i_size_read(log_vi);
+ /* Make sure the file doesn't exceed the maximum allowed size. */
+ if (size > MaxLogFileSize)
+ size = MaxLogFileSize;
+ /*
+ * Truncate size to a multiple of the page cache size or the default
+ * log page size if the page cache size is between the default log page
+ * log page size if the page cache size is between the default log page
+ * size and twice that.
+ */
+ if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <=
+ DefaultLogPageSize * 2)
+ log_page_size = DefaultLogPageSize;
+ else
+ log_page_size = PAGE_SIZE;
+ /*
+ * Use ntfs_ffs() instead of ffs() to enable the compiler to
+ * optimize log_page_size and log_page_bits into constants.
+ */
+ log_page_bits = ntfs_ffs(log_page_size) - 1;
+ size &= ~(s64)(log_page_size - 1);
+ /*
+ * Ensure the log file is big enough to store at least the two restart
+ * pages and the minimum number of log record pages.
+ */
+ if (size < log_page_size * 2 || (size - log_page_size * 2) >>
+ log_page_bits < MinLogRecordPages) {
+ ntfs_error(vol->sb, "$LogFile is too small.");
+ return false;
+ }
+ /*
+ * Read through the file looking for a restart page. Since the restart
+ * page header is at the beginning of a page we only need to search at
+ * what could be the beginning of a page (for each page size) rather
+ * than scanning the whole file byte by byte. If all potential places
+ * contain empty and uninitialzed records, the log file can be assumed
+ * to be empty.
+ */
+ for (pos = 0; pos < size; pos <<= 1) {
+ pgoff_t idx = pos >> PAGE_SHIFT;
+ if (!page || page->index != idx) {
+ if (page)
+ ntfs_unmap_page(page);
+ page = ntfs_map_page(mapping, idx);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Error mapping $LogFile "
+ "page (index %lu).", idx);
+ goto err_out;
+ }
+ }
+ kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK);
+ /*
+ * A non-empty block means the logfile is not empty while an
+ * empty block after a non-empty block has been encountered
+ * means we are done.
+ */
+ if (!ntfs_is_empty_recordp((le32*)kaddr))
+ logfile_is_empty = false;
+ else if (!logfile_is_empty)
+ break;
+ /*
+ * A log record page means there cannot be a restart page after
+ * this so no need to continue searching.
+ */
+ if (ntfs_is_rcrd_recordp((le32*)kaddr))
+ break;
+ /* If not a (modified by chkdsk) restart page, continue. */
+ if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
+ !ntfs_is_chkd_recordp((le32*)kaddr)) {
+ if (!pos)
+ pos = NTFS_BLOCK_SIZE >> 1;
+ continue;
+ }
+ /*
+ * Check the (modified by chkdsk) restart page for consistency
+ * and get a copy of the complete multi sector transfer
+ * deprotected restart page.
+ */
+ err = ntfs_check_and_load_restart_page(log_vi,
+ (RESTART_PAGE_HEADER*)kaddr, pos,
+ !rstr1_ph ? &rstr1_ph : &rstr2_ph,
+ !rstr1_ph ? &rstr1_lsn : &rstr2_lsn);
+ if (!err) {
+ /*
+ * If we have now found the first (modified by chkdsk)
+ * restart page, continue looking for the second one.
+ */
+ if (!pos) {
+ pos = NTFS_BLOCK_SIZE >> 1;
+ continue;
+ }
+ /*
+ * We have now found the second (modified by chkdsk)
+ * restart page, so we can stop looking.
+ */
+ break;
+ }
+ /*
+ * Error output already done inside the function. Note, we do
+ * not abort if the restart page was invalid as we might still
+ * find a valid one further in the file.
+ */
+ if (err != -EINVAL) {
+ ntfs_unmap_page(page);
+ goto err_out;
+ }
+ /* Continue looking. */
+ if (!pos)
+ pos = NTFS_BLOCK_SIZE >> 1;
+ }
+ if (page)
+ ntfs_unmap_page(page);
+ if (logfile_is_empty) {
+ NVolSetLogFileEmpty(vol);
+is_empty:
+ ntfs_debug("Done. ($LogFile is empty.)");
+ return true;
+ }
+ if (!rstr1_ph) {
+ BUG_ON(rstr2_ph);
+ ntfs_error(vol->sb, "Did not find any restart pages in "
+ "$LogFile and it was not empty.");
+ return false;
+ }
+ /* If both restart pages were found, use the more recent one. */
+ if (rstr2_ph) {
+ /*
+ * If the second restart area is more recent, switch to it.
+ * Otherwise just throw it away.
+ */
+ if (rstr2_lsn > rstr1_lsn) {
+ ntfs_debug("Using second restart page as it is more "
+ "recent.");
+ ntfs_free(rstr1_ph);
+ rstr1_ph = rstr2_ph;
+ /* rstr1_lsn = rstr2_lsn; */
+ } else {
+ ntfs_debug("Using first restart page as it is more "
+ "recent.");
+ ntfs_free(rstr2_ph);
+ }
+ rstr2_ph = NULL;
+ }
+ /* All consistency checks passed. */
+ if (rp)
+ *rp = rstr1_ph;
+ else
+ ntfs_free(rstr1_ph);
+ ntfs_debug("Done.");
+ return true;
+err_out:
+ if (rstr1_ph)
+ ntfs_free(rstr1_ph);
+ return false;
+}
+
+/**
+ * ntfs_is_logfile_clean - check in the journal if the volume is clean
+ * @log_vi: struct inode of loaded journal $LogFile to check
+ * @rp: copy of the current restart page
+ *
+ * Analyze the $LogFile journal and return 'true' if it indicates the volume was
+ * shutdown cleanly and 'false' if not.
+ *
+ * At present we only look at the two restart pages and ignore the log record
+ * pages. This is a little bit crude in that there will be a very small number
+ * of cases where we think that a volume is dirty when in fact it is clean.
+ * This should only affect volumes that have not been shutdown cleanly but did
+ * not have any pending, non-check-pointed i/o, i.e. they were completely idle
+ * at least for the five seconds preceding the unclean shutdown.
+ *
+ * This function assumes that the $LogFile journal has already been consistency
+ * checked by a call to ntfs_check_logfile() and in particular if the $LogFile
+ * is empty this function requires that NVolLogFileEmpty() is true otherwise an
+ * empty volume will be reported as dirty.
+ */
+bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
+{
+ ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
+ RESTART_AREA *ra;
+
+ ntfs_debug("Entering.");
+ /* An empty $LogFile must have been clean before it got emptied. */
+ if (NVolLogFileEmpty(vol)) {
+ ntfs_debug("Done. ($LogFile is empty.)");
+ return true;
+ }
+ BUG_ON(!rp);
+ if (!ntfs_is_rstr_record(rp->magic) &&
+ !ntfs_is_chkd_record(rp->magic)) {
+ ntfs_error(vol->sb, "Restart page buffer is invalid. This is "
+ "probably a bug in that the $LogFile should "
+ "have been consistency checked before calling "
+ "this function.");
+ return false;
+ }
+ ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
+ /*
+ * If the $LogFile has active clients, i.e. it is open, and we do not
+ * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags,
+ * we assume there was an unclean shutdown.
+ */
+ if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
+ !(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
+ ntfs_debug("Done. $LogFile indicates a dirty shutdown.");
+ return false;
+ }
+ /* $LogFile indicates a clean shutdown. */
+ ntfs_debug("Done. $LogFile indicates a clean shutdown.");
+ return true;
+}
+
+/**
+ * ntfs_empty_logfile - empty the contents of the $LogFile journal
+ * @log_vi: struct inode of loaded journal $LogFile to empty
+ *
+ * Empty the contents of the $LogFile journal @log_vi and return 'true' on
+ * success and 'false' on error.
+ *
+ * This function assumes that the $LogFile journal has already been consistency
+ * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean()
+ * has been used to ensure that the $LogFile is clean.
+ */
+bool ntfs_empty_logfile(struct inode *log_vi)
+{
+ VCN vcn, end_vcn;
+ ntfs_inode *log_ni = NTFS_I(log_vi);
+ ntfs_volume *vol = log_ni->vol;
+ struct super_block *sb = vol->sb;
+ runlist_element *rl;
+ unsigned long flags;
+ unsigned block_size, block_size_bits;
+ int err;
+ bool should_wait = true;
+
+ ntfs_debug("Entering.");
+ if (NVolLogFileEmpty(vol)) {
+ ntfs_debug("Done.");
+ return true;
+ }
+ /*
+ * We cannot use ntfs_attr_set() because we may be still in the middle
+ * of a mount operation. Thus we do the emptying by hand by first
+ * zapping the page cache pages for the $LogFile/$DATA attribute and
+ * then emptying each of the buffers in each of the clusters specified
+ * by the runlist by hand.
+ */
+ block_size = sb->s_blocksize;
+ block_size_bits = sb->s_blocksize_bits;
+ vcn = 0;
+ read_lock_irqsave(&log_ni->size_lock, flags);
+ end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >>
+ vol->cluster_size_bits;
+ read_unlock_irqrestore(&log_ni->size_lock, flags);
+ truncate_inode_pages(log_vi->i_mapping, 0);
+ down_write(&log_ni->runlist.lock);
+ rl = log_ni->runlist.rl;
+ if (unlikely(!rl || vcn < rl->vcn || !rl->length)) {
+map_vcn:
+ err = ntfs_map_runlist_nolock(log_ni, vcn, NULL);
+ if (err) {
+ ntfs_error(sb, "Failed to map runlist fragment (error "
+ "%d).", -err);
+ goto err;
+ }
+ rl = log_ni->runlist.rl;
+ BUG_ON(!rl || vcn < rl->vcn || !rl->length);
+ }
+ /* Seek to the runlist element containing @vcn. */
+ while (rl->length && vcn >= rl[1].vcn)
+ rl++;
+ do {
+ LCN lcn;
+ sector_t block, end_block;
+ s64 len;
+
+ /*
+ * If this run is not mapped map it now and start again as the
+ * runlist will have been updated.
+ */
+ lcn = rl->lcn;
+ if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
+ vcn = rl->vcn;
+ goto map_vcn;
+ }
+ /* If this run is not valid abort with an error. */
+ if (unlikely(!rl->length || lcn < LCN_HOLE))
+ goto rl_err;
+ /* Skip holes. */
+ if (lcn == LCN_HOLE)
+ continue;
+ block = lcn << vol->cluster_size_bits >> block_size_bits;
+ len = rl->length;
+ if (rl[1].vcn > end_vcn)
+ len = end_vcn - rl->vcn;
+ end_block = (lcn + len) << vol->cluster_size_bits >>
+ block_size_bits;
+ /* Iterate over the blocks in the run and empty them. */
+ do {
+ struct buffer_head *bh;
+
+ /* Obtain the buffer, possibly not uptodate. */
+ bh = sb_getblk(sb, block);
+ BUG_ON(!bh);
+ /* Setup buffer i/o submission. */
+ lock_buffer(bh);
+ bh->b_end_io = end_buffer_write_sync;
+ get_bh(bh);
+ /* Set the entire contents of the buffer to 0xff. */
+ memset(bh->b_data, -1, block_size);
+ if (!buffer_uptodate(bh))
+ set_buffer_uptodate(bh);
+ if (buffer_dirty(bh))
+ clear_buffer_dirty(bh);
+ /*
+ * Submit the buffer and wait for i/o to complete but
+ * only for the first buffer so we do not miss really
+ * serious i/o errors. Once the first buffer has
+ * completed ignore errors afterwards as we can assume
+ * that if one buffer worked all of them will work.
+ */
+ submit_bh(REQ_OP_WRITE, bh);
+ if (should_wait) {
+ should_wait = false;
+ wait_on_buffer(bh);
+ if (unlikely(!buffer_uptodate(bh)))
+ goto io_err;
+ }
+ brelse(bh);
+ } while (++block < end_block);
+ } while ((++rl)->vcn < end_vcn);
+ up_write(&log_ni->runlist.lock);
+ /*
+ * Zap the pages again just in case any got instantiated whilst we were
+ * emptying the blocks by hand. FIXME: We may not have completed
+ * writing to all the buffer heads yet so this may happen too early.
+ * We really should use a kernel thread to do the emptying
+ * asynchronously and then we can also set the volume dirty and output
+ * an error message if emptying should fail.
+ */
+ truncate_inode_pages(log_vi->i_mapping, 0);
+ /* Set the flag so we do not have to do it again on remount. */
+ NVolSetLogFileEmpty(vol);
+ ntfs_debug("Done.");
+ return true;
+io_err:
+ ntfs_error(sb, "Failed to write buffer. Unmount and run chkdsk.");
+ goto dirty_err;
+rl_err:
+ ntfs_error(sb, "Runlist is corrupt. Unmount and run chkdsk.");
+dirty_err:
+ NVolSetErrors(vol);
+ err = -EIO;
+err:
+ up_write(&log_ni->runlist.lock);
+ ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).",
+ -err);
+ return false;
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
new file mode 100644
index 000000000..429d4909c
--- /dev/null
+++ b/fs/ntfs/logfile.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * logfile.h - Defines for NTFS kernel journal ($LogFile) handling. Part of
+ * the Linux-NTFS project.
+ *
+ * Copyright (c) 2000-2005 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_LOGFILE_H
+#define _LINUX_NTFS_LOGFILE_H
+
+#ifdef NTFS_RW
+
+#include <linux/fs.h>
+
+#include "types.h"
+#include "endian.h"
+#include "layout.h"
+
+/*
+ * Journal ($LogFile) organization:
+ *
+ * Two restart areas present in the first two pages (restart pages, one restart
+ * area in each page). When the volume is dismounted they should be identical,
+ * except for the update sequence array which usually has a different update
+ * sequence number.
+ *
+ * These are followed by log records organized in pages headed by a log record
+ * header going up to log file size. Not all pages contain log records when a
+ * volume is first formatted, but as the volume ages, all records will be used.
+ * When the log file fills up, the records at the beginning are purged (by
+ * modifying the oldest_lsn to a higher value presumably) and writing begins
+ * at the beginning of the file. Effectively, the log file is viewed as a
+ * circular entity.
+ *
+ * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept
+ * versions <= 1.x, including 0.-1. (Yes, that is a minus one in there!) We
+ * probably only want to support 1.1 as this seems to be the current version
+ * and we don't know how that differs from the older versions. The only
+ * exception is if the journal is clean as marked by the two restart pages
+ * then it doesn't matter whether we are on an earlier version. We can just
+ * reinitialize the logfile and start again with version 1.1.
+ */
+
+/* Some $LogFile related constants. */
+#define MaxLogFileSize 0x100000000ULL
+#define DefaultLogPageSize 4096
+#define MinLogRecordPages 48
+
+/*
+ * Log file restart page header (begins the restart area).
+ */
+typedef struct {
+/*Ofs*/
+/* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
+/* 0*/ NTFS_RECORD_TYPE magic; /* The magic is "RSTR". */
+/* 4*/ le16 usa_ofs; /* See NTFS_RECORD definition in layout.h.
+ When creating, set this to be immediately
+ after this header structure (without any
+ alignment). */
+/* 6*/ le16 usa_count; /* See NTFS_RECORD definition in layout.h. */
+
+/* 8*/ leLSN chkdsk_lsn; /* The last log file sequence number found by
+ chkdsk. Only used when the magic is changed
+ to "CHKD". Otherwise this is zero. */
+/* 16*/ le32 system_page_size; /* Byte size of system pages when the log file
+ was created, has to be >= 512 and a power of
+ 2. Use this to calculate the required size
+ of the usa (usa_count) and add it to usa_ofs.
+ Then verify that the result is less than the
+ value of the restart_area_offset. */
+/* 20*/ le32 log_page_size; /* Byte size of log file pages, has to be >=
+ 512 and a power of 2. The default is 4096
+ and is used when the system page size is
+ between 4096 and 8192. Otherwise this is
+ set to the system page size instead. */
+/* 24*/ le16 restart_area_offset;/* Byte offset from the start of this header to
+ the RESTART_AREA. Value has to be aligned
+ to 8-byte boundary. When creating, set this
+ to be after the usa. */
+/* 26*/ sle16 minor_ver; /* Log file minor version. Only check if major
+ version is 1. */
+/* 28*/ sle16 major_ver; /* Log file major version. We only support
+ version 1.1. */
+/* sizeof() = 30 (0x1e) bytes */
+} __attribute__ ((__packed__)) RESTART_PAGE_HEADER;
+
+/*
+ * Constant for the log client indices meaning that there are no client records
+ * in this particular client array. Also inside the client records themselves,
+ * this means that there are no client records preceding or following this one.
+ */
+#define LOGFILE_NO_CLIENT cpu_to_le16(0xffff)
+#define LOGFILE_NO_CLIENT_CPU 0xffff
+
+/*
+ * These are the so far known RESTART_AREA_* flags (16-bit) which contain
+ * information about the log file in which they are present.
+ */
+enum {
+ RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002),
+ RESTART_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
+} __attribute__ ((__packed__));
+
+typedef le16 RESTART_AREA_FLAGS;
+
+/*
+ * Log file restart area record. The offset of this record is found by adding
+ * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found
+ * in it. See notes at restart_area_offset above.
+ */
+typedef struct {
+/*Ofs*/
+/* 0*/ leLSN current_lsn; /* The current, i.e. last LSN inside the log
+ when the restart area was last written.
+ This happens often but what is the interval?
+ Is it just fixed time or is it every time a
+ check point is written or somethine else?
+ On create set to 0. */
+/* 8*/ le16 log_clients; /* Number of log client records in the array of
+ log client records which follows this
+ restart area. Must be 1. */
+/* 10*/ le16 client_free_list; /* The index of the first free log client record
+ in the array of log client records.
+ LOGFILE_NO_CLIENT means that there are no
+ free log client records in the array.
+ If != LOGFILE_NO_CLIENT, check that
+ log_clients > client_free_list. On Win2k
+ and presumably earlier, on a clean volume
+ this is != LOGFILE_NO_CLIENT, and it should
+ be 0, i.e. the first (and only) client
+ record is free and thus the logfile is
+ closed and hence clean. A dirty volume
+ would have left the logfile open and hence
+ this would be LOGFILE_NO_CLIENT. On WinXP
+ and presumably later, the logfile is always
+ open, even on clean shutdown so this should
+ always be LOGFILE_NO_CLIENT. */
+/* 12*/ le16 client_in_use_list;/* The index of the first in-use log client
+ record in the array of log client records.
+ LOGFILE_NO_CLIENT means that there are no
+ in-use log client records in the array. If
+ != LOGFILE_NO_CLIENT check that log_clients
+ > client_in_use_list. On Win2k and
+ presumably earlier, on a clean volume this
+ is LOGFILE_NO_CLIENT, i.e. there are no
+ client records in use and thus the logfile
+ is closed and hence clean. A dirty volume
+ would have left the logfile open and hence
+ this would be != LOGFILE_NO_CLIENT, and it
+ should be 0, i.e. the first (and only)
+ client record is in use. On WinXP and
+ presumably later, the logfile is always
+ open, even on clean shutdown so this should
+ always be 0. */
+/* 14*/ RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour. On Win2k
+ and presumably earlier this is always 0. On
+ WinXP and presumably later, if the logfile
+ was shutdown cleanly, the second bit,
+ RESTART_VOLUME_IS_CLEAN, is set. This bit
+ is cleared when the volume is mounted by
+ WinXP and set when the volume is dismounted,
+ thus if the logfile is dirty, this bit is
+ clear. Thus we don't need to check the
+ Windows version to determine if the logfile
+ is clean. Instead if the logfile is closed,
+ we know it must be clean. If it is open and
+ this bit is set, we also know it must be
+ clean. If on the other hand the logfile is
+ open and this bit is clear, we can be almost
+ certain that the logfile is dirty. */
+/* 16*/ le32 seq_number_bits; /* How many bits to use for the sequence
+ number. This is calculated as 67 - the
+ number of bits required to store the logfile
+ size in bytes and this can be used in with
+ the specified file_size as a consistency
+ check. */
+/* 20*/ le16 restart_area_length;/* Length of the restart area including the
+ client array. Following checks required if
+ version matches. Otherwise, skip them.
+ restart_area_offset + restart_area_length
+ has to be <= system_page_size. Also,
+ restart_area_length has to be >=
+ client_array_offset + (log_clients *
+ sizeof(log client record)). */
+/* 22*/ le16 client_array_offset;/* Offset from the start of this record to
+ the first log client record if versions are
+ matched. When creating, set this to be
+ after this restart area structure, aligned
+ to 8-bytes boundary. If the versions do not
+ match, this is ignored and the offset is
+ assumed to be (sizeof(RESTART_AREA) + 7) &
+ ~7, i.e. rounded up to first 8-byte
+ boundary. Either way, client_array_offset
+ has to be aligned to an 8-byte boundary.
+ Also, restart_area_offset +
+ client_array_offset has to be <= 510.
+ Finally, client_array_offset + (log_clients
+ * sizeof(log client record)) has to be <=
+ system_page_size. On Win2k and presumably
+ earlier, this is 0x30, i.e. immediately
+ following this record. On WinXP and
+ presumably later, this is 0x40, i.e. there
+ are 16 extra bytes between this record and
+ the client array. This probably means that
+ the RESTART_AREA record is actually bigger
+ in WinXP and later. */
+/* 24*/ sle64 file_size; /* Usable byte size of the log file. If the
+ restart_area_offset + the offset of the
+ file_size are > 510 then corruption has
+ occurred. This is the very first check when
+ starting with the restart_area as if it
+ fails it means that some of the above values
+ will be corrupted by the multi sector
+ transfer protection. The file_size has to
+ be rounded down to be a multiple of the
+ log_page_size in the RESTART_PAGE_HEADER and
+ then it has to be at least big enough to
+ store the two restart pages and 48 (0x30)
+ log record pages. */
+/* 32*/ le32 last_lsn_data_length;/* Length of data of last LSN, not including
+ the log record header. On create set to
+ 0. */
+/* 36*/ le16 log_record_header_length;/* Byte size of the log record header.
+ If the version matches then check that the
+ value of log_record_header_length is a
+ multiple of 8, i.e.
+ (log_record_header_length + 7) & ~7 ==
+ log_record_header_length. When creating set
+ it to sizeof(LOG_RECORD_HEADER), aligned to
+ 8 bytes. */
+/* 38*/ le16 log_page_data_offset;/* Offset to the start of data in a log record
+ page. Must be a multiple of 8. On create
+ set it to immediately after the update
+ sequence array of the log record page. */
+/* 40*/ le32 restart_log_open_count;/* A counter that gets incremented every
+ time the logfile is restarted which happens
+ at mount time when the logfile is opened.
+ When creating set to a random value. Win2k
+ sets it to the low 32 bits of the current
+ system time in NTFS format (see time.h). */
+/* 44*/ le32 reserved; /* Reserved/alignment to 8-byte boundary. */
+/* sizeof() = 48 (0x30) bytes */
+} __attribute__ ((__packed__)) RESTART_AREA;
+
+/*
+ * Log client record. The offset of this record is found by adding the offset
+ * of the RESTART_AREA to the client_array_offset value found in it.
+ */
+typedef struct {
+/*Ofs*/
+/* 0*/ leLSN oldest_lsn; /* Oldest LSN needed by this client. On create
+ set to 0. */
+/* 8*/ leLSN client_restart_lsn;/* LSN at which this client needs to restart
+ the volume, i.e. the current position within
+ the log file. At present, if clean this
+ should = current_lsn in restart area but it
+ probably also = current_lsn when dirty most
+ of the time. At create set to 0. */
+/* 16*/ le16 prev_client; /* The offset to the previous log client record
+ in the array of log client records.
+ LOGFILE_NO_CLIENT means there is no previous
+ client record, i.e. this is the first one.
+ This is always LOGFILE_NO_CLIENT. */
+/* 18*/ le16 next_client; /* The offset to the next log client record in
+ the array of log client records.
+ LOGFILE_NO_CLIENT means there are no next
+ client records, i.e. this is the last one.
+ This is always LOGFILE_NO_CLIENT. */
+/* 20*/ le16 seq_number; /* On Win2k and presumably earlier, this is set
+ to zero every time the logfile is restarted
+ and it is incremented when the logfile is
+ closed at dismount time. Thus it is 0 when
+ dirty and 1 when clean. On WinXP and
+ presumably later, this is always 0. */
+/* 22*/ u8 reserved[6]; /* Reserved/alignment. */
+/* 28*/ le32 client_name_length;/* Length of client name in bytes. Should
+ always be 8. */
+/* 32*/ ntfschar client_name[64];/* Name of the client in Unicode. Should
+ always be "NTFS" with the remaining bytes
+ set to 0. */
+/* sizeof() = 160 (0xa0) bytes */
+} __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
+
+extern bool ntfs_check_logfile(struct inode *log_vi,
+ RESTART_PAGE_HEADER **rp);
+
+extern bool ntfs_is_logfile_clean(struct inode *log_vi,
+ const RESTART_PAGE_HEADER *rp);
+
+extern bool ntfs_empty_logfile(struct inode *log_vi);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_LOGFILE_H */
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
new file mode 100644
index 000000000..706842573
--- /dev/null
+++ b/fs/ntfs/malloc.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_MALLOC_H
+#define _LINUX_NTFS_MALLOC_H
+
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+/**
+ * __ntfs_malloc - allocate memory in multiples of pages
+ * @size: number of bytes to allocate
+ * @gfp_mask: extra flags for the allocator
+ *
+ * Internal function. You probably want ntfs_malloc_nofs()...
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ * Depending on @gfp_mask the allocation may be guaranteed to succeed.
+ */
+static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
+{
+ if (likely(size <= PAGE_SIZE)) {
+ BUG_ON(!size);
+ /* kmalloc() has per-CPU caches so is faster for now. */
+ return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
+ /* return (void *)__get_free_page(gfp_mask); */
+ }
+ if (likely((size >> PAGE_SHIFT) < totalram_pages()))
+ return __vmalloc(size, gfp_mask);
+ return NULL;
+}
+
+/**
+ * ntfs_malloc_nofs - allocate memory in multiples of pages
+ * @size: number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs(unsigned long size)
+{
+ return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM);
+}
+
+/**
+ * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages
+ * @size: number of bytes to allocate
+ *
+ * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
+ * returns a pointer to the allocated memory.
+ *
+ * This function guarantees that the allocation will succeed. It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * If there was insufficient memory to complete the request, return NULL.
+ */
+static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
+{
+ return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
+}
+
+static inline void ntfs_free(void *addr)
+{
+ kvfree(addr);
+}
+
+#endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
new file mode 100644
index 000000000..f7bf5ce96
--- /dev/null
+++ b/fs/ntfs/mft.c
@@ -0,0 +1,2906 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/**
+ * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/bio.h>
+
+#include "attrib.h"
+#include "aops.h"
+#include "bitmap.h"
+#include "debug.h"
+#include "dir.h"
+#include "lcnalloc.h"
+#include "malloc.h"
+#include "mft.h"
+#include "ntfs.h"
+
+#define MAX_BHS (PAGE_SIZE / NTFS_BLOCK_SIZE)
+
+/**
+ * map_mft_record_page - map the page in which a specific mft record resides
+ * @ni: ntfs inode whose mft record page to map
+ *
+ * This maps the page in which the mft record of the ntfs inode @ni is situated
+ * and returns a pointer to the mft record within the mapped page.
+ *
+ * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
+ * contains the negative error code returned.
+ */
+static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
+{
+ loff_t i_size;
+ ntfs_volume *vol = ni->vol;
+ struct inode *mft_vi = vol->mft_ino;
+ struct page *page;
+ unsigned long index, end_index;
+ unsigned ofs;
+
+ BUG_ON(ni->page);
+ /*
+ * The index into the page cache and the offset within the page cache
+ * page of the wanted mft record. FIXME: We need to check for
+ * overflowing the unsigned long, but I don't think we would ever get
+ * here if the volume was that big...
+ */
+ index = (u64)ni->mft_no << vol->mft_record_size_bits >>
+ PAGE_SHIFT;
+ ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
+
+ i_size = i_size_read(mft_vi);
+ /* The maximum valid index into the page cache for $MFT's data. */
+ end_index = i_size >> PAGE_SHIFT;
+
+ /* If the wanted index is out of bounds the mft record doesn't exist. */
+ if (unlikely(index >= end_index)) {
+ if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
+ vol->mft_record_size) {
+ page = ERR_PTR(-ENOENT);
+ ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
+ "which is beyond the end of the mft. "
+ "This is probably a bug in the ntfs "
+ "driver.", ni->mft_no);
+ goto err_out;
+ }
+ }
+ /* Read, map, and pin the page. */
+ page = ntfs_map_page(mft_vi->i_mapping, index);
+ if (!IS_ERR(page)) {
+ /* Catch multi sector transfer fixup errors. */
+ if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
+ ofs)))) {
+ ni->page = page;
+ ni->page_ofs = ofs;
+ return page_address(page) + ofs;
+ }
+ ntfs_error(vol->sb, "Mft record 0x%lx is corrupt. "
+ "Run chkdsk.", ni->mft_no);
+ ntfs_unmap_page(page);
+ page = ERR_PTR(-EIO);
+ NVolSetErrors(vol);
+ }
+err_out:
+ ni->page = NULL;
+ ni->page_ofs = 0;
+ return (void*)page;
+}
+
+/**
+ * map_mft_record - map, pin and lock an mft record
+ * @ni: ntfs inode whose MFT record to map
+ *
+ * First, take the mrec_lock mutex. We might now be sleeping, while waiting
+ * for the mutex if it was already locked by someone else.
+ *
+ * The page of the record is mapped using map_mft_record_page() before being
+ * returned to the caller.
+ *
+ * This in turn uses ntfs_map_page() to get the page containing the wanted mft
+ * record (it in turn calls read_cache_page() which reads it in from disk if
+ * necessary, increments the use count on the page so that it cannot disappear
+ * under us and returns a reference to the page cache page).
+ *
+ * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
+ * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
+ * and the post-read mst fixups on each mft record in the page have been
+ * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
+ * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
+ * ntfs_map_page() waits for PG_locked to become clear and checks if
+ * PG_uptodate is set and returns an error code if not. This provides
+ * sufficient protection against races when reading/using the page.
+ *
+ * However there is the write mapping to think about. Doing the above described
+ * checking here will be fine, because when initiating the write we will set
+ * PG_locked and clear PG_uptodate making sure nobody is touching the page
+ * contents. Doing the locking this way means that the commit to disk code in
+ * the page cache code paths is automatically sufficiently locked with us as
+ * we will not touch a page that has been locked or is not uptodate. The only
+ * locking problem then is them locking the page while we are accessing it.
+ *
+ * So that code will end up having to own the mrec_lock of all mft
+ * records/inodes present in the page before I/O can proceed. In that case we
+ * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
+ * accessing anything without owning the mrec_lock mutex. But we do need to
+ * use them because of the read_cache_page() invocation and the code becomes so
+ * much simpler this way that it is well worth it.
+ *
+ * The mft record is now ours and we return a pointer to it. You need to check
+ * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
+ * the error code.
+ *
+ * NOTE: Caller is responsible for setting the mft record dirty before calling
+ * unmap_mft_record(). This is obviously only necessary if the caller really
+ * modified the mft record...
+ * Q: Do we want to recycle one of the VFS inode state bits instead?
+ * A: No, the inode ones mean we want to change the mft record, not we want to
+ * write it out.
+ */
+MFT_RECORD *map_mft_record(ntfs_inode *ni)
+{
+ MFT_RECORD *m;
+
+ ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
+
+ /* Make sure the ntfs inode doesn't go away. */
+ atomic_inc(&ni->count);
+
+ /* Serialize access to this mft record. */
+ mutex_lock(&ni->mrec_lock);
+
+ m = map_mft_record_page(ni);
+ if (!IS_ERR(m))
+ return m;
+
+ mutex_unlock(&ni->mrec_lock);
+ atomic_dec(&ni->count);
+ ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
+ return m;
+}
+
+/**
+ * unmap_mft_record_page - unmap the page in which a specific mft record resides
+ * @ni: ntfs inode whose mft record page to unmap
+ *
+ * This unmaps the page in which the mft record of the ntfs inode @ni is
+ * situated and returns. This is a NOOP if highmem is not configured.
+ *
+ * The unmap happens via ntfs_unmap_page() which in turn decrements the use
+ * count on the page thus releasing it from the pinned state.
+ *
+ * We do not actually unmap the page from memory of course, as that will be
+ * done by the page cache code itself when memory pressure increases or
+ * whatever.
+ */
+static inline void unmap_mft_record_page(ntfs_inode *ni)
+{
+ BUG_ON(!ni->page);
+
+ // TODO: If dirty, blah...
+ ntfs_unmap_page(ni->page);
+ ni->page = NULL;
+ ni->page_ofs = 0;
+ return;
+}
+
+/**
+ * unmap_mft_record - release a mapped mft record
+ * @ni: ntfs inode whose MFT record to unmap
+ *
+ * We release the page mapping and the mrec_lock mutex which unmaps the mft
+ * record and releases it for others to get hold of. We also release the ntfs
+ * inode by decrementing the ntfs inode reference count.
+ *
+ * NOTE: If caller has modified the mft record, it is imperative to set the mft
+ * record dirty BEFORE calling unmap_mft_record().
+ */
+void unmap_mft_record(ntfs_inode *ni)
+{
+ struct page *page = ni->page;
+
+ BUG_ON(!page);
+
+ ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
+
+ unmap_mft_record_page(ni);
+ mutex_unlock(&ni->mrec_lock);
+ atomic_dec(&ni->count);
+ /*
+ * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
+ * ntfs_clear_extent_inode() in the extent inode case, and to the
+ * caller in the non-extent, yet pure ntfs inode case, to do the actual
+ * tear down of all structures and freeing of all allocated memory.
+ */
+ return;
+}
+
+/**
+ * map_extent_mft_record - load an extent inode and attach it to its base
+ * @base_ni: base ntfs inode
+ * @mref: mft reference of the extent inode to load
+ * @ntfs_ino: on successful return, pointer to the ntfs_inode structure
+ *
+ * Load the extent mft record @mref and attach it to its base inode @base_ni.
+ * Return the mapped extent mft record if IS_ERR(result) is false. Otherwise
+ * PTR_ERR(result) gives the negative error code.
+ *
+ * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
+ * structure of the mapped extent inode.
+ */
+MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
+ ntfs_inode **ntfs_ino)
+{
+ MFT_RECORD *m;
+ ntfs_inode *ni = NULL;
+ ntfs_inode **extent_nis = NULL;
+ int i;
+ unsigned long mft_no = MREF(mref);
+ u16 seq_no = MSEQNO(mref);
+ bool destroy_ni = false;
+
+ ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
+ mft_no, base_ni->mft_no);
+ /* Make sure the base ntfs inode doesn't go away. */
+ atomic_inc(&base_ni->count);
+ /*
+ * Check if this extent inode has already been added to the base inode,
+ * in which case just return it. If not found, add it to the base
+ * inode before returning it.
+ */
+ mutex_lock(&base_ni->extent_lock);
+ if (base_ni->nr_extents > 0) {
+ extent_nis = base_ni->ext.extent_ntfs_inos;
+ for (i = 0; i < base_ni->nr_extents; i++) {
+ if (mft_no != extent_nis[i]->mft_no)
+ continue;
+ ni = extent_nis[i];
+ /* Make sure the ntfs inode doesn't go away. */
+ atomic_inc(&ni->count);
+ break;
+ }
+ }
+ if (likely(ni != NULL)) {
+ mutex_unlock(&base_ni->extent_lock);
+ atomic_dec(&base_ni->count);
+ /* We found the record; just have to map and return it. */
+ m = map_mft_record(ni);
+ /* map_mft_record() has incremented this on success. */
+ atomic_dec(&ni->count);
+ if (!IS_ERR(m)) {
+ /* Verify the sequence number. */
+ if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
+ ntfs_debug("Done 1.");
+ *ntfs_ino = ni;
+ return m;
+ }
+ unmap_mft_record(ni);
+ ntfs_error(base_ni->vol->sb, "Found stale extent mft "
+ "reference! Corrupt filesystem. "
+ "Run chkdsk.");
+ return ERR_PTR(-EIO);
+ }
+map_err_out:
+ ntfs_error(base_ni->vol->sb, "Failed to map extent "
+ "mft record, error code %ld.", -PTR_ERR(m));
+ return m;
+ }
+ /* Record wasn't there. Get a new ntfs inode and initialize it. */
+ ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
+ if (unlikely(!ni)) {
+ mutex_unlock(&base_ni->extent_lock);
+ atomic_dec(&base_ni->count);
+ return ERR_PTR(-ENOMEM);
+ }
+ ni->vol = base_ni->vol;
+ ni->seq_no = seq_no;
+ ni->nr_extents = -1;
+ ni->ext.base_ntfs_ino = base_ni;
+ /* Now map the record. */
+ m = map_mft_record(ni);
+ if (IS_ERR(m)) {
+ mutex_unlock(&base_ni->extent_lock);
+ atomic_dec(&base_ni->count);
+ ntfs_clear_extent_inode(ni);
+ goto map_err_out;
+ }
+ /* Verify the sequence number if it is present. */
+ if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
+ ntfs_error(base_ni->vol->sb, "Found stale extent mft "
+ "reference! Corrupt filesystem. Run chkdsk.");
+ destroy_ni = true;
+ m = ERR_PTR(-EIO);
+ goto unm_err_out;
+ }
+ /* Attach extent inode to base inode, reallocating memory if needed. */
+ if (!(base_ni->nr_extents & 3)) {
+ ntfs_inode **tmp;
+ int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
+
+ tmp = kmalloc(new_size, GFP_NOFS);
+ if (unlikely(!tmp)) {
+ ntfs_error(base_ni->vol->sb, "Failed to allocate "
+ "internal buffer.");
+ destroy_ni = true;
+ m = ERR_PTR(-ENOMEM);
+ goto unm_err_out;
+ }
+ if (base_ni->nr_extents) {
+ BUG_ON(!base_ni->ext.extent_ntfs_inos);
+ memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
+ 4 * sizeof(ntfs_inode *));
+ kfree(base_ni->ext.extent_ntfs_inos);
+ }
+ base_ni->ext.extent_ntfs_inos = tmp;
+ }
+ base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
+ mutex_unlock(&base_ni->extent_lock);
+ atomic_dec(&base_ni->count);
+ ntfs_debug("Done 2.");
+ *ntfs_ino = ni;
+ return m;
+unm_err_out:
+ unmap_mft_record(ni);
+ mutex_unlock(&base_ni->extent_lock);
+ atomic_dec(&base_ni->count);
+ /*
+ * If the extent inode was not attached to the base inode we need to
+ * release it or we will leak memory.
+ */
+ if (destroy_ni)
+ ntfs_clear_extent_inode(ni);
+ return m;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * __mark_mft_record_dirty - set the mft record and the page containing it dirty
+ * @ni: ntfs inode describing the mapped mft record
+ *
+ * Internal function. Users should call mark_mft_record_dirty() instead.
+ *
+ * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
+ * as well as the page containing the mft record, dirty. Also, mark the base
+ * vfs inode dirty. This ensures that any changes to the mft record are
+ * written out to disk.
+ *
+ * NOTE: We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
+ * on the base vfs inode, because even though file data may have been modified,
+ * it is dirty in the inode meta data rather than the data page cache of the
+ * inode, and thus there are no data pages that need writing out. Therefore, a
+ * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the
+ * other hand, is not sufficient, because ->write_inode needs to be called even
+ * in case of fdatasync. This needs to happen or the file data would not
+ * necessarily hit the device synchronously, even though the vfs inode has the
+ * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just
+ * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
+ * which is not what I_DIRTY_SYNC on its own would suggest.
+ */
+void __mark_mft_record_dirty(ntfs_inode *ni)
+{
+ ntfs_inode *base_ni;
+
+ ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+ BUG_ON(NInoAttr(ni));
+ mark_ntfs_record_dirty(ni->page, ni->page_ofs);
+ /* Determine the base vfs inode and mark it dirty, too. */
+ mutex_lock(&ni->extent_lock);
+ if (likely(ni->nr_extents >= 0))
+ base_ni = ni;
+ else
+ base_ni = ni->ext.base_ntfs_ino;
+ mutex_unlock(&ni->extent_lock);
+ __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC);
+}
+
+static const char *ntfs_please_email = "Please email "
+ "linux-ntfs-dev@lists.sourceforge.net and say that you saw "
+ "this message. Thank you.";
+
+/**
+ * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
+ * @vol: ntfs volume on which the mft record to synchronize resides
+ * @mft_no: mft record number of mft record to synchronize
+ * @m: mapped, mst protected (extent) mft record to synchronize
+ *
+ * Write the mapped, mst protected (extent) mft record @m with mft record
+ * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
+ * bypassing the page cache and the $MFTMirr inode itself.
+ *
+ * This function is only for use at umount time when the mft mirror inode has
+ * already been disposed off. We BUG() if we are called while the mft mirror
+ * inode is still attached to the volume.
+ *
+ * On success return 0. On error return -errno.
+ *
+ * NOTE: This function is not implemented yet as I am not convinced it can
+ * actually be triggered considering the sequence of commits we do in super.c::
+ * ntfs_put_super(). But just in case we provide this place holder as the
+ * alternative would be either to BUG() or to get a NULL pointer dereference
+ * and Oops.
+ */
+static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
+ const unsigned long mft_no, MFT_RECORD *m)
+{
+ BUG_ON(vol->mftmirr_ino);
+ ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
+ "implemented yet. %s", ntfs_please_email);
+ return -EOPNOTSUPP;
+}
+
+/**
+ * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
+ * @vol: ntfs volume on which the mft record to synchronize resides
+ * @mft_no: mft record number of mft record to synchronize
+ * @m: mapped, mst protected (extent) mft record to synchronize
+ * @sync: if true, wait for i/o completion
+ *
+ * Write the mapped, mst protected (extent) mft record @m with mft record
+ * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
+ *
+ * On success return 0. On error return -errno and set the volume errors flag
+ * in the ntfs volume @vol.
+ *
+ * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
+ *
+ * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
+ * schedule i/o via ->writepage or do it via kntfsd or whatever.
+ */
+int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
+ MFT_RECORD *m, int sync)
+{
+ struct page *page;
+ unsigned int blocksize = vol->sb->s_blocksize;
+ int max_bhs = vol->mft_record_size / blocksize;
+ struct buffer_head *bhs[MAX_BHS];
+ struct buffer_head *bh, *head;
+ u8 *kmirr;
+ runlist_element *rl;
+ unsigned int block_start, block_end, m_start, m_end, page_ofs;
+ int i_bhs, nr_bhs, err = 0;
+ unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
+
+ ntfs_debug("Entering for inode 0x%lx.", mft_no);
+ BUG_ON(!max_bhs);
+ if (WARN_ON(max_bhs > MAX_BHS))
+ return -EINVAL;
+ if (unlikely(!vol->mftmirr_ino)) {
+ /* This could happen during umount... */
+ err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
+ if (likely(!err))
+ return err;
+ goto err_out;
+ }
+ /* Get the page containing the mirror copy of the mft record @m. */
+ page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
+ (PAGE_SHIFT - vol->mft_record_size_bits));
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to map mft mirror page.");
+ err = PTR_ERR(page);
+ goto err_out;
+ }
+ lock_page(page);
+ BUG_ON(!PageUptodate(page));
+ ClearPageUptodate(page);
+ /* Offset of the mft mirror record inside the page. */
+ page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
+ /* The address in the page of the mirror copy of the mft record @m. */
+ kmirr = page_address(page) + page_ofs;
+ /* Copy the mst protected mft record to the mirror. */
+ memcpy(kmirr, m, vol->mft_record_size);
+ /* Create uptodate buffers if not present. */
+ if (unlikely(!page_has_buffers(page))) {
+ struct buffer_head *tail;
+
+ bh = head = alloc_page_buffers(page, blocksize, true);
+ do {
+ set_buffer_uptodate(bh);
+ tail = bh;
+ bh = bh->b_this_page;
+ } while (bh);
+ tail->b_this_page = head;
+ attach_page_private(page, head);
+ }
+ bh = head = page_buffers(page);
+ BUG_ON(!bh);
+ rl = NULL;
+ nr_bhs = 0;
+ block_start = 0;
+ m_start = kmirr - (u8*)page_address(page);
+ m_end = m_start + vol->mft_record_size;
+ do {
+ block_end = block_start + blocksize;
+ /* If the buffer is outside the mft record, skip it. */
+ if (block_end <= m_start)
+ continue;
+ if (unlikely(block_start >= m_end))
+ break;
+ /* Need to map the buffer if it is not mapped already. */
+ if (unlikely(!buffer_mapped(bh))) {
+ VCN vcn;
+ LCN lcn;
+ unsigned int vcn_ofs;
+
+ bh->b_bdev = vol->sb->s_bdev;
+ /* Obtain the vcn and offset of the current block. */
+ vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
+ (block_start - m_start);
+ vcn_ofs = vcn & vol->cluster_size_mask;
+ vcn >>= vol->cluster_size_bits;
+ if (!rl) {
+ down_read(&NTFS_I(vol->mftmirr_ino)->
+ runlist.lock);
+ rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
+ /*
+ * $MFTMirr always has the whole of its runlist
+ * in memory.
+ */
+ BUG_ON(!rl);
+ }
+ /* Seek to element containing target vcn. */
+ while (rl->length && rl[1].vcn <= vcn)
+ rl++;
+ lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+ /* For $MFTMirr, only lcn >= 0 is a successful remap. */
+ if (likely(lcn >= 0)) {
+ /* Setup buffer head to correct block. */
+ bh->b_blocknr = ((lcn <<
+ vol->cluster_size_bits) +
+ vcn_ofs) >> blocksize_bits;
+ set_buffer_mapped(bh);
+ } else {
+ bh->b_blocknr = -1;
+ ntfs_error(vol->sb, "Cannot write mft mirror "
+ "record 0x%lx because its "
+ "location on disk could not "
+ "be determined (error code "
+ "%lli).", mft_no,
+ (long long)lcn);
+ err = -EIO;
+ }
+ }
+ BUG_ON(!buffer_uptodate(bh));
+ BUG_ON(!nr_bhs && (m_start != block_start));
+ BUG_ON(nr_bhs >= max_bhs);
+ bhs[nr_bhs++] = bh;
+ BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
+ } while (block_start = block_end, (bh = bh->b_this_page) != head);
+ if (unlikely(rl))
+ up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
+ if (likely(!err)) {
+ /* Lock buffers and start synchronous write i/o on them. */
+ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+ struct buffer_head *tbh = bhs[i_bhs];
+
+ if (!trylock_buffer(tbh))
+ BUG();
+ BUG_ON(!buffer_uptodate(tbh));
+ clear_buffer_dirty(tbh);
+ get_bh(tbh);
+ tbh->b_end_io = end_buffer_write_sync;
+ submit_bh(REQ_OP_WRITE, tbh);
+ }
+ /* Wait on i/o completion of buffers. */
+ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+ struct buffer_head *tbh = bhs[i_bhs];
+
+ wait_on_buffer(tbh);
+ if (unlikely(!buffer_uptodate(tbh))) {
+ err = -EIO;
+ /*
+ * Set the buffer uptodate so the page and
+ * buffer states do not become out of sync.
+ */
+ set_buffer_uptodate(tbh);
+ }
+ }
+ } else /* if (unlikely(err)) */ {
+ /* Clean the buffers. */
+ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
+ clear_buffer_dirty(bhs[i_bhs]);
+ }
+ /* Current state: all buffers are clean, unlocked, and uptodate. */
+ /* Remove the mst protection fixups again. */
+ post_write_mst_fixup((NTFS_RECORD*)kmirr);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ if (likely(!err)) {
+ ntfs_debug("Done.");
+ } else {
+ ntfs_error(vol->sb, "I/O error while writing mft mirror "
+ "record 0x%lx!", mft_no);
+err_out:
+ ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
+ "code %i). Volume will be left marked dirty "
+ "on umount. Run ntfsfix on the partition "
+ "after umounting to correct this.", -err);
+ NVolSetErrors(vol);
+ }
+ return err;
+}
+
+/**
+ * write_mft_record_nolock - write out a mapped (extent) mft record
+ * @ni: ntfs inode describing the mapped (extent) mft record
+ * @m: mapped (extent) mft record to write
+ * @sync: if true, wait for i/o completion
+ *
+ * Write the mapped (extent) mft record @m described by the (regular or extent)
+ * ntfs inode @ni to backing store. If the mft record @m has a counterpart in
+ * the mft mirror, that is also updated.
+ *
+ * We only write the mft record if the ntfs inode @ni is dirty and the first
+ * buffer belonging to its mft record is dirty, too. We ignore the dirty state
+ * of subsequent buffers because we could have raced with
+ * fs/ntfs/aops.c::mark_ntfs_record_dirty().
+ *
+ * On success, clean the mft record and return 0. On error, leave the mft
+ * record dirty and return -errno.
+ *
+ * NOTE: We always perform synchronous i/o and ignore the @sync parameter.
+ * However, if the mft record has a counterpart in the mft mirror and @sync is
+ * true, we write the mft record, wait for i/o completion, and only then write
+ * the mft mirror copy. This ensures that if the system crashes either the mft
+ * or the mft mirror will contain a self-consistent mft record @m. If @sync is
+ * false on the other hand, we start i/o on both and then wait for completion
+ * on them. This provides a speedup but no longer guarantees that you will end
+ * up with a self-consistent mft record in the case of a crash but if you asked
+ * for asynchronous writing you probably do not care about that anyway.
+ *
+ * TODO: If @sync is false, want to do truly asynchronous i/o, i.e. just
+ * schedule i/o via ->writepage or do it via kntfsd or whatever.
+ */
+int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
+{
+ ntfs_volume *vol = ni->vol;
+ struct page *page = ni->page;
+ unsigned int blocksize = vol->sb->s_blocksize;
+ unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
+ int max_bhs = vol->mft_record_size / blocksize;
+ struct buffer_head *bhs[MAX_BHS];
+ struct buffer_head *bh, *head;
+ runlist_element *rl;
+ unsigned int block_start, block_end, m_start, m_end;
+ int i_bhs, nr_bhs, err = 0;
+
+ ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
+ BUG_ON(NInoAttr(ni));
+ BUG_ON(!max_bhs);
+ BUG_ON(!PageLocked(page));
+ if (WARN_ON(max_bhs > MAX_BHS)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+ /*
+ * If the ntfs_inode is clean no need to do anything. If it is dirty,
+ * mark it as clean now so that it can be redirtied later on if needed.
+ * There is no danger of races since the caller is holding the locks
+ * for the mft record @m and the page it is in.
+ */
+ if (!NInoTestClearDirty(ni))
+ goto done;
+ bh = head = page_buffers(page);
+ BUG_ON(!bh);
+ rl = NULL;
+ nr_bhs = 0;
+ block_start = 0;
+ m_start = ni->page_ofs;
+ m_end = m_start + vol->mft_record_size;
+ do {
+ block_end = block_start + blocksize;
+ /* If the buffer is outside the mft record, skip it. */
+ if (block_end <= m_start)
+ continue;
+ if (unlikely(block_start >= m_end))
+ break;
+ /*
+ * If this block is not the first one in the record, we ignore
+ * the buffer's dirty state because we could have raced with a
+ * parallel mark_ntfs_record_dirty().
+ */
+ if (block_start == m_start) {
+ /* This block is the first one in the record. */
+ if (!buffer_dirty(bh)) {
+ BUG_ON(nr_bhs);
+ /* Clean records are not written out. */
+ break;
+ }
+ }
+ /* Need to map the buffer if it is not mapped already. */
+ if (unlikely(!buffer_mapped(bh))) {
+ VCN vcn;
+ LCN lcn;
+ unsigned int vcn_ofs;
+
+ bh->b_bdev = vol->sb->s_bdev;
+ /* Obtain the vcn and offset of the current block. */
+ vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
+ (block_start - m_start);
+ vcn_ofs = vcn & vol->cluster_size_mask;
+ vcn >>= vol->cluster_size_bits;
+ if (!rl) {
+ down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
+ rl = NTFS_I(vol->mft_ino)->runlist.rl;
+ BUG_ON(!rl);
+ }
+ /* Seek to element containing target vcn. */
+ while (rl->length && rl[1].vcn <= vcn)
+ rl++;
+ lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
+ /* For $MFT, only lcn >= 0 is a successful remap. */
+ if (likely(lcn >= 0)) {
+ /* Setup buffer head to correct block. */
+ bh->b_blocknr = ((lcn <<
+ vol->cluster_size_bits) +
+ vcn_ofs) >> blocksize_bits;
+ set_buffer_mapped(bh);
+ } else {
+ bh->b_blocknr = -1;
+ ntfs_error(vol->sb, "Cannot write mft record "
+ "0x%lx because its location "
+ "on disk could not be "
+ "determined (error code %lli).",
+ ni->mft_no, (long long)lcn);
+ err = -EIO;
+ }
+ }
+ BUG_ON(!buffer_uptodate(bh));
+ BUG_ON(!nr_bhs && (m_start != block_start));
+ BUG_ON(nr_bhs >= max_bhs);
+ bhs[nr_bhs++] = bh;
+ BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
+ } while (block_start = block_end, (bh = bh->b_this_page) != head);
+ if (unlikely(rl))
+ up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
+ if (!nr_bhs)
+ goto done;
+ if (unlikely(err))
+ goto cleanup_out;
+ /* Apply the mst protection fixups. */
+ err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
+ if (err) {
+ ntfs_error(vol->sb, "Failed to apply mst fixups!");
+ goto cleanup_out;
+ }
+ flush_dcache_mft_record_page(ni);
+ /* Lock buffers and start synchronous write i/o on them. */
+ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+ struct buffer_head *tbh = bhs[i_bhs];
+
+ if (!trylock_buffer(tbh))
+ BUG();
+ BUG_ON(!buffer_uptodate(tbh));
+ clear_buffer_dirty(tbh);
+ get_bh(tbh);
+ tbh->b_end_io = end_buffer_write_sync;
+ submit_bh(REQ_OP_WRITE, tbh);
+ }
+ /* Synchronize the mft mirror now if not @sync. */
+ if (!sync && ni->mft_no < vol->mftmirr_size)
+ ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
+ /* Wait on i/o completion of buffers. */
+ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
+ struct buffer_head *tbh = bhs[i_bhs];
+
+ wait_on_buffer(tbh);
+ if (unlikely(!buffer_uptodate(tbh))) {
+ err = -EIO;
+ /*
+ * Set the buffer uptodate so the page and buffer
+ * states do not become out of sync.
+ */
+ if (PageUptodate(page))
+ set_buffer_uptodate(tbh);
+ }
+ }
+ /* If @sync, now synchronize the mft mirror. */
+ if (sync && ni->mft_no < vol->mftmirr_size)
+ ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
+ /* Remove the mst protection fixups again. */
+ post_write_mst_fixup((NTFS_RECORD*)m);
+ flush_dcache_mft_record_page(ni);
+ if (unlikely(err)) {
+ /* I/O error during writing. This is really bad! */
+ ntfs_error(vol->sb, "I/O error while writing mft record "
+ "0x%lx! Marking base inode as bad. You "
+ "should unmount the volume and run chkdsk.",
+ ni->mft_no);
+ goto err_out;
+ }
+done:
+ ntfs_debug("Done.");
+ return 0;
+cleanup_out:
+ /* Clean the buffers. */
+ for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
+ clear_buffer_dirty(bhs[i_bhs]);
+err_out:
+ /*
+ * Current state: all buffers are clean, unlocked, and uptodate.
+ * The caller should mark the base inode as bad so that no more i/o
+ * happens. ->clear_inode() will still be invoked so all extent inodes
+ * and other allocated memory will be freed.
+ */
+ if (err == -ENOMEM) {
+ ntfs_error(vol->sb, "Not enough memory to write mft record. "
+ "Redirtying so the write is retried later.");
+ mark_mft_record_dirty(ni);
+ err = 0;
+ } else
+ NVolSetErrors(vol);
+ return err;
+}
+
+/**
+ * ntfs_may_write_mft_record - check if an mft record may be written out
+ * @vol: [IN] ntfs volume on which the mft record to check resides
+ * @mft_no: [IN] mft record number of the mft record to check
+ * @m: [IN] mapped mft record to check
+ * @locked_ni: [OUT] caller has to unlock this ntfs inode if one is returned
+ *
+ * Check if the mapped (base or extent) mft record @m with mft record number
+ * @mft_no belonging to the ntfs volume @vol may be written out. If necessary
+ * and possible the ntfs inode of the mft record is locked and the base vfs
+ * inode is pinned. The locked ntfs inode is then returned in @locked_ni. The
+ * caller is responsible for unlocking the ntfs inode and unpinning the base
+ * vfs inode.
+ *
+ * Return 'true' if the mft record may be written out and 'false' if not.
+ *
+ * The caller has locked the page and cleared the uptodate flag on it which
+ * means that we can safely write out any dirty mft records that do not have
+ * their inodes in icache as determined by ilookup5() as anyone
+ * opening/creating such an inode would block when attempting to map the mft
+ * record in read_cache_page() until we are finished with the write out.
+ *
+ * Here is a description of the tests we perform:
+ *
+ * If the inode is found in icache we know the mft record must be a base mft
+ * record. If it is dirty, we do not write it and return 'false' as the vfs
+ * inode write paths will result in the access times being updated which would
+ * cause the base mft record to be redirtied and written out again. (We know
+ * the access time update will modify the base mft record because Windows
+ * chkdsk complains if the standard information attribute is not in the base
+ * mft record.)
+ *
+ * If the inode is in icache and not dirty, we attempt to lock the mft record
+ * and if we find the lock was already taken, it is not safe to write the mft
+ * record and we return 'false'.
+ *
+ * If we manage to obtain the lock we have exclusive access to the mft record,
+ * which also allows us safe writeout of the mft record. We then set
+ * @locked_ni to the locked ntfs inode and return 'true'.
+ *
+ * Note we cannot just lock the mft record and sleep while waiting for the lock
+ * because this would deadlock due to lock reversal (normally the mft record is
+ * locked before the page is locked but we already have the page locked here
+ * when we try to lock the mft record).
+ *
+ * If the inode is not in icache we need to perform further checks.
+ *
+ * If the mft record is not a FILE record or it is a base mft record, we can
+ * safely write it and return 'true'.
+ *
+ * We now know the mft record is an extent mft record. We check if the inode
+ * corresponding to its base mft record is in icache and obtain a reference to
+ * it if it is. If it is not, we can safely write it and return 'true'.
+ *
+ * We now have the base inode for the extent mft record. We check if it has an
+ * ntfs inode for the extent mft record attached and if not it is safe to write
+ * the extent mft record and we return 'true'.
+ *
+ * The ntfs inode for the extent mft record is attached to the base inode so we
+ * attempt to lock the extent mft record and if we find the lock was already
+ * taken, it is not safe to write the extent mft record and we return 'false'.
+ *
+ * If we manage to obtain the lock we have exclusive access to the extent mft
+ * record, which also allows us safe writeout of the extent mft record. We
+ * set the ntfs inode of the extent mft record clean and then set @locked_ni to
+ * the now locked ntfs inode and return 'true'.
+ *
+ * Note, the reason for actually writing dirty mft records here and not just
+ * relying on the vfs inode dirty code paths is that we can have mft records
+ * modified without them ever having actual inodes in memory. Also we can have
+ * dirty mft records with clean ntfs inodes in memory. None of the described
+ * cases would result in the dirty mft records being written out if we only
+ * relied on the vfs inode dirty code paths. And these cases can really occur
+ * during allocation of new mft records and in particular when the
+ * initialized_size of the $MFT/$DATA attribute is extended and the new space
+ * is initialized using ntfs_mft_record_format(). The clean inode can then
+ * appear if the mft record is reused for a new inode before it got written
+ * out.
+ */
+bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
+ const MFT_RECORD *m, ntfs_inode **locked_ni)
+{
+ struct super_block *sb = vol->sb;
+ struct inode *mft_vi = vol->mft_ino;
+ struct inode *vi;
+ ntfs_inode *ni, *eni, **extent_nis;
+ int i;
+ ntfs_attr na;
+
+ ntfs_debug("Entering for inode 0x%lx.", mft_no);
+ /*
+ * Normally we do not return a locked inode so set @locked_ni to NULL.
+ */
+ BUG_ON(!locked_ni);
+ *locked_ni = NULL;
+ /*
+ * Check if the inode corresponding to this mft record is in the VFS
+ * inode cache and obtain a reference to it if it is.
+ */
+ ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
+ na.mft_no = mft_no;
+ na.name = NULL;
+ na.name_len = 0;
+ na.type = AT_UNUSED;
+ /*
+ * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
+ * we get here for it rather often.
+ */
+ if (!mft_no) {
+ /* Balance the below iput(). */
+ vi = igrab(mft_vi);
+ BUG_ON(vi != mft_vi);
+ } else {
+ /*
+ * Have to use ilookup5_nowait() since ilookup5() waits for the
+ * inode lock which causes ntfs to deadlock when a concurrent
+ * inode write via the inode dirty code paths and the page
+ * dirty code path of the inode dirty code path when writing
+ * $MFT occurs.
+ */
+ vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na);
+ }
+ if (vi) {
+ ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
+ /* The inode is in icache. */
+ ni = NTFS_I(vi);
+ /* Take a reference to the ntfs inode. */
+ atomic_inc(&ni->count);
+ /* If the inode is dirty, do not write this record. */
+ if (NInoDirty(ni)) {
+ ntfs_debug("Inode 0x%lx is dirty, do not write it.",
+ mft_no);
+ atomic_dec(&ni->count);
+ iput(vi);
+ return false;
+ }
+ ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
+ /* The inode is not dirty, try to take the mft record lock. */
+ if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
+ ntfs_debug("Mft record 0x%lx is already locked, do "
+ "not write it.", mft_no);
+ atomic_dec(&ni->count);
+ iput(vi);
+ return false;
+ }
+ ntfs_debug("Managed to lock mft record 0x%lx, write it.",
+ mft_no);
+ /*
+ * The write has to occur while we hold the mft record lock so
+ * return the locked ntfs inode.
+ */
+ *locked_ni = ni;
+ return true;
+ }
+ ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
+ /* The inode is not in icache. */
+ /* Write the record if it is not a mft record (type "FILE"). */
+ if (!ntfs_is_mft_record(m->magic)) {
+ ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
+ mft_no);
+ return true;
+ }
+ /* Write the mft record if it is a base inode. */
+ if (!m->base_mft_record) {
+ ntfs_debug("Mft record 0x%lx is a base record, write it.",
+ mft_no);
+ return true;
+ }
+ /*
+ * This is an extent mft record. Check if the inode corresponding to
+ * its base mft record is in icache and obtain a reference to it if it
+ * is.
+ */
+ na.mft_no = MREF_LE(m->base_mft_record);
+ ntfs_debug("Mft record 0x%lx is an extent record. Looking for base "
+ "inode 0x%lx in icache.", mft_no, na.mft_no);
+ if (!na.mft_no) {
+ /* Balance the below iput(). */
+ vi = igrab(mft_vi);
+ BUG_ON(vi != mft_vi);
+ } else
+ vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode,
+ &na);
+ if (!vi) {
+ /*
+ * The base inode is not in icache, write this extent mft
+ * record.
+ */
+ ntfs_debug("Base inode 0x%lx is not in icache, write the "
+ "extent record.", na.mft_no);
+ return true;
+ }
+ ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
+ /*
+ * The base inode is in icache. Check if it has the extent inode
+ * corresponding to this extent mft record attached.
+ */
+ ni = NTFS_I(vi);
+ mutex_lock(&ni->extent_lock);
+ if (ni->nr_extents <= 0) {
+ /*
+ * The base inode has no attached extent inodes, write this
+ * extent mft record.
+ */
+ mutex_unlock(&ni->extent_lock);
+ iput(vi);
+ ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
+ "write the extent record.", na.mft_no);
+ return true;
+ }
+ /* Iterate over the attached extent inodes. */
+ extent_nis = ni->ext.extent_ntfs_inos;
+ for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
+ if (mft_no == extent_nis[i]->mft_no) {
+ /*
+ * Found the extent inode corresponding to this extent
+ * mft record.
+ */
+ eni = extent_nis[i];
+ break;
+ }
+ }
+ /*
+ * If the extent inode was not attached to the base inode, write this
+ * extent mft record.
+ */
+ if (!eni) {
+ mutex_unlock(&ni->extent_lock);
+ iput(vi);
+ ntfs_debug("Extent inode 0x%lx is not attached to its base "
+ "inode 0x%lx, write the extent record.",
+ mft_no, na.mft_no);
+ return true;
+ }
+ ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
+ mft_no, na.mft_no);
+ /* Take a reference to the extent ntfs inode. */
+ atomic_inc(&eni->count);
+ mutex_unlock(&ni->extent_lock);
+ /*
+ * Found the extent inode coresponding to this extent mft record.
+ * Try to take the mft record lock.
+ */
+ if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
+ atomic_dec(&eni->count);
+ iput(vi);
+ ntfs_debug("Extent mft record 0x%lx is already locked, do "
+ "not write it.", mft_no);
+ return false;
+ }
+ ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
+ mft_no);
+ if (NInoTestClearDirty(eni))
+ ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
+ mft_no);
+ /*
+ * The write has to occur while we hold the mft record lock so return
+ * the locked extent ntfs inode.
+ */
+ *locked_ni = eni;
+ return true;
+}
+
+static const char *es = " Leaving inconsistent metadata. Unmount and run "
+ "chkdsk.";
+
+/**
+ * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
+ * @vol: volume on which to search for a free mft record
+ * @base_ni: open base inode if allocating an extent mft record or NULL
+ *
+ * Search for a free mft record in the mft bitmap attribute on the ntfs volume
+ * @vol.
+ *
+ * If @base_ni is NULL start the search at the default allocator position.
+ *
+ * If @base_ni is not NULL start the search at the mft record after the base
+ * mft record @base_ni.
+ *
+ * Return the free mft record on success and -errno on error. An error code of
+ * -ENOSPC means that there are no free mft records in the currently
+ * initialized mft bitmap.
+ *
+ * Locking: Caller must hold vol->mftbmp_lock for writing.
+ */
+static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
+ ntfs_inode *base_ni)
+{
+ s64 pass_end, ll, data_pos, pass_start, ofs, bit;
+ unsigned long flags;
+ struct address_space *mftbmp_mapping;
+ u8 *buf, *byte;
+ struct page *page;
+ unsigned int page_ofs, size;
+ u8 pass, b;
+
+ ntfs_debug("Searching for free mft record in the currently "
+ "initialized mft bitmap.");
+ mftbmp_mapping = vol->mftbmp_ino->i_mapping;
+ /*
+ * Set the end of the pass making sure we do not overflow the mft
+ * bitmap.
+ */
+ read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
+ pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
+ vol->mft_record_size_bits;
+ read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
+ read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
+ ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
+ read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
+ if (pass_end > ll)
+ pass_end = ll;
+ pass = 1;
+ if (!base_ni)
+ data_pos = vol->mft_data_pos;
+ else
+ data_pos = base_ni->mft_no + 1;
+ if (data_pos < 24)
+ data_pos = 24;
+ if (data_pos >= pass_end) {
+ data_pos = 24;
+ pass = 2;
+ /* This happens on a freshly formatted volume. */
+ if (data_pos >= pass_end)
+ return -ENOSPC;
+ }
+ pass_start = data_pos;
+ ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
+ "pass_end 0x%llx, data_pos 0x%llx.", pass,
+ (long long)pass_start, (long long)pass_end,
+ (long long)data_pos);
+ /* Loop until a free mft record is found. */
+ for (; pass <= 2;) {
+ /* Cap size to pass_end. */
+ ofs = data_pos >> 3;
+ page_ofs = ofs & ~PAGE_MASK;
+ size = PAGE_SIZE - page_ofs;
+ ll = ((pass_end + 7) >> 3) - ofs;
+ if (size > ll)
+ size = ll;
+ size <<= 3;
+ /*
+ * If we are still within the active pass, search the next page
+ * for a zero bit.
+ */
+ if (size) {
+ page = ntfs_map_page(mftbmp_mapping,
+ ofs >> PAGE_SHIFT);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to read mft "
+ "bitmap, aborting.");
+ return PTR_ERR(page);
+ }
+ buf = (u8*)page_address(page) + page_ofs;
+ bit = data_pos & 7;
+ data_pos &= ~7ull;
+ ntfs_debug("Before inner for loop: size 0x%x, "
+ "data_pos 0x%llx, bit 0x%llx", size,
+ (long long)data_pos, (long long)bit);
+ for (; bit < size && data_pos + bit < pass_end;
+ bit &= ~7ull, bit += 8) {
+ byte = buf + (bit >> 3);
+ if (*byte == 0xff)
+ continue;
+ b = ffz((unsigned long)*byte);
+ if (b < 8 && b >= (bit & 7)) {
+ ll = data_pos + (bit & ~7ull) + b;
+ if (unlikely(ll > (1ll << 32))) {
+ ntfs_unmap_page(page);
+ return -ENOSPC;
+ }
+ *byte |= 1 << b;
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ ntfs_unmap_page(page);
+ ntfs_debug("Done. (Found and "
+ "allocated mft record "
+ "0x%llx.)",
+ (long long)ll);
+ return ll;
+ }
+ }
+ ntfs_debug("After inner for loop: size 0x%x, "
+ "data_pos 0x%llx, bit 0x%llx", size,
+ (long long)data_pos, (long long)bit);
+ data_pos += size;
+ ntfs_unmap_page(page);
+ /*
+ * If the end of the pass has not been reached yet,
+ * continue searching the mft bitmap for a zero bit.
+ */
+ if (data_pos < pass_end)
+ continue;
+ }
+ /* Do the next pass. */
+ if (++pass == 2) {
+ /*
+ * Starting the second pass, in which we scan the first
+ * part of the zone which we omitted earlier.
+ */
+ pass_end = pass_start;
+ data_pos = pass_start = 24;
+ ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
+ "0x%llx.", pass, (long long)pass_start,
+ (long long)pass_end);
+ if (data_pos >= pass_end)
+ break;
+ }
+ }
+ /* No free mft records in currently initialized mft bitmap. */
+ ntfs_debug("Done. (No free mft records left in currently initialized "
+ "mft bitmap.)");
+ return -ENOSPC;
+}
+
+/**
+ * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
+ * @vol: volume on which to extend the mft bitmap attribute
+ *
+ * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
+ *
+ * Note: Only changes allocated_size, i.e. does not touch initialized_size or
+ * data_size.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - Caller must hold vol->mftbmp_lock for writing.
+ * - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
+ * writing and releases it before returning.
+ * - This function takes vol->lcnbmp_lock for writing and releases it
+ * before returning.
+ */
+static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
+{
+ LCN lcn;
+ s64 ll;
+ unsigned long flags;
+ struct page *page;
+ ntfs_inode *mft_ni, *mftbmp_ni;
+ runlist_element *rl, *rl2 = NULL;
+ ntfs_attr_search_ctx *ctx = NULL;
+ MFT_RECORD *mrec;
+ ATTR_RECORD *a = NULL;
+ int ret, mp_size;
+ u32 old_alen = 0;
+ u8 *b, tb;
+ struct {
+ u8 added_cluster:1;
+ u8 added_run:1;
+ u8 mp_rebuilt:1;
+ } status = { 0, 0, 0 };
+
+ ntfs_debug("Extending mft bitmap allocation.");
+ mft_ni = NTFS_I(vol->mft_ino);
+ mftbmp_ni = NTFS_I(vol->mftbmp_ino);
+ /*
+ * Determine the last lcn of the mft bitmap. The allocated size of the
+ * mft bitmap cannot be zero so we are ok to do this.
+ */
+ down_write(&mftbmp_ni->runlist.lock);
+ read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+ ll = mftbmp_ni->allocated_size;
+ read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+ rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
+ (ll - 1) >> vol->cluster_size_bits, NULL);
+ if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
+ up_write(&mftbmp_ni->runlist.lock);
+ ntfs_error(vol->sb, "Failed to determine last allocated "
+ "cluster of mft bitmap attribute.");
+ if (!IS_ERR(rl))
+ ret = -EIO;
+ else
+ ret = PTR_ERR(rl);
+ return ret;
+ }
+ lcn = rl->lcn + rl->length;
+ ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
+ (long long)lcn);
+ /*
+ * Attempt to get the cluster following the last allocated cluster by
+ * hand as it may be in the MFT zone so the allocator would not give it
+ * to us.
+ */
+ ll = lcn >> 3;
+ page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
+ ll >> PAGE_SHIFT);
+ if (IS_ERR(page)) {
+ up_write(&mftbmp_ni->runlist.lock);
+ ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
+ return PTR_ERR(page);
+ }
+ b = (u8*)page_address(page) + (ll & ~PAGE_MASK);
+ tb = 1 << (lcn & 7ull);
+ down_write(&vol->lcnbmp_lock);
+ if (*b != 0xff && !(*b & tb)) {
+ /* Next cluster is free, allocate it. */
+ *b |= tb;
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ up_write(&vol->lcnbmp_lock);
+ ntfs_unmap_page(page);
+ /* Update the mft bitmap runlist. */
+ rl->length++;
+ rl[1].vcn++;
+ status.added_cluster = 1;
+ ntfs_debug("Appending one cluster to mft bitmap.");
+ } else {
+ up_write(&vol->lcnbmp_lock);
+ ntfs_unmap_page(page);
+ /* Allocate a cluster from the DATA_ZONE. */
+ rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
+ true);
+ if (IS_ERR(rl2)) {
+ up_write(&mftbmp_ni->runlist.lock);
+ ntfs_error(vol->sb, "Failed to allocate a cluster for "
+ "the mft bitmap.");
+ return PTR_ERR(rl2);
+ }
+ rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
+ if (IS_ERR(rl)) {
+ up_write(&mftbmp_ni->runlist.lock);
+ ntfs_error(vol->sb, "Failed to merge runlists for mft "
+ "bitmap.");
+ if (ntfs_cluster_free_from_rl(vol, rl2)) {
+ ntfs_error(vol->sb, "Failed to deallocate "
+ "allocated cluster.%s", es);
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rl2);
+ return PTR_ERR(rl);
+ }
+ mftbmp_ni->runlist.rl = rl;
+ status.added_run = 1;
+ ntfs_debug("Adding one run to mft bitmap.");
+ /* Find the last run in the new runlist. */
+ for (; rl[1].length; rl++)
+ ;
+ }
+ /*
+ * Update the attribute record as well. Note: @rl is the last
+ * (non-terminator) runlist element of mft bitmap.
+ */
+ mrec = map_mft_record(mft_ni);
+ if (IS_ERR(mrec)) {
+ ntfs_error(vol->sb, "Failed to map mft record.");
+ ret = PTR_ERR(mrec);
+ goto undo_alloc;
+ }
+ ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+ if (unlikely(!ctx)) {
+ ntfs_error(vol->sb, "Failed to get search context.");
+ ret = -ENOMEM;
+ goto undo_alloc;
+ }
+ ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+ mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
+ 0, ctx);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to find last attribute extent of "
+ "mft bitmap attribute.");
+ if (ret == -ENOENT)
+ ret = -EIO;
+ goto undo_alloc;
+ }
+ a = ctx->attr;
+ ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+ /* Search back for the previous last allocated cluster of mft bitmap. */
+ for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
+ if (ll >= rl2->vcn)
+ break;
+ }
+ BUG_ON(ll < rl2->vcn);
+ BUG_ON(ll >= rl2->vcn + rl2->length);
+ /* Get the size for the new mapping pairs array for this extent. */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
+ if (unlikely(mp_size <= 0)) {
+ ntfs_error(vol->sb, "Get size for mapping pairs failed for "
+ "mft bitmap attribute extent.");
+ ret = mp_size;
+ if (!ret)
+ ret = -EIO;
+ goto undo_alloc;
+ }
+ /* Expand the attribute record if necessary. */
+ old_alen = le32_to_cpu(a->length);
+ ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+ if (unlikely(ret)) {
+ if (ret != -ENOSPC) {
+ ntfs_error(vol->sb, "Failed to resize attribute "
+ "record for mft bitmap attribute.");
+ goto undo_alloc;
+ }
+ // TODO: Deal with this by moving this extent to a new mft
+ // record or by starting a new extent in a new mft record or by
+ // moving other attributes out of this mft record.
+ // Note: It will need to be a special mft record and if none of
+ // those are available it gets rather complicated...
+ ntfs_error(vol->sb, "Not enough space in this mft record to "
+ "accommodate extended mft bitmap attribute "
+ "extent. Cannot handle this yet.");
+ ret = -EOPNOTSUPP;
+ goto undo_alloc;
+ }
+ status.mp_rebuilt = 1;
+ /* Generate the mapping pairs array directly into the attr record. */
+ ret = ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+ mp_size, rl2, ll, -1, NULL);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to build mapping pairs array for "
+ "mft bitmap attribute.");
+ goto undo_alloc;
+ }
+ /* Update the highest_vcn. */
+ a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
+ /*
+ * We now have extended the mft bitmap allocated_size by one cluster.
+ * Reflect this in the ntfs_inode structure and the attribute record.
+ */
+ if (a->data.non_resident.lowest_vcn) {
+ /*
+ * We are not in the first attribute extent, switch to it, but
+ * first ensure the changes will make it to disk later.
+ */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_reinit_search_ctx(ctx);
+ ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+ mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
+ 0, ctx);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to find first attribute "
+ "extent of mft bitmap attribute.");
+ goto restore_undo_alloc;
+ }
+ a = ctx->attr;
+ }
+ write_lock_irqsave(&mftbmp_ni->size_lock, flags);
+ mftbmp_ni->allocated_size += vol->cluster_size;
+ a->data.non_resident.allocated_size =
+ cpu_to_sle64(mftbmp_ni->allocated_size);
+ write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ up_write(&mftbmp_ni->runlist.lock);
+ ntfs_debug("Done.");
+ return 0;
+restore_undo_alloc:
+ ntfs_attr_reinit_search_ctx(ctx);
+ if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+ mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
+ 0, ctx)) {
+ ntfs_error(vol->sb, "Failed to find last attribute extent of "
+ "mft bitmap attribute.%s", es);
+ write_lock_irqsave(&mftbmp_ni->size_lock, flags);
+ mftbmp_ni->allocated_size += vol->cluster_size;
+ write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ up_write(&mftbmp_ni->runlist.lock);
+ /*
+ * The only thing that is now wrong is ->allocated_size of the
+ * base attribute extent which chkdsk should be able to fix.
+ */
+ NVolSetErrors(vol);
+ return ret;
+ }
+ a = ctx->attr;
+ a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2);
+undo_alloc:
+ if (status.added_cluster) {
+ /* Truncate the last run in the runlist by one cluster. */
+ rl->length--;
+ rl[1].vcn--;
+ } else if (status.added_run) {
+ lcn = rl->lcn;
+ /* Remove the last run from the runlist. */
+ rl->lcn = rl[1].lcn;
+ rl->length = 0;
+ }
+ /* Deallocate the cluster. */
+ down_write(&vol->lcnbmp_lock);
+ if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
+ ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
+ NVolSetErrors(vol);
+ }
+ up_write(&vol->lcnbmp_lock);
+ if (status.mp_rebuilt) {
+ if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset),
+ old_alen - le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset),
+ rl2, ll, -1, NULL)) {
+ ntfs_error(vol->sb, "Failed to restore mapping pairs "
+ "array.%s", es);
+ NVolSetErrors(vol);
+ }
+ if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
+ ntfs_error(vol->sb, "Failed to restore attribute "
+ "record.%s", es);
+ NVolSetErrors(vol);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ }
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (!IS_ERR(mrec))
+ unmap_mft_record(mft_ni);
+ up_write(&mftbmp_ni->runlist.lock);
+ return ret;
+}
+
+/**
+ * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
+ * @vol: volume on which to extend the mft bitmap attribute
+ *
+ * Extend the initialized portion of the mft bitmap attribute on the ntfs
+ * volume @vol by 8 bytes.
+ *
+ * Note: Only changes initialized_size and data_size, i.e. requires that
+ * allocated_size is big enough to fit the new initialized_size.
+ *
+ * Return 0 on success and -error on error.
+ *
+ * Locking: Caller must hold vol->mftbmp_lock for writing.
+ */
+static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
+{
+ s64 old_data_size, old_initialized_size;
+ unsigned long flags;
+ struct inode *mftbmp_vi;
+ ntfs_inode *mft_ni, *mftbmp_ni;
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *mrec;
+ ATTR_RECORD *a;
+ int ret;
+
+ ntfs_debug("Extending mft bitmap initiailized (and data) size.");
+ mft_ni = NTFS_I(vol->mft_ino);
+ mftbmp_vi = vol->mftbmp_ino;
+ mftbmp_ni = NTFS_I(mftbmp_vi);
+ /* Get the attribute record. */
+ mrec = map_mft_record(mft_ni);
+ if (IS_ERR(mrec)) {
+ ntfs_error(vol->sb, "Failed to map mft record.");
+ return PTR_ERR(mrec);
+ }
+ ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+ if (unlikely(!ctx)) {
+ ntfs_error(vol->sb, "Failed to get search context.");
+ ret = -ENOMEM;
+ goto unm_err_out;
+ }
+ ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+ mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to find first attribute extent of "
+ "mft bitmap attribute.");
+ if (ret == -ENOENT)
+ ret = -EIO;
+ goto put_err_out;
+ }
+ a = ctx->attr;
+ write_lock_irqsave(&mftbmp_ni->size_lock, flags);
+ old_data_size = i_size_read(mftbmp_vi);
+ old_initialized_size = mftbmp_ni->initialized_size;
+ /*
+ * We can simply update the initialized_size before filling the space
+ * with zeroes because the caller is holding the mft bitmap lock for
+ * writing which ensures that no one else is trying to access the data.
+ */
+ mftbmp_ni->initialized_size += 8;
+ a->data.non_resident.initialized_size =
+ cpu_to_sle64(mftbmp_ni->initialized_size);
+ if (mftbmp_ni->initialized_size > old_data_size) {
+ i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
+ a->data.non_resident.data_size =
+ cpu_to_sle64(mftbmp_ni->initialized_size);
+ }
+ write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ /* Initialize the mft bitmap attribute value with zeroes. */
+ ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
+ if (likely(!ret)) {
+ ntfs_debug("Done. (Wrote eight initialized bytes to mft "
+ "bitmap.");
+ return 0;
+ }
+ ntfs_error(vol->sb, "Failed to write to mft bitmap.");
+ /* Try to recover from the error. */
+ mrec = map_mft_record(mft_ni);
+ if (IS_ERR(mrec)) {
+ ntfs_error(vol->sb, "Failed to map mft record.%s", es);
+ NVolSetErrors(vol);
+ return ret;
+ }
+ ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+ if (unlikely(!ctx)) {
+ ntfs_error(vol->sb, "Failed to get search context.%s", es);
+ NVolSetErrors(vol);
+ goto unm_err_out;
+ }
+ if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
+ mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
+ ntfs_error(vol->sb, "Failed to find first attribute extent of "
+ "mft bitmap attribute.%s", es);
+ NVolSetErrors(vol);
+put_err_out:
+ ntfs_attr_put_search_ctx(ctx);
+unm_err_out:
+ unmap_mft_record(mft_ni);
+ goto err_out;
+ }
+ a = ctx->attr;
+ write_lock_irqsave(&mftbmp_ni->size_lock, flags);
+ mftbmp_ni->initialized_size = old_initialized_size;
+ a->data.non_resident.initialized_size =
+ cpu_to_sle64(old_initialized_size);
+ if (i_size_read(mftbmp_vi) != old_data_size) {
+ i_size_write(mftbmp_vi, old_data_size);
+ a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
+ }
+ write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+#ifdef DEBUG
+ read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+ ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
+ "data_size 0x%llx, initialized_size 0x%llx.",
+ (long long)mftbmp_ni->allocated_size,
+ (long long)i_size_read(mftbmp_vi),
+ (long long)mftbmp_ni->initialized_size);
+ read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
+err_out:
+ return ret;
+}
+
+/**
+ * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
+ * @vol: volume on which to extend the mft data attribute
+ *
+ * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
+ * worth of clusters or if not enough space for this by one mft record worth
+ * of clusters.
+ *
+ * Note: Only changes allocated_size, i.e. does not touch initialized_size or
+ * data_size.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: - Caller must hold vol->mftbmp_lock for writing.
+ * - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
+ * writing and releases it before returning.
+ * - This function calls functions which take vol->lcnbmp_lock for
+ * writing and release it before returning.
+ */
+static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
+{
+ LCN lcn;
+ VCN old_last_vcn;
+ s64 min_nr, nr, ll;
+ unsigned long flags;
+ ntfs_inode *mft_ni;
+ runlist_element *rl, *rl2;
+ ntfs_attr_search_ctx *ctx = NULL;
+ MFT_RECORD *mrec;
+ ATTR_RECORD *a = NULL;
+ int ret, mp_size;
+ u32 old_alen = 0;
+ bool mp_rebuilt = false;
+
+ ntfs_debug("Extending mft data allocation.");
+ mft_ni = NTFS_I(vol->mft_ino);
+ /*
+ * Determine the preferred allocation location, i.e. the last lcn of
+ * the mft data attribute. The allocated size of the mft data
+ * attribute cannot be zero so we are ok to do this.
+ */
+ down_write(&mft_ni->runlist.lock);
+ read_lock_irqsave(&mft_ni->size_lock, flags);
+ ll = mft_ni->allocated_size;
+ read_unlock_irqrestore(&mft_ni->size_lock, flags);
+ rl = ntfs_attr_find_vcn_nolock(mft_ni,
+ (ll - 1) >> vol->cluster_size_bits, NULL);
+ if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
+ up_write(&mft_ni->runlist.lock);
+ ntfs_error(vol->sb, "Failed to determine last allocated "
+ "cluster of mft data attribute.");
+ if (!IS_ERR(rl))
+ ret = -EIO;
+ else
+ ret = PTR_ERR(rl);
+ return ret;
+ }
+ lcn = rl->lcn + rl->length;
+ ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
+ /* Minimum allocation is one mft record worth of clusters. */
+ min_nr = vol->mft_record_size >> vol->cluster_size_bits;
+ if (!min_nr)
+ min_nr = 1;
+ /* Want to allocate 16 mft records worth of clusters. */
+ nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
+ if (!nr)
+ nr = min_nr;
+ /* Ensure we do not go above 2^32-1 mft records. */
+ read_lock_irqsave(&mft_ni->size_lock, flags);
+ ll = mft_ni->allocated_size;
+ read_unlock_irqrestore(&mft_ni->size_lock, flags);
+ if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
+ vol->mft_record_size_bits >= (1ll << 32))) {
+ nr = min_nr;
+ if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
+ vol->mft_record_size_bits >= (1ll << 32))) {
+ ntfs_warning(vol->sb, "Cannot allocate mft record "
+ "because the maximum number of inodes "
+ "(2^32) has already been reached.");
+ up_write(&mft_ni->runlist.lock);
+ return -ENOSPC;
+ }
+ }
+ ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
+ nr > min_nr ? "default" : "minimal", (long long)nr);
+ old_last_vcn = rl[1].vcn;
+ do {
+ rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
+ true);
+ if (!IS_ERR(rl2))
+ break;
+ if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
+ ntfs_error(vol->sb, "Failed to allocate the minimal "
+ "number of clusters (%lli) for the "
+ "mft data attribute.", (long long)nr);
+ up_write(&mft_ni->runlist.lock);
+ return PTR_ERR(rl2);
+ }
+ /*
+ * There is not enough space to do the allocation, but there
+ * might be enough space to do a minimal allocation so try that
+ * before failing.
+ */
+ nr = min_nr;
+ ntfs_debug("Retrying mft data allocation with minimal cluster "
+ "count %lli.", (long long)nr);
+ } while (1);
+ rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
+ if (IS_ERR(rl)) {
+ up_write(&mft_ni->runlist.lock);
+ ntfs_error(vol->sb, "Failed to merge runlists for mft data "
+ "attribute.");
+ if (ntfs_cluster_free_from_rl(vol, rl2)) {
+ ntfs_error(vol->sb, "Failed to deallocate clusters "
+ "from the mft data attribute.%s", es);
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rl2);
+ return PTR_ERR(rl);
+ }
+ mft_ni->runlist.rl = rl;
+ ntfs_debug("Allocated %lli clusters.", (long long)nr);
+ /* Find the last run in the new runlist. */
+ for (; rl[1].length; rl++)
+ ;
+ /* Update the attribute record as well. */
+ mrec = map_mft_record(mft_ni);
+ if (IS_ERR(mrec)) {
+ ntfs_error(vol->sb, "Failed to map mft record.");
+ ret = PTR_ERR(mrec);
+ goto undo_alloc;
+ }
+ ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
+ if (unlikely(!ctx)) {
+ ntfs_error(vol->sb, "Failed to get search context.");
+ ret = -ENOMEM;
+ goto undo_alloc;
+ }
+ ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+ CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to find last attribute extent of "
+ "mft data attribute.");
+ if (ret == -ENOENT)
+ ret = -EIO;
+ goto undo_alloc;
+ }
+ a = ctx->attr;
+ ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
+ /* Search back for the previous last allocated cluster of mft bitmap. */
+ for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
+ if (ll >= rl2->vcn)
+ break;
+ }
+ BUG_ON(ll < rl2->vcn);
+ BUG_ON(ll >= rl2->vcn + rl2->length);
+ /* Get the size for the new mapping pairs array for this extent. */
+ mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
+ if (unlikely(mp_size <= 0)) {
+ ntfs_error(vol->sb, "Get size for mapping pairs failed for "
+ "mft data attribute extent.");
+ ret = mp_size;
+ if (!ret)
+ ret = -EIO;
+ goto undo_alloc;
+ }
+ /* Expand the attribute record if necessary. */
+ old_alen = le32_to_cpu(a->length);
+ ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
+ if (unlikely(ret)) {
+ if (ret != -ENOSPC) {
+ ntfs_error(vol->sb, "Failed to resize attribute "
+ "record for mft data attribute.");
+ goto undo_alloc;
+ }
+ // TODO: Deal with this by moving this extent to a new mft
+ // record or by starting a new extent in a new mft record or by
+ // moving other attributes out of this mft record.
+ // Note: Use the special reserved mft records and ensure that
+ // this extent is not required to find the mft record in
+ // question. If no free special records left we would need to
+ // move an existing record away, insert ours in its place, and
+ // then place the moved record into the newly allocated space
+ // and we would then need to update all references to this mft
+ // record appropriately. This is rather complicated...
+ ntfs_error(vol->sb, "Not enough space in this mft record to "
+ "accommodate extended mft data attribute "
+ "extent. Cannot handle this yet.");
+ ret = -EOPNOTSUPP;
+ goto undo_alloc;
+ }
+ mp_rebuilt = true;
+ /* Generate the mapping pairs array directly into the attr record. */
+ ret = ntfs_mapping_pairs_build(vol, (u8*)a +
+ le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
+ mp_size, rl2, ll, -1, NULL);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to build mapping pairs array of "
+ "mft data attribute.");
+ goto undo_alloc;
+ }
+ /* Update the highest_vcn. */
+ a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
+ /*
+ * We now have extended the mft data allocated_size by nr clusters.
+ * Reflect this in the ntfs_inode structure and the attribute record.
+ * @rl is the last (non-terminator) runlist element of mft data
+ * attribute.
+ */
+ if (a->data.non_resident.lowest_vcn) {
+ /*
+ * We are not in the first attribute extent, switch to it, but
+ * first ensure the changes will make it to disk later.
+ */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_reinit_search_ctx(ctx);
+ ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
+ mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
+ ctx);
+ if (unlikely(ret)) {
+ ntfs_error(vol->sb, "Failed to find first attribute "
+ "extent of mft data attribute.");
+ goto restore_undo_alloc;
+ }
+ a = ctx->attr;
+ }
+ write_lock_irqsave(&mft_ni->size_lock, flags);
+ mft_ni->allocated_size += nr << vol->cluster_size_bits;
+ a->data.non_resident.allocated_size =
+ cpu_to_sle64(mft_ni->allocated_size);
+ write_unlock_irqrestore(&mft_ni->size_lock, flags);
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ up_write(&mft_ni->runlist.lock);
+ ntfs_debug("Done.");
+ return 0;
+restore_undo_alloc:
+ ntfs_attr_reinit_search_ctx(ctx);
+ if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+ CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
+ ntfs_error(vol->sb, "Failed to find last attribute extent of "
+ "mft data attribute.%s", es);
+ write_lock_irqsave(&mft_ni->size_lock, flags);
+ mft_ni->allocated_size += nr << vol->cluster_size_bits;
+ write_unlock_irqrestore(&mft_ni->size_lock, flags);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ up_write(&mft_ni->runlist.lock);
+ /*
+ * The only thing that is now wrong is ->allocated_size of the
+ * base attribute extent which chkdsk should be able to fix.
+ */
+ NVolSetErrors(vol);
+ return ret;
+ }
+ ctx->attr->data.non_resident.highest_vcn =
+ cpu_to_sle64(old_last_vcn - 1);
+undo_alloc:
+ if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
+ ntfs_error(vol->sb, "Failed to free clusters from mft data "
+ "attribute.%s", es);
+ NVolSetErrors(vol);
+ }
+ a = ctx->attr;
+ if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
+ ntfs_error(vol->sb, "Failed to truncate mft data attribute "
+ "runlist.%s", es);
+ NVolSetErrors(vol);
+ }
+ if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
+ if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset),
+ old_alen - le16_to_cpu(
+ a->data.non_resident.mapping_pairs_offset),
+ rl2, ll, -1, NULL)) {
+ ntfs_error(vol->sb, "Failed to restore mapping pairs "
+ "array.%s", es);
+ NVolSetErrors(vol);
+ }
+ if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
+ ntfs_error(vol->sb, "Failed to restore attribute "
+ "record.%s", es);
+ NVolSetErrors(vol);
+ }
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ } else if (IS_ERR(ctx->mrec)) {
+ ntfs_error(vol->sb, "Failed to restore attribute search "
+ "context.%s", es);
+ NVolSetErrors(vol);
+ }
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (!IS_ERR(mrec))
+ unmap_mft_record(mft_ni);
+ up_write(&mft_ni->runlist.lock);
+ return ret;
+}
+
+/**
+ * ntfs_mft_record_layout - layout an mft record into a memory buffer
+ * @vol: volume to which the mft record will belong
+ * @mft_no: mft reference specifying the mft record number
+ * @m: destination buffer of size >= @vol->mft_record_size bytes
+ *
+ * Layout an empty, unused mft record with the mft record number @mft_no into
+ * the buffer @m. The volume @vol is needed because the mft record structure
+ * was modified in NTFS 3.1 so we need to know which volume version this mft
+ * record will be used on.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
+ MFT_RECORD *m)
+{
+ ATTR_RECORD *a;
+
+ ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
+ if (mft_no >= (1ll << 32)) {
+ ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
+ "maximum of 2^32.", (long long)mft_no);
+ return -ERANGE;
+ }
+ /* Start by clearing the whole mft record to gives us a clean slate. */
+ memset(m, 0, vol->mft_record_size);
+ /* Aligned to 2-byte boundary. */
+ if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
+ m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
+ else {
+ m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
+ /*
+ * Set the NTFS 3.1+ specific fields while we know that the
+ * volume version is 3.1+.
+ */
+ m->reserved = 0;
+ m->mft_record_number = cpu_to_le32((u32)mft_no);
+ }
+ m->magic = magic_FILE;
+ if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
+ m->usa_count = cpu_to_le16(vol->mft_record_size /
+ NTFS_BLOCK_SIZE + 1);
+ else {
+ m->usa_count = cpu_to_le16(1);
+ ntfs_warning(vol->sb, "Sector size is bigger than mft record "
+ "size. Setting usa_count to 1. If chkdsk "
+ "reports this as corruption, please email "
+ "linux-ntfs-dev@lists.sourceforge.net stating "
+ "that you saw this message and that the "
+ "modified filesystem created was corrupt. "
+ "Thank you.");
+ }
+ /* Set the update sequence number to 1. */
+ *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
+ m->lsn = 0;
+ m->sequence_number = cpu_to_le16(1);
+ m->link_count = 0;
+ /*
+ * Place the attributes straight after the update sequence array,
+ * aligned to 8-byte boundary.
+ */
+ m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
+ (le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
+ m->flags = 0;
+ /*
+ * Using attrs_offset plus eight bytes (for the termination attribute).
+ * attrs_offset is already aligned to 8-byte boundary, so no need to
+ * align again.
+ */
+ m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
+ m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
+ m->base_mft_record = 0;
+ m->next_attr_instance = 0;
+ /* Add the termination attribute. */
+ a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
+ a->type = AT_END;
+ a->length = 0;
+ ntfs_debug("Done.");
+ return 0;
+}
+
+/**
+ * ntfs_mft_record_format - format an mft record on an ntfs volume
+ * @vol: volume on which to format the mft record
+ * @mft_no: mft record number to format
+ *
+ * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
+ * mft record into the appropriate place of the mft data attribute. This is
+ * used when extending the mft data attribute.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
+{
+ loff_t i_size;
+ struct inode *mft_vi = vol->mft_ino;
+ struct page *page;
+ MFT_RECORD *m;
+ pgoff_t index, end_index;
+ unsigned int ofs;
+ int err;
+
+ ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
+ /*
+ * The index into the page cache and the offset within the page cache
+ * page of the wanted mft record.
+ */
+ index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
+ ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
+ /* The maximum valid index into the page cache for $MFT's data. */
+ i_size = i_size_read(mft_vi);
+ end_index = i_size >> PAGE_SHIFT;
+ if (unlikely(index >= end_index)) {
+ if (unlikely(index > end_index || ofs + vol->mft_record_size >=
+ (i_size & ~PAGE_MASK))) {
+ ntfs_error(vol->sb, "Tried to format non-existing mft "
+ "record 0x%llx.", (long long)mft_no);
+ return -ENOENT;
+ }
+ }
+ /* Read, map, and pin the page containing the mft record. */
+ page = ntfs_map_page(mft_vi->i_mapping, index);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to map page containing mft record "
+ "to format 0x%llx.", (long long)mft_no);
+ return PTR_ERR(page);
+ }
+ lock_page(page);
+ BUG_ON(!PageUptodate(page));
+ ClearPageUptodate(page);
+ m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
+ err = ntfs_mft_record_layout(vol, mft_no, m);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
+ (long long)mft_no);
+ SetPageUptodate(page);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ return err;
+ }
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ /*
+ * Make sure the mft record is written out to disk. We could use
+ * ilookup5() to check if an inode is in icache and so on but this is
+ * unnecessary as ntfs_writepage() will write the dirty record anyway.
+ */
+ mark_ntfs_record_dirty(page, ofs);
+ ntfs_unmap_page(page);
+ ntfs_debug("Done.");
+ return 0;
+}
+
+/**
+ * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
+ * @vol: [IN] volume on which to allocate the mft record
+ * @mode: [IN] mode if want a file or directory, i.e. base inode or 0
+ * @base_ni: [IN] open base inode if allocating an extent mft record or NULL
+ * @mrec: [OUT] on successful return this is the mapped mft record
+ *
+ * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
+ *
+ * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
+ * direvctory inode, and allocate it at the default allocator position. In
+ * this case @mode is the file mode as given to us by the caller. We in
+ * particular use @mode to distinguish whether a file or a directory is being
+ * created (S_IFDIR(mode) and S_IFREG(mode), respectively).
+ *
+ * If @base_ni is not NULL make the allocated mft record an extent record,
+ * allocate it starting at the mft record after the base mft record and attach
+ * the allocated and opened ntfs inode to the base inode @base_ni. In this
+ * case @mode must be 0 as it is meaningless for extent inodes.
+ *
+ * You need to check the return value with IS_ERR(). If false, the function
+ * was successful and the return value is the now opened ntfs inode of the
+ * allocated mft record. *@mrec is then set to the allocated, mapped, pinned,
+ * and locked mft record. If IS_ERR() is true, the function failed and the
+ * error code is obtained from PTR_ERR(return value). *@mrec is undefined in
+ * this case.
+ *
+ * Allocation strategy:
+ *
+ * To find a free mft record, we scan the mft bitmap for a zero bit. To
+ * optimize this we start scanning at the place specified by @base_ni or if
+ * @base_ni is NULL we start where we last stopped and we perform wrap around
+ * when we reach the end. Note, we do not try to allocate mft records below
+ * number 24 because numbers 0 to 15 are the defined system files anyway and 16
+ * to 24 are special in that they are used for storing extension mft records
+ * for the $DATA attribute of $MFT. This is required to avoid the possibility
+ * of creating a runlist with a circular dependency which once written to disk
+ * can never be read in again. Windows will only use records 16 to 24 for
+ * normal files if the volume is completely out of space. We never use them
+ * which means that when the volume is really out of space we cannot create any
+ * more files while Windows can still create up to 8 small files. We can start
+ * doing this at some later time, it does not matter much for now.
+ *
+ * When scanning the mft bitmap, we only search up to the last allocated mft
+ * record. If there are no free records left in the range 24 to number of
+ * allocated mft records, then we extend the $MFT/$DATA attribute in order to
+ * create free mft records. We extend the allocated size of $MFT/$DATA by 16
+ * records at a time or one cluster, if cluster size is above 16kiB. If there
+ * is not sufficient space to do this, we try to extend by a single mft record
+ * or one cluster, if cluster size is above the mft record size.
+ *
+ * No matter how many mft records we allocate, we initialize only the first
+ * allocated mft record, incrementing mft data size and initialized size
+ * accordingly, open an ntfs_inode for it and return it to the caller, unless
+ * there are less than 24 mft records, in which case we allocate and initialize
+ * mft records until we reach record 24 which we consider as the first free mft
+ * record for use by normal files.
+ *
+ * If during any stage we overflow the initialized data in the mft bitmap, we
+ * extend the initialized size (and data size) by 8 bytes, allocating another
+ * cluster if required. The bitmap data size has to be at least equal to the
+ * number of mft records in the mft, but it can be bigger, in which case the
+ * superflous bits are padded with zeroes.
+ *
+ * Thus, when we return successfully (IS_ERR() is false), we will have:
+ * - initialized / extended the mft bitmap if necessary,
+ * - initialized / extended the mft data if necessary,
+ * - set the bit corresponding to the mft record being allocated in the
+ * mft bitmap,
+ * - opened an ntfs_inode for the allocated mft record, and we will have
+ * - returned the ntfs_inode as well as the allocated mapped, pinned, and
+ * locked mft record.
+ *
+ * On error, the volume will be left in a consistent state and no record will
+ * be allocated. If rolling back a partial operation fails, we may leave some
+ * inconsistent metadata in which case we set NVolErrors() so the volume is
+ * left dirty when unmounted.
+ *
+ * Note, this function cannot make use of most of the normal functions, like
+ * for example for attribute resizing, etc, because when the run list overflows
+ * the base mft record and an attribute list is used, it is very important that
+ * the extension mft records used to store the $DATA attribute of $MFT can be
+ * reached without having to read the information contained inside them, as
+ * this would make it impossible to find them in the first place after the
+ * volume is unmounted. $MFT/$BITMAP probably does not need to follow this
+ * rule because the bitmap is not essential for finding the mft records, but on
+ * the other hand, handling the bitmap in this special way would make life
+ * easier because otherwise there might be circular invocations of functions
+ * when reading the bitmap.
+ */
+ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
+ ntfs_inode *base_ni, MFT_RECORD **mrec)
+{
+ s64 ll, bit, old_data_initialized, old_data_size;
+ unsigned long flags;
+ struct inode *vi;
+ struct page *page;
+ ntfs_inode *mft_ni, *mftbmp_ni, *ni;
+ ntfs_attr_search_ctx *ctx;
+ MFT_RECORD *m;
+ ATTR_RECORD *a;
+ pgoff_t index;
+ unsigned int ofs;
+ int err;
+ le16 seq_no, usn;
+ bool record_formatted = false;
+
+ if (base_ni) {
+ ntfs_debug("Entering (allocating an extent mft record for "
+ "base mft record 0x%llx).",
+ (long long)base_ni->mft_no);
+ /* @mode and @base_ni are mutually exclusive. */
+ BUG_ON(mode);
+ } else
+ ntfs_debug("Entering (allocating a base mft record).");
+ if (mode) {
+ /* @mode and @base_ni are mutually exclusive. */
+ BUG_ON(base_ni);
+ /* We only support creation of normal files and directories. */
+ if (!S_ISREG(mode) && !S_ISDIR(mode))
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ BUG_ON(!mrec);
+ mft_ni = NTFS_I(vol->mft_ino);
+ mftbmp_ni = NTFS_I(vol->mftbmp_ino);
+ down_write(&vol->mftbmp_lock);
+ bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
+ if (bit >= 0) {
+ ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
+ (long long)bit);
+ goto have_alloc_rec;
+ }
+ if (bit != -ENOSPC) {
+ up_write(&vol->mftbmp_lock);
+ return ERR_PTR(bit);
+ }
+ /*
+ * No free mft records left. If the mft bitmap already covers more
+ * than the currently used mft records, the next records are all free,
+ * so we can simply allocate the first unused mft record.
+ * Note: We also have to make sure that the mft bitmap at least covers
+ * the first 24 mft records as they are special and whilst they may not
+ * be in use, we do not allocate from them.
+ */
+ read_lock_irqsave(&mft_ni->size_lock, flags);
+ ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
+ read_unlock_irqrestore(&mft_ni->size_lock, flags);
+ read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+ old_data_initialized = mftbmp_ni->initialized_size;
+ read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+ if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
+ bit = ll;
+ if (bit < 24)
+ bit = 24;
+ if (unlikely(bit >= (1ll << 32)))
+ goto max_err_out;
+ ntfs_debug("Found free record (#2), bit 0x%llx.",
+ (long long)bit);
+ goto found_free_rec;
+ }
+ /*
+ * The mft bitmap needs to be expanded until it covers the first unused
+ * mft record that we can allocate.
+ * Note: The smallest mft record we allocate is mft record 24.
+ */
+ bit = old_data_initialized << 3;
+ if (unlikely(bit >= (1ll << 32)))
+ goto max_err_out;
+ read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+ old_data_size = mftbmp_ni->allocated_size;
+ ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
+ "data_size 0x%llx, initialized_size 0x%llx.",
+ (long long)old_data_size,
+ (long long)i_size_read(vol->mftbmp_ino),
+ (long long)old_data_initialized);
+ read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+ if (old_data_initialized + 8 > old_data_size) {
+ /* Need to extend bitmap by one more cluster. */
+ ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
+ err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
+ if (unlikely(err)) {
+ up_write(&vol->mftbmp_lock);
+ goto err_out;
+ }
+#ifdef DEBUG
+ read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+ ntfs_debug("Status of mftbmp after allocation extension: "
+ "allocated_size 0x%llx, data_size 0x%llx, "
+ "initialized_size 0x%llx.",
+ (long long)mftbmp_ni->allocated_size,
+ (long long)i_size_read(vol->mftbmp_ino),
+ (long long)mftbmp_ni->initialized_size);
+ read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
+ }
+ /*
+ * We now have sufficient allocated space, extend the initialized_size
+ * as well as the data_size if necessary and fill the new space with
+ * zeroes.
+ */
+ err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
+ if (unlikely(err)) {
+ up_write(&vol->mftbmp_lock);
+ goto err_out;
+ }
+#ifdef DEBUG
+ read_lock_irqsave(&mftbmp_ni->size_lock, flags);
+ ntfs_debug("Status of mftbmp after initialized extension: "
+ "allocated_size 0x%llx, data_size 0x%llx, "
+ "initialized_size 0x%llx.",
+ (long long)mftbmp_ni->allocated_size,
+ (long long)i_size_read(vol->mftbmp_ino),
+ (long long)mftbmp_ni->initialized_size);
+ read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
+#endif /* DEBUG */
+ ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
+found_free_rec:
+ /* @bit is the found free mft record, allocate it in the mft bitmap. */
+ ntfs_debug("At found_free_rec.");
+ err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
+ up_write(&vol->mftbmp_lock);
+ goto err_out;
+ }
+ ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
+have_alloc_rec:
+ /*
+ * The mft bitmap is now uptodate. Deal with mft data attribute now.
+ * Note, we keep hold of the mft bitmap lock for writing until all
+ * modifications to the mft data attribute are complete, too, as they
+ * will impact decisions for mft bitmap and mft record allocation done
+ * by a parallel allocation and if the lock is not maintained a
+ * parallel allocation could allocate the same mft record as this one.
+ */
+ ll = (bit + 1) << vol->mft_record_size_bits;
+ read_lock_irqsave(&mft_ni->size_lock, flags);
+ old_data_initialized = mft_ni->initialized_size;
+ read_unlock_irqrestore(&mft_ni->size_lock, flags);
+ if (ll <= old_data_initialized) {
+ ntfs_debug("Allocated mft record already initialized.");
+ goto mft_rec_already_initialized;
+ }
+ ntfs_debug("Initializing allocated mft record.");
+ /*
+ * The mft record is outside the initialized data. Extend the mft data
+ * attribute until it covers the allocated record. The loop is only
+ * actually traversed more than once when a freshly formatted volume is
+ * first written to so it optimizes away nicely in the common case.
+ */
+ read_lock_irqsave(&mft_ni->size_lock, flags);
+ ntfs_debug("Status of mft data before extension: "
+ "allocated_size 0x%llx, data_size 0x%llx, "
+ "initialized_size 0x%llx.",
+ (long long)mft_ni->allocated_size,
+ (long long)i_size_read(vol->mft_ino),
+ (long long)mft_ni->initialized_size);
+ while (ll > mft_ni->allocated_size) {
+ read_unlock_irqrestore(&mft_ni->size_lock, flags);
+ err = ntfs_mft_data_extend_allocation_nolock(vol);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to extend mft data "
+ "allocation.");
+ goto undo_mftbmp_alloc_nolock;
+ }
+ read_lock_irqsave(&mft_ni->size_lock, flags);
+ ntfs_debug("Status of mft data after allocation extension: "
+ "allocated_size 0x%llx, data_size 0x%llx, "
+ "initialized_size 0x%llx.",
+ (long long)mft_ni->allocated_size,
+ (long long)i_size_read(vol->mft_ino),
+ (long long)mft_ni->initialized_size);
+ }
+ read_unlock_irqrestore(&mft_ni->size_lock, flags);
+ /*
+ * Extend mft data initialized size (and data size of course) to reach
+ * the allocated mft record, formatting the mft records allong the way.
+ * Note: We only modify the ntfs_inode structure as that is all that is
+ * needed by ntfs_mft_record_format(). We will update the attribute
+ * record itself in one fell swoop later on.
+ */
+ write_lock_irqsave(&mft_ni->size_lock, flags);
+ old_data_initialized = mft_ni->initialized_size;
+ old_data_size = vol->mft_ino->i_size;
+ while (ll > mft_ni->initialized_size) {
+ s64 new_initialized_size, mft_no;
+
+ new_initialized_size = mft_ni->initialized_size +
+ vol->mft_record_size;
+ mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
+ if (new_initialized_size > i_size_read(vol->mft_ino))
+ i_size_write(vol->mft_ino, new_initialized_size);
+ write_unlock_irqrestore(&mft_ni->size_lock, flags);
+ ntfs_debug("Initializing mft record 0x%llx.",
+ (long long)mft_no);
+ err = ntfs_mft_record_format(vol, mft_no);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to format mft record.");
+ goto undo_data_init;
+ }
+ write_lock_irqsave(&mft_ni->size_lock, flags);
+ mft_ni->initialized_size = new_initialized_size;
+ }
+ write_unlock_irqrestore(&mft_ni->size_lock, flags);
+ record_formatted = true;
+ /* Update the mft data attribute record to reflect the new sizes. */
+ m = map_mft_record(mft_ni);
+ if (IS_ERR(m)) {
+ ntfs_error(vol->sb, "Failed to map mft record.");
+ err = PTR_ERR(m);
+ goto undo_data_init;
+ }
+ ctx = ntfs_attr_get_search_ctx(mft_ni, m);
+ if (unlikely(!ctx)) {
+ ntfs_error(vol->sb, "Failed to get search context.");
+ err = -ENOMEM;
+ unmap_mft_record(mft_ni);
+ goto undo_data_init;
+ }
+ err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
+ CASE_SENSITIVE, 0, NULL, 0, ctx);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to find first attribute extent of "
+ "mft data attribute.");
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ goto undo_data_init;
+ }
+ a = ctx->attr;
+ read_lock_irqsave(&mft_ni->size_lock, flags);
+ a->data.non_resident.initialized_size =
+ cpu_to_sle64(mft_ni->initialized_size);
+ a->data.non_resident.data_size =
+ cpu_to_sle64(i_size_read(vol->mft_ino));
+ read_unlock_irqrestore(&mft_ni->size_lock, flags);
+ /* Ensure the changes make it to disk. */
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(mft_ni);
+ read_lock_irqsave(&mft_ni->size_lock, flags);
+ ntfs_debug("Status of mft data after mft record initialization: "
+ "allocated_size 0x%llx, data_size 0x%llx, "
+ "initialized_size 0x%llx.",
+ (long long)mft_ni->allocated_size,
+ (long long)i_size_read(vol->mft_ino),
+ (long long)mft_ni->initialized_size);
+ BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
+ BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
+ read_unlock_irqrestore(&mft_ni->size_lock, flags);
+mft_rec_already_initialized:
+ /*
+ * We can finally drop the mft bitmap lock as the mft data attribute
+ * has been fully updated. The only disparity left is that the
+ * allocated mft record still needs to be marked as in use to match the
+ * set bit in the mft bitmap but this is actually not a problem since
+ * this mft record is not referenced from anywhere yet and the fact
+ * that it is allocated in the mft bitmap means that no-one will try to
+ * allocate it either.
+ */
+ up_write(&vol->mftbmp_lock);
+ /*
+ * We now have allocated and initialized the mft record. Calculate the
+ * index of and the offset within the page cache page the record is in.
+ */
+ index = bit << vol->mft_record_size_bits >> PAGE_SHIFT;
+ ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK;
+ /* Read, map, and pin the page containing the mft record. */
+ page = ntfs_map_page(vol->mft_ino->i_mapping, index);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to map page containing allocated "
+ "mft record 0x%llx.", (long long)bit);
+ err = PTR_ERR(page);
+ goto undo_mftbmp_alloc;
+ }
+ lock_page(page);
+ BUG_ON(!PageUptodate(page));
+ ClearPageUptodate(page);
+ m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
+ /* If we just formatted the mft record no need to do it again. */
+ if (!record_formatted) {
+ /* Sanity check that the mft record is really not in use. */
+ if (ntfs_is_file_record(m->magic) &&
+ (m->flags & MFT_RECORD_IN_USE)) {
+ ntfs_error(vol->sb, "Mft record 0x%llx was marked "
+ "free in mft bitmap but is marked "
+ "used itself. Corrupt filesystem. "
+ "Unmount and run chkdsk.",
+ (long long)bit);
+ err = -EIO;
+ SetPageUptodate(page);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ NVolSetErrors(vol);
+ goto undo_mftbmp_alloc;
+ }
+ /*
+ * We need to (re-)format the mft record, preserving the
+ * sequence number if it is not zero as well as the update
+ * sequence number if it is not zero or -1 (0xffff). This
+ * means we do not need to care whether or not something went
+ * wrong with the previous mft record.
+ */
+ seq_no = m->sequence_number;
+ usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
+ err = ntfs_mft_record_layout(vol, bit, m);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to layout allocated mft "
+ "record 0x%llx.", (long long)bit);
+ SetPageUptodate(page);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ goto undo_mftbmp_alloc;
+ }
+ if (seq_no)
+ m->sequence_number = seq_no;
+ if (usn && le16_to_cpu(usn) != 0xffff)
+ *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
+ }
+ /* Set the mft record itself in use. */
+ m->flags |= MFT_RECORD_IN_USE;
+ if (S_ISDIR(mode))
+ m->flags |= MFT_RECORD_IS_DIRECTORY;
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ if (base_ni) {
+ MFT_RECORD *m_tmp;
+
+ /*
+ * Setup the base mft record in the extent mft record. This
+ * completes initialization of the allocated extent mft record
+ * and we can simply use it with map_extent_mft_record().
+ */
+ m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
+ base_ni->seq_no);
+ /*
+ * Allocate an extent inode structure for the new mft record,
+ * attach it to the base inode @base_ni and map, pin, and lock
+ * its, i.e. the allocated, mft record.
+ */
+ m_tmp = map_extent_mft_record(base_ni, bit, &ni);
+ if (IS_ERR(m_tmp)) {
+ ntfs_error(vol->sb, "Failed to map allocated extent "
+ "mft record 0x%llx.", (long long)bit);
+ err = PTR_ERR(m_tmp);
+ /* Set the mft record itself not in use. */
+ m->flags &= cpu_to_le16(
+ ~le16_to_cpu(MFT_RECORD_IN_USE));
+ flush_dcache_page(page);
+ /* Make sure the mft record is written out to disk. */
+ mark_ntfs_record_dirty(page, ofs);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ goto undo_mftbmp_alloc;
+ }
+ BUG_ON(m != m_tmp);
+ /*
+ * Make sure the allocated mft record is written out to disk.
+ * No need to set the inode dirty because the caller is going
+ * to do that anyway after finishing with the new extent mft
+ * record (e.g. at a minimum a new attribute will be added to
+ * the mft record.
+ */
+ mark_ntfs_record_dirty(page, ofs);
+ unlock_page(page);
+ /*
+ * Need to unmap the page since map_extent_mft_record() mapped
+ * it as well so we have it mapped twice at the moment.
+ */
+ ntfs_unmap_page(page);
+ } else {
+ /*
+ * Allocate a new VFS inode and set it up. NOTE: @vi->i_nlink
+ * is set to 1 but the mft record->link_count is 0. The caller
+ * needs to bear this in mind.
+ */
+ vi = new_inode(vol->sb);
+ if (unlikely(!vi)) {
+ err = -ENOMEM;
+ /* Set the mft record itself not in use. */
+ m->flags &= cpu_to_le16(
+ ~le16_to_cpu(MFT_RECORD_IN_USE));
+ flush_dcache_page(page);
+ /* Make sure the mft record is written out to disk. */
+ mark_ntfs_record_dirty(page, ofs);
+ unlock_page(page);
+ ntfs_unmap_page(page);
+ goto undo_mftbmp_alloc;
+ }
+ vi->i_ino = bit;
+
+ /* The owner and group come from the ntfs volume. */
+ vi->i_uid = vol->uid;
+ vi->i_gid = vol->gid;
+
+ /* Initialize the ntfs specific part of @vi. */
+ ntfs_init_big_inode(vi);
+ ni = NTFS_I(vi);
+ /*
+ * Set the appropriate mode, attribute type, and name. For
+ * directories, also setup the index values to the defaults.
+ */
+ if (S_ISDIR(mode)) {
+ vi->i_mode = S_IFDIR | S_IRWXUGO;
+ vi->i_mode &= ~vol->dmask;
+
+ NInoSetMstProtected(ni);
+ ni->type = AT_INDEX_ALLOCATION;
+ ni->name = I30;
+ ni->name_len = 4;
+
+ ni->itype.index.block_size = 4096;
+ ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1;
+ ni->itype.index.collation_rule = COLLATION_FILE_NAME;
+ if (vol->cluster_size <= ni->itype.index.block_size) {
+ ni->itype.index.vcn_size = vol->cluster_size;
+ ni->itype.index.vcn_size_bits =
+ vol->cluster_size_bits;
+ } else {
+ ni->itype.index.vcn_size = vol->sector_size;
+ ni->itype.index.vcn_size_bits =
+ vol->sector_size_bits;
+ }
+ } else {
+ vi->i_mode = S_IFREG | S_IRWXUGO;
+ vi->i_mode &= ~vol->fmask;
+
+ ni->type = AT_DATA;
+ ni->name = NULL;
+ ni->name_len = 0;
+ }
+ if (IS_RDONLY(vi))
+ vi->i_mode &= ~S_IWUGO;
+
+ /* Set the inode times to the current time. */
+ vi->i_atime = vi->i_mtime = vi->i_ctime =
+ current_time(vi);
+ /*
+ * Set the file size to 0, the ntfs inode sizes are set to 0 by
+ * the call to ntfs_init_big_inode() below.
+ */
+ vi->i_size = 0;
+ vi->i_blocks = 0;
+
+ /* Set the sequence number. */
+ vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
+ /*
+ * Manually map, pin, and lock the mft record as we already
+ * have its page mapped and it is very easy to do.
+ */
+ atomic_inc(&ni->count);
+ mutex_lock(&ni->mrec_lock);
+ ni->page = page;
+ ni->page_ofs = ofs;
+ /*
+ * Make sure the allocated mft record is written out to disk.
+ * NOTE: We do not set the ntfs inode dirty because this would
+ * fail in ntfs_write_inode() because the inode does not have a
+ * standard information attribute yet. Also, there is no need
+ * to set the inode dirty because the caller is going to do
+ * that anyway after finishing with the new mft record (e.g. at
+ * a minimum some new attributes will be added to the mft
+ * record.
+ */
+ mark_ntfs_record_dirty(page, ofs);
+ unlock_page(page);
+
+ /* Add the inode to the inode hash for the superblock. */
+ insert_inode_hash(vi);
+
+ /* Update the default mft allocation position. */
+ vol->mft_data_pos = bit + 1;
+ }
+ /*
+ * Return the opened, allocated inode of the allocated mft record as
+ * well as the mapped, pinned, and locked mft record.
+ */
+ ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
+ base_ni ? "extent " : "", (long long)bit);
+ *mrec = m;
+ return ni;
+undo_data_init:
+ write_lock_irqsave(&mft_ni->size_lock, flags);
+ mft_ni->initialized_size = old_data_initialized;
+ i_size_write(vol->mft_ino, old_data_size);
+ write_unlock_irqrestore(&mft_ni->size_lock, flags);
+ goto undo_mftbmp_alloc_nolock;
+undo_mftbmp_alloc:
+ down_write(&vol->mftbmp_lock);
+undo_mftbmp_alloc_nolock:
+ if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
+ ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
+ NVolSetErrors(vol);
+ }
+ up_write(&vol->mftbmp_lock);
+err_out:
+ return ERR_PTR(err);
+max_err_out:
+ ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
+ "number of inodes (2^32) has already been reached.");
+ up_write(&vol->mftbmp_lock);
+ return ERR_PTR(-ENOSPC);
+}
+
+/**
+ * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
+ * @ni: ntfs inode of the mapped extent mft record to free
+ * @m: mapped extent mft record of the ntfs inode @ni
+ *
+ * Free the mapped extent mft record @m of the extent ntfs inode @ni.
+ *
+ * Note that this function unmaps the mft record and closes and destroys @ni
+ * internally and hence you cannot use either @ni nor @m any more after this
+ * function returns success.
+ *
+ * On success return 0 and on error return -errno. @ni and @m are still valid
+ * in this case and have not been freed.
+ *
+ * For some errors an error message is displayed and the success code 0 is
+ * returned and the volume is then left dirty on umount. This makes sense in
+ * case we could not rollback the changes that were already done since the
+ * caller no longer wants to reference this mft record so it does not matter to
+ * the caller if something is wrong with it as long as it is properly detached
+ * from the base inode.
+ */
+int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
+{
+ unsigned long mft_no = ni->mft_no;
+ ntfs_volume *vol = ni->vol;
+ ntfs_inode *base_ni;
+ ntfs_inode **extent_nis;
+ int i, err;
+ le16 old_seq_no;
+ u16 seq_no;
+
+ BUG_ON(NInoAttr(ni));
+ BUG_ON(ni->nr_extents != -1);
+
+ mutex_lock(&ni->extent_lock);
+ base_ni = ni->ext.base_ntfs_ino;
+ mutex_unlock(&ni->extent_lock);
+
+ BUG_ON(base_ni->nr_extents <= 0);
+
+ ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
+ mft_no, base_ni->mft_no);
+
+ mutex_lock(&base_ni->extent_lock);
+
+ /* Make sure we are holding the only reference to the extent inode. */
+ if (atomic_read(&ni->count) > 2) {
+ ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
+ "not freeing.", base_ni->mft_no);
+ mutex_unlock(&base_ni->extent_lock);
+ return -EBUSY;
+ }
+
+ /* Dissociate the ntfs inode from the base inode. */
+ extent_nis = base_ni->ext.extent_ntfs_inos;
+ err = -ENOENT;
+ for (i = 0; i < base_ni->nr_extents; i++) {
+ if (ni != extent_nis[i])
+ continue;
+ extent_nis += i;
+ base_ni->nr_extents--;
+ memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
+ sizeof(ntfs_inode*));
+ err = 0;
+ break;
+ }
+
+ mutex_unlock(&base_ni->extent_lock);
+
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
+ "its base inode 0x%lx.", mft_no,
+ base_ni->mft_no);
+ BUG();
+ }
+
+ /*
+ * The extent inode is no longer attached to the base inode so no one
+ * can get a reference to it any more.
+ */
+
+ /* Mark the mft record as not in use. */
+ m->flags &= ~MFT_RECORD_IN_USE;
+
+ /* Increment the sequence number, skipping zero, if it is not zero. */
+ old_seq_no = m->sequence_number;
+ seq_no = le16_to_cpu(old_seq_no);
+ if (seq_no == 0xffff)
+ seq_no = 1;
+ else if (seq_no)
+ seq_no++;
+ m->sequence_number = cpu_to_le16(seq_no);
+
+ /*
+ * Set the ntfs inode dirty and write it out. We do not need to worry
+ * about the base inode here since whatever caused the extent mft
+ * record to be freed is guaranteed to do it already.
+ */
+ NInoSetDirty(ni);
+ err = write_mft_record(ni, m, 0);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
+ "freeing.", mft_no);
+ goto rollback;
+ }
+rollback_error:
+ /* Unmap and throw away the now freed extent inode. */
+ unmap_extent_mft_record(ni);
+ ntfs_clear_extent_inode(ni);
+
+ /* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
+ down_write(&vol->mftbmp_lock);
+ err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
+ up_write(&vol->mftbmp_lock);
+ if (unlikely(err)) {
+ /*
+ * The extent inode is gone but we failed to deallocate it in
+ * the mft bitmap. Just emit a warning and leave the volume
+ * dirty on umount.
+ */
+ ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
+ NVolSetErrors(vol);
+ }
+ return 0;
+rollback:
+ /* Rollback what we did... */
+ mutex_lock(&base_ni->extent_lock);
+ extent_nis = base_ni->ext.extent_ntfs_inos;
+ if (!(base_ni->nr_extents & 3)) {
+ int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
+
+ extent_nis = kmalloc(new_size, GFP_NOFS);
+ if (unlikely(!extent_nis)) {
+ ntfs_error(vol->sb, "Failed to allocate internal "
+ "buffer during rollback.%s", es);
+ mutex_unlock(&base_ni->extent_lock);
+ NVolSetErrors(vol);
+ goto rollback_error;
+ }
+ if (base_ni->nr_extents) {
+ BUG_ON(!base_ni->ext.extent_ntfs_inos);
+ memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
+ new_size - 4 * sizeof(ntfs_inode*));
+ kfree(base_ni->ext.extent_ntfs_inos);
+ }
+ base_ni->ext.extent_ntfs_inos = extent_nis;
+ }
+ m->flags |= MFT_RECORD_IN_USE;
+ m->sequence_number = old_seq_no;
+ extent_nis[base_ni->nr_extents++] = ni;
+ mutex_unlock(&base_ni->extent_lock);
+ mark_mft_record_dirty(ni);
+ return err;
+}
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
new file mode 100644
index 000000000..49c001af1
--- /dev/null
+++ b/fs/ntfs/mft.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * mft.h - Defines for mft record handling in NTFS Linux kernel driver.
+ * Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_MFT_H
+#define _LINUX_NTFS_MFT_H
+
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+
+#include "inode.h"
+
+extern MFT_RECORD *map_mft_record(ntfs_inode *ni);
+extern void unmap_mft_record(ntfs_inode *ni);
+
+extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
+ ntfs_inode **ntfs_ino);
+
+static inline void unmap_extent_mft_record(ntfs_inode *ni)
+{
+ unmap_mft_record(ni);
+ return;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * flush_dcache_mft_record_page - flush_dcache_page() for mft records
+ * @ni: ntfs inode structure of mft record
+ *
+ * Call flush_dcache_page() for the page in which an mft record resides.
+ *
+ * This must be called every time an mft record is modified, just after the
+ * modification.
+ */
+static inline void flush_dcache_mft_record_page(ntfs_inode *ni)
+{
+ flush_dcache_page(ni->page);
+}
+
+extern void __mark_mft_record_dirty(ntfs_inode *ni);
+
+/**
+ * mark_mft_record_dirty - set the mft record and the page containing it dirty
+ * @ni: ntfs inode describing the mapped mft record
+ *
+ * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
+ * as well as the page containing the mft record, dirty. Also, mark the base
+ * vfs inode dirty. This ensures that any changes to the mft record are
+ * written out to disk.
+ *
+ * NOTE: Do not do anything if the mft record is already marked dirty.
+ */
+static inline void mark_mft_record_dirty(ntfs_inode *ni)
+{
+ if (!NInoTestSetDirty(ni))
+ __mark_mft_record_dirty(ni);
+}
+
+extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
+ MFT_RECORD *m, int sync);
+
+extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
+
+/**
+ * write_mft_record - write out a mapped (extent) mft record
+ * @ni: ntfs inode describing the mapped (extent) mft record
+ * @m: mapped (extent) mft record to write
+ * @sync: if true, wait for i/o completion
+ *
+ * This is just a wrapper for write_mft_record_nolock() (see mft.c), which
+ * locks the page for the duration of the write. This ensures that there are
+ * no race conditions between writing the mft record via the dirty inode code
+ * paths and via the page cache write back code paths or between writing
+ * neighbouring mft records residing in the same page.
+ *
+ * Locking the page also serializes us against ->read_folio() if the page is not
+ * uptodate.
+ *
+ * On success, clean the mft record and return 0. On error, leave the mft
+ * record dirty and return -errno.
+ */
+static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
+{
+ struct page *page = ni->page;
+ int err;
+
+ BUG_ON(!page);
+ lock_page(page);
+ err = write_mft_record_nolock(ni, m, sync);
+ unlock_page(page);
+ return err;
+}
+
+extern bool ntfs_may_write_mft_record(ntfs_volume *vol,
+ const unsigned long mft_no, const MFT_RECORD *m,
+ ntfs_inode **locked_ni);
+
+extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
+ ntfs_inode *base_ni, MFT_RECORD **mrec);
+extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_MFT_H */
diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c
new file mode 100644
index 000000000..16b3c884a
--- /dev/null
+++ b/fs/ntfs/mst.c
@@ -0,0 +1,189 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * mst.c - NTFS multi sector transfer protection handling code. Part of the
+ * Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2004 Anton Altaparmakov
+ */
+
+#include "ntfs.h"
+
+/**
+ * post_read_mst_fixup - deprotect multi sector transfer protected data
+ * @b: pointer to the data to deprotect
+ * @size: size in bytes of @b
+ *
+ * Perform the necessary post read multi sector transfer fixup and detect the
+ * presence of incomplete multi sector transfers. - In that case, overwrite the
+ * magic of the ntfs record header being processed with "BAAD" (in memory only!)
+ * and abort processing.
+ *
+ * Return 0 on success and -EINVAL on error ("BAAD" magic will be present).
+ *
+ * NOTE: We consider the absence / invalidity of an update sequence array to
+ * mean that the structure is not protected at all and hence doesn't need to
+ * be fixed up. Thus, we return success and not failure in this case. This is
+ * in contrast to pre_write_mst_fixup(), see below.
+ */
+int post_read_mst_fixup(NTFS_RECORD *b, const u32 size)
+{
+ u16 usa_ofs, usa_count, usn;
+ u16 *usa_pos, *data_pos;
+
+ /* Setup the variables. */
+ usa_ofs = le16_to_cpu(b->usa_ofs);
+ /* Decrement usa_count to get number of fixups. */
+ usa_count = le16_to_cpu(b->usa_count) - 1;
+ /* Size and alignment checks. */
+ if ( size & (NTFS_BLOCK_SIZE - 1) ||
+ usa_ofs & 1 ||
+ usa_ofs + (usa_count * 2) > size ||
+ (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
+ return 0;
+ /* Position of usn in update sequence array. */
+ usa_pos = (u16*)b + usa_ofs/sizeof(u16);
+ /*
+ * The update sequence number which has to be equal to each of the
+ * u16 values before they are fixed up. Note no need to care for
+ * endianness since we are comparing and moving data for on disk
+ * structures which means the data is consistent. - If it is
+ * consistenty the wrong endianness it doesn't make any difference.
+ */
+ usn = *usa_pos;
+ /*
+ * Position in protected data of first u16 that needs fixing up.
+ */
+ data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
+ /*
+ * Check for incomplete multi sector transfer(s).
+ */
+ while (usa_count--) {
+ if (*data_pos != usn) {
+ /*
+ * Incomplete multi sector transfer detected! )-:
+ * Set the magic to "BAAD" and return failure.
+ * Note that magic_BAAD is already converted to le32.
+ */
+ b->magic = magic_BAAD;
+ return -EINVAL;
+ }
+ data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
+ }
+ /* Re-setup the variables. */
+ usa_count = le16_to_cpu(b->usa_count) - 1;
+ data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
+ /* Fixup all sectors. */
+ while (usa_count--) {
+ /*
+ * Increment position in usa and restore original data from
+ * the usa into the data buffer.
+ */
+ *data_pos = *(++usa_pos);
+ /* Increment position in data as well. */
+ data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
+ }
+ return 0;
+}
+
+/**
+ * pre_write_mst_fixup - apply multi sector transfer protection
+ * @b: pointer to the data to protect
+ * @size: size in bytes of @b
+ *
+ * Perform the necessary pre write multi sector transfer fixup on the data
+ * pointer to by @b of @size.
+ *
+ * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed
+ * (assumed not needed). This is in contrast to post_read_mst_fixup() above.
+ *
+ * NOTE: We consider the absence / invalidity of an update sequence array to
+ * mean that the structure is not subject to protection and hence doesn't need
+ * to be fixed up. This means that you have to create a valid update sequence
+ * array header in the ntfs record before calling this function, otherwise it
+ * will fail (the header needs to contain the position of the update sequence
+ * array together with the number of elements in the array). You also need to
+ * initialise the update sequence number before calling this function
+ * otherwise a random word will be used (whatever was in the record at that
+ * position at that time).
+ */
+int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size)
+{
+ le16 *usa_pos, *data_pos;
+ u16 usa_ofs, usa_count, usn;
+ le16 le_usn;
+
+ /* Sanity check + only fixup if it makes sense. */
+ if (!b || ntfs_is_baad_record(b->magic) ||
+ ntfs_is_hole_record(b->magic))
+ return -EINVAL;
+ /* Setup the variables. */
+ usa_ofs = le16_to_cpu(b->usa_ofs);
+ /* Decrement usa_count to get number of fixups. */
+ usa_count = le16_to_cpu(b->usa_count) - 1;
+ /* Size and alignment checks. */
+ if ( size & (NTFS_BLOCK_SIZE - 1) ||
+ usa_ofs & 1 ||
+ usa_ofs + (usa_count * 2) > size ||
+ (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
+ return -EINVAL;
+ /* Position of usn in update sequence array. */
+ usa_pos = (le16*)((u8*)b + usa_ofs);
+ /*
+ * Cyclically increment the update sequence number
+ * (skipping 0 and -1, i.e. 0xffff).
+ */
+ usn = le16_to_cpup(usa_pos) + 1;
+ if (usn == 0xffff || !usn)
+ usn = 1;
+ le_usn = cpu_to_le16(usn);
+ *usa_pos = le_usn;
+ /* Position in data of first u16 that needs fixing up. */
+ data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
+ /* Fixup all sectors. */
+ while (usa_count--) {
+ /*
+ * Increment the position in the usa and save the
+ * original data from the data buffer into the usa.
+ */
+ *(++usa_pos) = *data_pos;
+ /* Apply fixup to data. */
+ *data_pos = le_usn;
+ /* Increment position in data as well. */
+ data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
+ }
+ return 0;
+}
+
+/**
+ * post_write_mst_fixup - fast deprotect multi sector transfer protected data
+ * @b: pointer to the data to deprotect
+ *
+ * Perform the necessary post write multi sector transfer fixup, not checking
+ * for any errors, because we assume we have just used pre_write_mst_fixup(),
+ * thus the data will be fine or we would never have gotten here.
+ */
+void post_write_mst_fixup(NTFS_RECORD *b)
+{
+ le16 *usa_pos, *data_pos;
+
+ u16 usa_ofs = le16_to_cpu(b->usa_ofs);
+ u16 usa_count = le16_to_cpu(b->usa_count) - 1;
+
+ /* Position of usn in update sequence array. */
+ usa_pos = (le16*)b + usa_ofs/sizeof(le16);
+
+ /* Position in protected data of first u16 that needs fixing up. */
+ data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
+
+ /* Fixup all sectors. */
+ while (usa_count--) {
+ /*
+ * Increment position in usa and restore original data from
+ * the usa into the data buffer.
+ */
+ *data_pos = *(++usa_pos);
+
+ /* Increment position in data as well. */
+ data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
+ }
+}
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
new file mode 100644
index 000000000..4e6a44bc6
--- /dev/null
+++ b/fs/ntfs/namei.c
@@ -0,0 +1,391 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS
+ * project.
+ *
+ * Copyright (c) 2001-2006 Anton Altaparmakov
+ */
+
+#include <linux/dcache.h>
+#include <linux/exportfs.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+
+#include "attrib.h"
+#include "debug.h"
+#include "dir.h"
+#include "mft.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_lookup - find the inode represented by a dentry in a directory inode
+ * @dir_ino: directory inode in which to look for the inode
+ * @dent: dentry representing the inode to look for
+ * @flags: lookup flags
+ *
+ * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
+ * in the directory inode @dir_ino and if found attaches the inode to the
+ * dentry @dent.
+ *
+ * In more detail, the dentry @dent specifies which inode to look for by
+ * supplying the name of the inode in @dent->d_name.name. ntfs_lookup()
+ * converts the name to Unicode and walks the contents of the directory inode
+ * @dir_ino looking for the converted Unicode name. If the name is found in the
+ * directory, the corresponding inode is loaded by calling ntfs_iget() on its
+ * inode number and the inode is associated with the dentry @dent via a call to
+ * d_splice_alias().
+ *
+ * If the name is not found in the directory, a NULL inode is inserted into the
+ * dentry @dent via a call to d_add(). The dentry is then termed a negative
+ * dentry.
+ *
+ * Only if an actual error occurs, do we return an error via ERR_PTR().
+ *
+ * In order to handle the case insensitivity issues of NTFS with regards to the
+ * dcache and the dcache requiring only one dentry per directory, we deal with
+ * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining
+ * a case sensitive dcache. This means that we get the full benefit of dcache
+ * speed when the file/directory is looked up with the same case as returned by
+ * ->ntfs_readdir() but that a lookup for any other case (or for the short file
+ * name) will not find anything in dcache and will enter ->ntfs_lookup()
+ * instead, where we search the directory for a fully matching file name
+ * (including case) and if that is not found, we search for a file name that
+ * matches with different case and if that has non-POSIX semantics we return
+ * that. We actually do only one search (case sensitive) and keep tabs on
+ * whether we have found a case insensitive match in the process.
+ *
+ * To simplify matters for us, we do not treat the short vs long filenames as
+ * two hard links but instead if the lookup matches a short filename, we
+ * return the dentry for the corresponding long filename instead.
+ *
+ * There are three cases we need to distinguish here:
+ *
+ * 1) @dent perfectly matches (i.e. including case) a directory entry with a
+ * file name in the WIN32 or POSIX namespaces. In this case
+ * ntfs_lookup_inode_by_name() will return with name set to NULL and we
+ * just d_splice_alias() @dent.
+ * 2) @dent matches (not including case) a directory entry with a file name in
+ * the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return
+ * with name set to point to a kmalloc()ed ntfs_name structure containing
+ * the properly cased little endian Unicode name. We convert the name to the
+ * current NLS code page, search if a dentry with this name already exists
+ * and if so return that instead of @dent. At this point things are
+ * complicated by the possibility of 'disconnected' dentries due to NFS
+ * which we deal with appropriately (see the code comments). The VFS will
+ * then destroy the old @dent and use the one we returned. If a dentry is
+ * not found, we allocate a new one, d_splice_alias() it, and return it as
+ * above.
+ * 3) @dent matches either perfectly or not (i.e. we don't care about case) a
+ * directory entry with a file name in the DOS namespace. In this case
+ * ntfs_lookup_inode_by_name() will return with name set to point to a
+ * kmalloc()ed ntfs_name structure containing the mft reference (cpu endian)
+ * of the inode. We use the mft reference to read the inode and to find the
+ * file name in the WIN32 namespace corresponding to the matched short file
+ * name. We then convert the name to the current NLS code page, and proceed
+ * searching for a dentry with this name, etc, as in case 2), above.
+ *
+ * Locking: Caller must hold i_mutex on the directory.
+ */
+static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
+ unsigned int flags)
+{
+ ntfs_volume *vol = NTFS_SB(dir_ino->i_sb);
+ struct inode *dent_inode;
+ ntfschar *uname;
+ ntfs_name *name = NULL;
+ MFT_REF mref;
+ unsigned long dent_ino;
+ int uname_len;
+
+ ntfs_debug("Looking up %pd in directory inode 0x%lx.",
+ dent, dir_ino->i_ino);
+ /* Convert the name of the dentry to Unicode. */
+ uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len,
+ &uname);
+ if (uname_len < 0) {
+ if (uname_len != -ENAMETOOLONG)
+ ntfs_error(vol->sb, "Failed to convert name to "
+ "Unicode.");
+ return ERR_PTR(uname_len);
+ }
+ mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len,
+ &name);
+ kmem_cache_free(ntfs_name_cache, uname);
+ if (!IS_ERR_MREF(mref)) {
+ dent_ino = MREF(mref);
+ ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
+ dent_inode = ntfs_iget(vol->sb, dent_ino);
+ if (!IS_ERR(dent_inode)) {
+ /* Consistency check. */
+ if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
+ NTFS_I(dent_inode)->seq_no ||
+ dent_ino == FILE_MFT) {
+ /* Perfect WIN32/POSIX match. -- Case 1. */
+ if (!name) {
+ ntfs_debug("Done. (Case 1.)");
+ return d_splice_alias(dent_inode, dent);
+ }
+ /*
+ * We are too indented. Handle imperfect
+ * matches and short file names further below.
+ */
+ goto handle_name;
+ }
+ ntfs_error(vol->sb, "Found stale reference to inode "
+ "0x%lx (reference sequence number = "
+ "0x%x, inode sequence number = 0x%x), "
+ "returning -EIO. Run chkdsk.",
+ dent_ino, MSEQNO(mref),
+ NTFS_I(dent_inode)->seq_no);
+ iput(dent_inode);
+ dent_inode = ERR_PTR(-EIO);
+ } else
+ ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with "
+ "error code %li.", dent_ino,
+ PTR_ERR(dent_inode));
+ kfree(name);
+ /* Return the error code. */
+ return ERR_CAST(dent_inode);
+ }
+ /* It is guaranteed that @name is no longer allocated at this point. */
+ if (MREF_ERR(mref) == -ENOENT) {
+ ntfs_debug("Entry was not found, adding negative dentry.");
+ /* The dcache will handle negative entries. */
+ d_add(dent, NULL);
+ ntfs_debug("Done.");
+ return NULL;
+ }
+ ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error "
+ "code %i.", -MREF_ERR(mref));
+ return ERR_PTR(MREF_ERR(mref));
+ // TODO: Consider moving this lot to a separate function! (AIA)
+handle_name:
+ {
+ MFT_RECORD *m;
+ ntfs_attr_search_ctx *ctx;
+ ntfs_inode *ni = NTFS_I(dent_inode);
+ int err;
+ struct qstr nls_name;
+
+ nls_name.name = NULL;
+ if (name->type != FILE_NAME_DOS) { /* Case 2. */
+ ntfs_debug("Case 2.");
+ nls_name.len = (unsigned)ntfs_ucstonls(vol,
+ (ntfschar*)&name->name, name->len,
+ (unsigned char**)&nls_name.name, 0);
+ kfree(name);
+ } else /* if (name->type == FILE_NAME_DOS) */ { /* Case 3. */
+ FILE_NAME_ATTR *fn;
+
+ ntfs_debug("Case 3.");
+ kfree(name);
+
+ /* Find the WIN32 name corresponding to the matched DOS name. */
+ ni = NTFS_I(dent_inode);
+ m = map_mft_record(ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ m = NULL;
+ ctx = NULL;
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(ni, m);
+ if (unlikely(!ctx)) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ do {
+ ATTR_RECORD *a;
+ u32 val_len;
+
+ err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0,
+ NULL, 0, ctx);
+ if (unlikely(err)) {
+ ntfs_error(vol->sb, "Inode corrupt: No WIN32 "
+ "namespace counterpart to DOS "
+ "file name. Run chkdsk.");
+ if (err == -ENOENT)
+ err = -EIO;
+ goto err_out;
+ }
+ /* Consistency checks. */
+ a = ctx->attr;
+ if (a->non_resident || a->flags)
+ goto eio_err_out;
+ val_len = le32_to_cpu(a->data.resident.value_length);
+ if (le16_to_cpu(a->data.resident.value_offset) +
+ val_len > le32_to_cpu(a->length))
+ goto eio_err_out;
+ fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu(
+ ctx->attr->data.resident.value_offset));
+ if ((u32)(fn->file_name_length * sizeof(ntfschar) +
+ sizeof(FILE_NAME_ATTR)) > val_len)
+ goto eio_err_out;
+ } while (fn->file_name_type != FILE_NAME_WIN32);
+
+ /* Convert the found WIN32 name to current NLS code page. */
+ nls_name.len = (unsigned)ntfs_ucstonls(vol,
+ (ntfschar*)&fn->file_name, fn->file_name_length,
+ (unsigned char**)&nls_name.name, 0);
+
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(ni);
+ }
+ m = NULL;
+ ctx = NULL;
+
+ /* Check if a conversion error occurred. */
+ if ((signed)nls_name.len < 0) {
+ err = (signed)nls_name.len;
+ goto err_out;
+ }
+ nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len);
+
+ dent = d_add_ci(dent, dent_inode, &nls_name);
+ kfree(nls_name.name);
+ return dent;
+
+eio_err_out:
+ ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
+ err = -EIO;
+err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ if (m)
+ unmap_mft_record(ni);
+ iput(dent_inode);
+ ntfs_error(vol->sb, "Failed, returning error code %i.", err);
+ return ERR_PTR(err);
+ }
+}
+
+/**
+ * Inode operations for directories.
+ */
+const struct inode_operations ntfs_dir_inode_ops = {
+ .lookup = ntfs_lookup, /* VFS: Lookup directory. */
+};
+
+/**
+ * ntfs_get_parent - find the dentry of the parent of a given directory dentry
+ * @child_dent: dentry of the directory whose parent directory to find
+ *
+ * Find the dentry for the parent directory of the directory specified by the
+ * dentry @child_dent. This function is called from
+ * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the
+ * default ->decode_fh() which is export_decode_fh() in the same file.
+ *
+ * The code is based on the ext3 ->get_parent() implementation found in
+ * fs/ext3/namei.c::ext3_get_parent().
+ *
+ * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down.
+ *
+ * Return the dentry of the parent directory on success or the error code on
+ * error (IS_ERR() is true).
+ */
+static struct dentry *ntfs_get_parent(struct dentry *child_dent)
+{
+ struct inode *vi = d_inode(child_dent);
+ ntfs_inode *ni = NTFS_I(vi);
+ MFT_RECORD *mrec;
+ ntfs_attr_search_ctx *ctx;
+ ATTR_RECORD *attr;
+ FILE_NAME_ATTR *fn;
+ unsigned long parent_ino;
+ int err;
+
+ ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
+ /* Get the mft record of the inode belonging to the child dentry. */
+ mrec = map_mft_record(ni);
+ if (IS_ERR(mrec))
+ return ERR_CAST(mrec);
+ /* Find the first file name attribute in the mft record. */
+ ctx = ntfs_attr_get_search_ctx(ni, mrec);
+ if (unlikely(!ctx)) {
+ unmap_mft_record(ni);
+ return ERR_PTR(-ENOMEM);
+ }
+try_next:
+ err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL,
+ 0, ctx);
+ if (unlikely(err)) {
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(ni);
+ if (err == -ENOENT)
+ ntfs_error(vi->i_sb, "Inode 0x%lx does not have a "
+ "file name attribute. Run chkdsk.",
+ vi->i_ino);
+ return ERR_PTR(err);
+ }
+ attr = ctx->attr;
+ if (unlikely(attr->non_resident))
+ goto try_next;
+ fn = (FILE_NAME_ATTR *)((u8 *)attr +
+ le16_to_cpu(attr->data.resident.value_offset));
+ if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) >
+ (u8*)attr + le32_to_cpu(attr->length)))
+ goto try_next;
+ /* Get the inode number of the parent directory. */
+ parent_ino = MREF_LE(fn->parent_directory);
+ /* Release the search context and the mft record of the child. */
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(ni);
+
+ return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino));
+}
+
+static struct inode *ntfs_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+ struct inode *inode;
+
+ inode = ntfs_iget(sb, ino);
+ if (!IS_ERR(inode)) {
+ if (is_bad_inode(inode) || inode->i_generation != generation) {
+ iput(inode);
+ inode = ERR_PTR(-ESTALE);
+ }
+ }
+
+ return inode;
+}
+
+static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ ntfs_nfs_get_inode);
+}
+
+static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ ntfs_nfs_get_inode);
+}
+
+/**
+ * Export operations allowing NFS exporting of mounted NTFS partitions.
+ *
+ * We use the default ->encode_fh() for now. Note that they
+ * use 32 bits to store the inode number which is an unsigned long so on 64-bit
+ * architectures is usually 64 bits so it would all fail horribly on huge
+ * volumes. I guess we need to define our own encode and decode fh functions
+ * that store 64-bit inode numbers at some point but for now we will ignore the
+ * problem...
+ *
+ * We also use the default ->get_name() helper (used by ->decode_fh() via
+ * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs
+ * independent.
+ *
+ * The default ->get_parent() just returns -EACCES so we have to provide our
+ * own and the default ->get_dentry() is incompatible with NTFS due to not
+ * allowing the inode number 0 which is used in NTFS for the system file $MFT
+ * and due to using iget() whereas NTFS needs ntfs_iget().
+ */
+const struct export_operations ntfs_export_ops = {
+ .get_parent = ntfs_get_parent, /* Find the parent of a given
+ directory. */
+ .fh_to_dentry = ntfs_fh_to_dentry,
+ .fh_to_parent = ntfs_fh_to_parent,
+};
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
new file mode 100644
index 000000000..e81376ea9
--- /dev/null
+++ b/fs/ntfs/ntfs.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * ntfs.h - Defines for NTFS Linux kernel driver.
+ *
+ * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (C) 2002 Richard Russon
+ */
+
+#ifndef _LINUX_NTFS_H
+#define _LINUX_NTFS_H
+
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/fs.h>
+#include <linux/nls.h>
+#include <linux/smp.h>
+#include <linux/pagemap.h>
+
+#include "types.h"
+#include "volume.h"
+#include "layout.h"
+
+typedef enum {
+ NTFS_BLOCK_SIZE = 512,
+ NTFS_BLOCK_SIZE_BITS = 9,
+ NTFS_SB_MAGIC = 0x5346544e, /* 'NTFS' */
+ NTFS_MAX_NAME_LEN = 255,
+ NTFS_MAX_ATTR_NAME_LEN = 255,
+ NTFS_MAX_CLUSTER_SIZE = 64 * 1024, /* 64kiB */
+ NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE,
+} NTFS_CONSTANTS;
+
+/* Global variables. */
+
+/* Slab caches (from super.c). */
+extern struct kmem_cache *ntfs_name_cache;
+extern struct kmem_cache *ntfs_inode_cache;
+extern struct kmem_cache *ntfs_big_inode_cache;
+extern struct kmem_cache *ntfs_attr_ctx_cache;
+extern struct kmem_cache *ntfs_index_ctx_cache;
+
+/* The various operations structs defined throughout the driver files. */
+extern const struct address_space_operations ntfs_normal_aops;
+extern const struct address_space_operations ntfs_compressed_aops;
+extern const struct address_space_operations ntfs_mst_aops;
+
+extern const struct file_operations ntfs_file_ops;
+extern const struct inode_operations ntfs_file_inode_ops;
+
+extern const struct file_operations ntfs_dir_ops;
+extern const struct inode_operations ntfs_dir_inode_ops;
+
+extern const struct file_operations ntfs_empty_file_ops;
+extern const struct inode_operations ntfs_empty_inode_ops;
+
+extern const struct export_operations ntfs_export_ops;
+
+/**
+ * NTFS_SB - return the ntfs volume given a vfs super block
+ * @sb: VFS super block
+ *
+ * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb.
+ */
+static inline ntfs_volume *NTFS_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+/* Declarations of functions and global variables. */
+
+/* From fs/ntfs/compress.c */
+extern int ntfs_read_compressed_block(struct page *page);
+extern int allocate_compression_buffers(void);
+extern void free_compression_buffers(void);
+
+/* From fs/ntfs/super.c */
+#define default_upcase_len 0x10000
+extern struct mutex ntfs_lock;
+
+typedef struct {
+ int val;
+ char *str;
+} option_t;
+extern const option_t on_errors_arr[];
+
+/* From fs/ntfs/mst.c */
+extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size);
+extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
+extern void post_write_mst_fixup(NTFS_RECORD *b);
+
+/* From fs/ntfs/unistr.c */
+extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
+ const ntfschar *s2, size_t s2_len,
+ const IGNORE_CASE_BOOL ic,
+ const ntfschar *upcase, const u32 upcase_size);
+extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
+ const ntfschar *name2, const u32 name2_len,
+ const int err_val, const IGNORE_CASE_BOOL ic,
+ const ntfschar *upcase, const u32 upcase_len);
+extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n);
+extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
+ const ntfschar *upcase, const u32 upcase_size);
+extern void ntfs_upcase_name(ntfschar *name, u32 name_len,
+ const ntfschar *upcase, const u32 upcase_len);
+extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
+ const ntfschar *upcase, const u32 upcase_len);
+extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
+ FILE_NAME_ATTR *file_name_attr2,
+ const int err_val, const IGNORE_CASE_BOOL ic,
+ const ntfschar *upcase, const u32 upcase_len);
+extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
+ const int ins_len, ntfschar **outs);
+extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
+ const int ins_len, unsigned char **outs, int outs_len);
+
+/* From fs/ntfs/upcase.c */
+extern ntfschar *generate_default_upcase(void);
+
+static inline int ntfs_ffs(int x)
+{
+ int r = 1;
+
+ if (!x)
+ return 0;
+ if (!(x & 0xffff)) {
+ x >>= 16;
+ r += 16;
+ }
+ if (!(x & 0xff)) {
+ x >>= 8;
+ r += 8;
+ }
+ if (!(x & 0xf)) {
+ x >>= 4;
+ r += 4;
+ }
+ if (!(x & 3)) {
+ x >>= 2;
+ r += 2;
+ }
+ if (!(x & 1)) {
+ x >>= 1;
+ r += 1;
+ }
+ return r;
+}
+
+#endif /* _LINUX_NTFS_H */
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
new file mode 100644
index 000000000..916048022
--- /dev/null
+++ b/fs/ntfs/quota.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * quota.c - NTFS kernel quota ($Quota) handling. Part of the Linux-NTFS
+ * project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ */
+
+#ifdef NTFS_RW
+
+#include "index.h"
+#include "quota.h"
+#include "debug.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
+ * @vol: ntfs volume on which to mark the quotas out of date
+ *
+ * Mark the quotas out of date on the ntfs volume @vol and return 'true' on
+ * success and 'false' on error.
+ */
+bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
+{
+ ntfs_index_context *ictx;
+ QUOTA_CONTROL_ENTRY *qce;
+ const le32 qid = QUOTA_DEFAULTS_ID;
+ int err;
+
+ ntfs_debug("Entering.");
+ if (NVolQuotaOutOfDate(vol))
+ goto done;
+ if (!vol->quota_ino || !vol->quota_q_ino) {
+ ntfs_error(vol->sb, "Quota inodes are not open.");
+ return false;
+ }
+ inode_lock(vol->quota_q_ino);
+ ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
+ if (!ictx) {
+ ntfs_error(vol->sb, "Failed to get index context.");
+ goto err_out;
+ }
+ err = ntfs_index_lookup(&qid, sizeof(qid), ictx);
+ if (err) {
+ if (err == -ENOENT)
+ ntfs_error(vol->sb, "Quota defaults entry is not "
+ "present.");
+ else
+ ntfs_error(vol->sb, "Lookup of quota defaults entry "
+ "failed.");
+ goto err_out;
+ }
+ if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) {
+ ntfs_error(vol->sb, "Quota defaults entry size is invalid. "
+ "Run chkdsk.");
+ goto err_out;
+ }
+ qce = (QUOTA_CONTROL_ENTRY*)ictx->data;
+ if (le32_to_cpu(qce->version) != QUOTA_VERSION) {
+ ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not "
+ "supported.", le32_to_cpu(qce->version));
+ goto err_out;
+ }
+ ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags));
+ /* If quotas are already marked out of date, no need to do anything. */
+ if (qce->flags & QUOTA_FLAG_OUT_OF_DATE)
+ goto set_done;
+ /*
+ * If quota tracking is neither requested, nor enabled and there are no
+ * pending deletes, no need to mark the quotas out of date.
+ */
+ if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED |
+ QUOTA_FLAG_TRACKING_REQUESTED |
+ QUOTA_FLAG_PENDING_DELETES)))
+ goto set_done;
+ /*
+ * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date.
+ * This is verified on WinXP to be sufficient to cause windows to
+ * rescan the volume on boot and update all quota entries.
+ */
+ qce->flags |= QUOTA_FLAG_OUT_OF_DATE;
+ /* Ensure the modified flags are written to disk. */
+ ntfs_index_entry_flush_dcache_page(ictx);
+ ntfs_index_entry_mark_dirty(ictx);
+set_done:
+ ntfs_index_ctx_put(ictx);
+ inode_unlock(vol->quota_q_ino);
+ /*
+ * We set the flag so we do not try to mark the quotas out of date
+ * again on remount.
+ */
+ NVolSetQuotaOutOfDate(vol);
+done:
+ ntfs_debug("Done.");
+ return true;
+err_out:
+ if (ictx)
+ ntfs_index_ctx_put(ictx);
+ inode_unlock(vol->quota_q_ino);
+ return false;
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h
new file mode 100644
index 000000000..fe3132a3d
--- /dev/null
+++ b/fs/ntfs/quota.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * quota.h - Defines for NTFS kernel quota ($Quota) handling. Part of the
+ * Linux-NTFS project.
+ *
+ * Copyright (c) 2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_QUOTA_H
+#define _LINUX_NTFS_QUOTA_H
+
+#ifdef NTFS_RW
+
+#include "types.h"
+#include "volume.h"
+
+extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_QUOTA_H */
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
new file mode 100644
index 000000000..97932fb51
--- /dev/null
+++ b/fs/ntfs/runlist.c
@@ -0,0 +1,1893 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/**
+ * runlist.c - NTFS runlist handling code. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2007 Anton Altaparmakov
+ * Copyright (c) 2002-2005 Richard Russon
+ */
+
+#include "debug.h"
+#include "dir.h"
+#include "endian.h"
+#include "malloc.h"
+#include "ntfs.h"
+
+/**
+ * ntfs_rl_mm - runlist memmove
+ *
+ * It is up to the caller to serialize access to the runlist @base.
+ */
+static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
+ int size)
+{
+ if (likely((dst != src) && (size > 0)))
+ memmove(base + dst, base + src, size * sizeof(*base));
+}
+
+/**
+ * ntfs_rl_mc - runlist memory copy
+ *
+ * It is up to the caller to serialize access to the runlists @dstbase and
+ * @srcbase.
+ */
+static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
+ runlist_element *srcbase, int src, int size)
+{
+ if (likely(size > 0))
+ memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase));
+}
+
+/**
+ * ntfs_rl_realloc - Reallocate memory for runlists
+ * @rl: original runlist
+ * @old_size: number of runlist elements in the original runlist @rl
+ * @new_size: number of runlist elements we need space for
+ *
+ * As the runlists grow, more memory will be required. To prevent the
+ * kernel having to allocate and reallocate large numbers of small bits of
+ * memory, this function returns an entire page of memory.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * N.B. If the new allocation doesn't require a different number of pages in
+ * memory, the function will return the original pointer.
+ *
+ * On success, return a pointer to the newly allocated, or recycled, memory.
+ * On error, return -errno. The following error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
+ int old_size, int new_size)
+{
+ runlist_element *new_rl;
+
+ old_size = PAGE_ALIGN(old_size * sizeof(*rl));
+ new_size = PAGE_ALIGN(new_size * sizeof(*rl));
+ if (old_size == new_size)
+ return rl;
+
+ new_rl = ntfs_malloc_nofs(new_size);
+ if (unlikely(!new_rl))
+ return ERR_PTR(-ENOMEM);
+
+ if (likely(rl != NULL)) {
+ if (unlikely(old_size > new_size))
+ old_size = new_size;
+ memcpy(new_rl, rl, old_size);
+ ntfs_free(rl);
+ }
+ return new_rl;
+}
+
+/**
+ * ntfs_rl_realloc_nofail - Reallocate memory for runlists
+ * @rl: original runlist
+ * @old_size: number of runlist elements in the original runlist @rl
+ * @new_size: number of runlist elements we need space for
+ *
+ * As the runlists grow, more memory will be required. To prevent the
+ * kernel having to allocate and reallocate large numbers of small bits of
+ * memory, this function returns an entire page of memory.
+ *
+ * This function guarantees that the allocation will succeed. It will sleep
+ * for as long as it takes to complete the allocation.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * N.B. If the new allocation doesn't require a different number of pages in
+ * memory, the function will return the original pointer.
+ *
+ * On success, return a pointer to the newly allocated, or recycled, memory.
+ * On error, return -errno. The following error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
+ int old_size, int new_size)
+{
+ runlist_element *new_rl;
+
+ old_size = PAGE_ALIGN(old_size * sizeof(*rl));
+ new_size = PAGE_ALIGN(new_size * sizeof(*rl));
+ if (old_size == new_size)
+ return rl;
+
+ new_rl = ntfs_malloc_nofs_nofail(new_size);
+ BUG_ON(!new_rl);
+
+ if (likely(rl != NULL)) {
+ if (unlikely(old_size > new_size))
+ old_size = new_size;
+ memcpy(new_rl, rl, old_size);
+ ntfs_free(rl);
+ }
+ return new_rl;
+}
+
+/**
+ * ntfs_are_rl_mergeable - test if two runlists can be joined together
+ * @dst: original runlist
+ * @src: new runlist to test for mergeability with @dst
+ *
+ * Test if two runlists can be joined together. For this, their VCNs and LCNs
+ * must be adjacent.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * Return: true Success, the runlists can be merged.
+ * false Failure, the runlists cannot be merged.
+ */
+static inline bool ntfs_are_rl_mergeable(runlist_element *dst,
+ runlist_element *src)
+{
+ BUG_ON(!dst);
+ BUG_ON(!src);
+
+ /* We can merge unmapped regions even if they are misaligned. */
+ if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED))
+ return true;
+ /* If the runs are misaligned, we cannot merge them. */
+ if ((dst->vcn + dst->length) != src->vcn)
+ return false;
+ /* If both runs are non-sparse and contiguous, we can merge them. */
+ if ((dst->lcn >= 0) && (src->lcn >= 0) &&
+ ((dst->lcn + dst->length) == src->lcn))
+ return true;
+ /* If we are merging two holes, we can merge them. */
+ if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE))
+ return true;
+ /* Cannot merge. */
+ return false;
+}
+
+/**
+ * __ntfs_rl_merge - merge two runlists without testing if they can be merged
+ * @dst: original, destination runlist
+ * @src: new runlist to merge with @dst
+ *
+ * Merge the two runlists, writing into the destination runlist @dst. The
+ * caller must make sure the runlists can be merged or this will corrupt the
+ * destination runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ */
+static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
+{
+ dst->length += src->length;
+}
+
+/**
+ * ntfs_rl_append - append a runlist after a given element
+ * @dst: original runlist to be worked on
+ * @dsize: number of elements in @dst (including end marker)
+ * @src: runlist to be inserted into @dst
+ * @ssize: number of elements in @src (excluding end marker)
+ * @loc: append the new runlist @src after this element in @dst
+ *
+ * Append the runlist @src after element @loc in @dst. Merge the right end of
+ * the new runlist, if necessary. Adjust the size of the hole before the
+ * appended runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_append(runlist_element *dst,
+ int dsize, runlist_element *src, int ssize, int loc)
+{
+ bool right = false; /* Right end of @src needs merging. */
+ int marker; /* End of the inserted runs. */
+
+ BUG_ON(!dst);
+ BUG_ON(!src);
+
+ /* First, check if the right hand end needs merging. */
+ if ((loc + 1) < dsize)
+ right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+
+ /* Space required: @dst size + @src size, less one if we merged. */
+ dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
+ if (IS_ERR(dst))
+ return dst;
+ /*
+ * We are guaranteed to succeed from here so can start modifying the
+ * original runlists.
+ */
+
+ /* First, merge the right hand end, if necessary. */
+ if (right)
+ __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
+
+ /* First run after the @src runs that have been inserted. */
+ marker = loc + ssize + 1;
+
+ /* Move the tail of @dst out of the way, then copy in @src. */
+ ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right));
+ ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
+
+ /* Adjust the size of the preceding hole. */
+ dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
+
+ /* We may have changed the length of the file, so fix the end marker */
+ if (dst[marker].lcn == LCN_ENOENT)
+ dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
+
+ return dst;
+}
+
+/**
+ * ntfs_rl_insert - insert a runlist into another
+ * @dst: original runlist to be worked on
+ * @dsize: number of elements in @dst (including end marker)
+ * @src: new runlist to be inserted
+ * @ssize: number of elements in @src (excluding end marker)
+ * @loc: insert the new runlist @src before this element in @dst
+ *
+ * Insert the runlist @src before element @loc in the runlist @dst. Merge the
+ * left end of the new runlist, if necessary. Adjust the size of the hole
+ * after the inserted runlist.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
+ int dsize, runlist_element *src, int ssize, int loc)
+{
+ bool left = false; /* Left end of @src needs merging. */
+ bool disc = false; /* Discontinuity between @dst and @src. */
+ int marker; /* End of the inserted runs. */
+
+ BUG_ON(!dst);
+ BUG_ON(!src);
+
+ /*
+ * disc => Discontinuity between the end of @dst and the start of @src.
+ * This means we might need to insert a "not mapped" run.
+ */
+ if (loc == 0)
+ disc = (src[0].vcn > 0);
+ else {
+ s64 merged_length;
+
+ left = ntfs_are_rl_mergeable(dst + loc - 1, src);
+
+ merged_length = dst[loc - 1].length;
+ if (left)
+ merged_length += src->length;
+
+ disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
+ }
+ /*
+ * Space required: @dst size + @src size, less one if we merged, plus
+ * one if there was a discontinuity.
+ */
+ dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc);
+ if (IS_ERR(dst))
+ return dst;
+ /*
+ * We are guaranteed to succeed from here so can start modifying the
+ * original runlist.
+ */
+ if (left)
+ __ntfs_rl_merge(dst + loc - 1, src);
+ /*
+ * First run after the @src runs that have been inserted.
+ * Nominally, @marker equals @loc + @ssize, i.e. location + number of
+ * runs in @src. However, if @left, then the first run in @src has
+ * been merged with one in @dst. And if @disc, then @dst and @src do
+ * not meet and we need an extra run to fill the gap.
+ */
+ marker = loc + ssize - left + disc;
+
+ /* Move the tail of @dst out of the way, then copy in @src. */
+ ntfs_rl_mm(dst, marker, loc, dsize - loc);
+ ntfs_rl_mc(dst, loc + disc, src, left, ssize - left);
+
+ /* Adjust the VCN of the first run after the insertion... */
+ dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
+ /* ... and the length. */
+ if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED)
+ dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn;
+
+ /* Writing beyond the end of the file and there is a discontinuity. */
+ if (disc) {
+ if (loc > 0) {
+ dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length;
+ dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
+ } else {
+ dst[loc].vcn = 0;
+ dst[loc].length = dst[loc + 1].vcn;
+ }
+ dst[loc].lcn = LCN_RL_NOT_MAPPED;
+ }
+ return dst;
+}
+
+/**
+ * ntfs_rl_replace - overwrite a runlist element with another runlist
+ * @dst: original runlist to be worked on
+ * @dsize: number of elements in @dst (including end marker)
+ * @src: new runlist to be inserted
+ * @ssize: number of elements in @src (excluding end marker)
+ * @loc: index in runlist @dst to overwrite with @src
+ *
+ * Replace the runlist element @dst at @loc with @src. Merge the left and
+ * right ends of the inserted runlist, if necessary.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
+ int dsize, runlist_element *src, int ssize, int loc)
+{
+ signed delta;
+ bool left = false; /* Left end of @src needs merging. */
+ bool right = false; /* Right end of @src needs merging. */
+ int tail; /* Start of tail of @dst. */
+ int marker; /* End of the inserted runs. */
+
+ BUG_ON(!dst);
+ BUG_ON(!src);
+
+ /* First, see if the left and right ends need merging. */
+ if ((loc + 1) < dsize)
+ right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
+ if (loc > 0)
+ left = ntfs_are_rl_mergeable(dst + loc - 1, src);
+ /*
+ * Allocate some space. We will need less if the left, right, or both
+ * ends get merged. The -1 accounts for the run being replaced.
+ */
+ delta = ssize - 1 - left - right;
+ if (delta > 0) {
+ dst = ntfs_rl_realloc(dst, dsize, dsize + delta);
+ if (IS_ERR(dst))
+ return dst;
+ }
+ /*
+ * We are guaranteed to succeed from here so can start modifying the
+ * original runlists.
+ */
+
+ /* First, merge the left and right ends, if necessary. */
+ if (right)
+ __ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
+ if (left)
+ __ntfs_rl_merge(dst + loc - 1, src);
+ /*
+ * Offset of the tail of @dst. This needs to be moved out of the way
+ * to make space for the runs to be copied from @src, i.e. the first
+ * run of the tail of @dst.
+ * Nominally, @tail equals @loc + 1, i.e. location, skipping the
+ * replaced run. However, if @right, then one of @dst's runs is
+ * already merged into @src.
+ */
+ tail = loc + right + 1;
+ /*
+ * First run after the @src runs that have been inserted, i.e. where
+ * the tail of @dst needs to be moved to.
+ * Nominally, @marker equals @loc + @ssize, i.e. location + number of
+ * runs in @src. However, if @left, then the first run in @src has
+ * been merged with one in @dst.
+ */
+ marker = loc + ssize - left;
+
+ /* Move the tail of @dst out of the way, then copy in @src. */
+ ntfs_rl_mm(dst, marker, tail, dsize - tail);
+ ntfs_rl_mc(dst, loc, src, left, ssize - left);
+
+ /* We may have changed the length of the file, so fix the end marker. */
+ if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT)
+ dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
+ return dst;
+}
+
+/**
+ * ntfs_rl_split - insert a runlist into the centre of a hole
+ * @dst: original runlist to be worked on
+ * @dsize: number of elements in @dst (including end marker)
+ * @src: new runlist to be inserted
+ * @ssize: number of elements in @src (excluding end marker)
+ * @loc: index in runlist @dst at which to split and insert @src
+ *
+ * Split the runlist @dst at @loc into two and insert @new in between the two
+ * fragments. No merging of runlists is necessary. Adjust the size of the
+ * holes either side.
+ *
+ * It is up to the caller to serialize access to the runlists @dst and @src.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @dst and @src are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ */
+static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
+ runlist_element *src, int ssize, int loc)
+{
+ BUG_ON(!dst);
+ BUG_ON(!src);
+
+ /* Space required: @dst size + @src size + one new hole. */
+ dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
+ if (IS_ERR(dst))
+ return dst;
+ /*
+ * We are guaranteed to succeed from here so can start modifying the
+ * original runlists.
+ */
+
+ /* Move the tail of @dst out of the way, then copy in @src. */
+ ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
+ ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
+
+ /* Adjust the size of the holes either size of @src. */
+ dst[loc].length = dst[loc+1].vcn - dst[loc].vcn;
+ dst[loc+ssize+1].vcn = dst[loc+ssize].vcn + dst[loc+ssize].length;
+ dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
+
+ return dst;
+}
+
+/**
+ * ntfs_runlists_merge - merge two runlists into one
+ * @drl: original runlist to be worked on
+ * @srl: new runlist to be merged into @drl
+ *
+ * First we sanity check the two runlists @srl and @drl to make sure that they
+ * are sensible and can be merged. The runlist @srl must be either after the
+ * runlist @drl or completely within a hole (or unmapped region) in @drl.
+ *
+ * It is up to the caller to serialize access to the runlists @drl and @srl.
+ *
+ * Merging of runlists is necessary in two cases:
+ * 1. When attribute lists are used and a further extent is being mapped.
+ * 2. When new clusters are allocated to fill a hole or extend a file.
+ *
+ * There are four possible ways @srl can be merged. It can:
+ * - be inserted at the beginning of a hole,
+ * - split the hole in two and be inserted between the two fragments,
+ * - be appended at the end of a hole, or it can
+ * - replace the whole hole.
+ * It can also be appended to the end of the runlist, which is just a variant
+ * of the insert case.
+ *
+ * On success, return a pointer to the new, combined, runlist. Note, both
+ * runlists @drl and @srl are deallocated before returning so you cannot use
+ * the pointers for anything any more. (Strictly speaking the returned runlist
+ * may be the same as @dst but this is irrelevant.)
+ *
+ * On error, return -errno. Both runlists are left unmodified. The following
+ * error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EINVAL - Invalid parameters were passed in.
+ * -ERANGE - The runlists overlap and cannot be merged.
+ */
+runlist_element *ntfs_runlists_merge(runlist_element *drl,
+ runlist_element *srl)
+{
+ int di, si; /* Current index into @[ds]rl. */
+ int sstart; /* First index with lcn > LCN_RL_NOT_MAPPED. */
+ int dins; /* Index into @drl at which to insert @srl. */
+ int dend, send; /* Last index into @[ds]rl. */
+ int dfinal, sfinal; /* The last index into @[ds]rl with
+ lcn >= LCN_HOLE. */
+ int marker = 0;
+ VCN marker_vcn = 0;
+
+#ifdef DEBUG
+ ntfs_debug("dst:");
+ ntfs_debug_dump_runlist(drl);
+ ntfs_debug("src:");
+ ntfs_debug_dump_runlist(srl);
+#endif
+
+ /* Check for silly calling... */
+ if (unlikely(!srl))
+ return drl;
+ if (IS_ERR(srl) || IS_ERR(drl))
+ return ERR_PTR(-EINVAL);
+
+ /* Check for the case where the first mapping is being done now. */
+ if (unlikely(!drl)) {
+ drl = srl;
+ /* Complete the source runlist if necessary. */
+ if (unlikely(drl[0].vcn)) {
+ /* Scan to the end of the source runlist. */
+ for (dend = 0; likely(drl[dend].length); dend++)
+ ;
+ dend++;
+ drl = ntfs_rl_realloc(drl, dend, dend + 1);
+ if (IS_ERR(drl))
+ return drl;
+ /* Insert start element at the front of the runlist. */
+ ntfs_rl_mm(drl, 1, 0, dend);
+ drl[0].vcn = 0;
+ drl[0].lcn = LCN_RL_NOT_MAPPED;
+ drl[0].length = drl[1].vcn;
+ }
+ goto finished;
+ }
+
+ si = di = 0;
+
+ /* Skip any unmapped start element(s) in the source runlist. */
+ while (srl[si].length && srl[si].lcn < LCN_HOLE)
+ si++;
+
+ /* Can't have an entirely unmapped source runlist. */
+ BUG_ON(!srl[si].length);
+
+ /* Record the starting points. */
+ sstart = si;
+
+ /*
+ * Skip forward in @drl until we reach the position where @srl needs to
+ * be inserted. If we reach the end of @drl, @srl just needs to be
+ * appended to @drl.
+ */
+ for (; drl[di].length; di++) {
+ if (drl[di].vcn + drl[di].length > srl[sstart].vcn)
+ break;
+ }
+ dins = di;
+
+ /* Sanity check for illegal overlaps. */
+ if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) &&
+ (srl[si].lcn >= 0)) {
+ ntfs_error(NULL, "Run lists overlap. Cannot merge!");
+ return ERR_PTR(-ERANGE);
+ }
+
+ /* Scan to the end of both runlists in order to know their sizes. */
+ for (send = si; srl[send].length; send++)
+ ;
+ for (dend = di; drl[dend].length; dend++)
+ ;
+
+ if (srl[send].lcn == LCN_ENOENT)
+ marker_vcn = srl[marker = send].vcn;
+
+ /* Scan to the last element with lcn >= LCN_HOLE. */
+ for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--)
+ ;
+ for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--)
+ ;
+
+ {
+ bool start;
+ bool finish;
+ int ds = dend + 1; /* Number of elements in drl & srl */
+ int ss = sfinal - sstart + 1;
+
+ start = ((drl[dins].lcn < LCN_RL_NOT_MAPPED) || /* End of file */
+ (drl[dins].vcn == srl[sstart].vcn)); /* Start of hole */
+ finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) && /* End of file */
+ ((drl[dins].vcn + drl[dins].length) <= /* End of hole */
+ (srl[send - 1].vcn + srl[send - 1].length)));
+
+ /* Or we will lose an end marker. */
+ if (finish && !drl[dins].length)
+ ss++;
+ if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
+ finish = false;
+#if 0
+ ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
+ ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
+ ntfs_debug("start = %i, finish = %i", start, finish);
+ ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins);
+#endif
+ if (start) {
+ if (finish)
+ drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins);
+ else
+ drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins);
+ } else {
+ if (finish)
+ drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins);
+ else
+ drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins);
+ }
+ if (IS_ERR(drl)) {
+ ntfs_error(NULL, "Merge failed.");
+ return drl;
+ }
+ ntfs_free(srl);
+ if (marker) {
+ ntfs_debug("Triggering marker code.");
+ for (ds = dend; drl[ds].length; ds++)
+ ;
+ /* We only need to care if @srl ended after @drl. */
+ if (drl[ds].vcn <= marker_vcn) {
+ int slots = 0;
+
+ if (drl[ds].vcn == marker_vcn) {
+ ntfs_debug("Old marker = 0x%llx, replacing "
+ "with LCN_ENOENT.",
+ (unsigned long long)
+ drl[ds].lcn);
+ drl[ds].lcn = LCN_ENOENT;
+ goto finished;
+ }
+ /*
+ * We need to create an unmapped runlist element in
+ * @drl or extend an existing one before adding the
+ * ENOENT terminator.
+ */
+ if (drl[ds].lcn == LCN_ENOENT) {
+ ds--;
+ slots = 1;
+ }
+ if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
+ /* Add an unmapped runlist element. */
+ if (!slots) {
+ drl = ntfs_rl_realloc_nofail(drl, ds,
+ ds + 2);
+ slots = 2;
+ }
+ ds++;
+ /* Need to set vcn if it isn't set already. */
+ if (slots != 1)
+ drl[ds].vcn = drl[ds - 1].vcn +
+ drl[ds - 1].length;
+ drl[ds].lcn = LCN_RL_NOT_MAPPED;
+ /* We now used up a slot. */
+ slots--;
+ }
+ drl[ds].length = marker_vcn - drl[ds].vcn;
+ /* Finally add the ENOENT terminator. */
+ ds++;
+ if (!slots)
+ drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1);
+ drl[ds].vcn = marker_vcn;
+ drl[ds].lcn = LCN_ENOENT;
+ drl[ds].length = (s64)0;
+ }
+ }
+ }
+
+finished:
+ /* The merge was completed successfully. */
+ ntfs_debug("Merged runlist:");
+ ntfs_debug_dump_runlist(drl);
+ return drl;
+}
+
+/**
+ * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist
+ * @vol: ntfs volume on which the attribute resides
+ * @attr: attribute record whose mapping pairs array to decompress
+ * @old_rl: optional runlist in which to insert @attr's runlist
+ *
+ * It is up to the caller to serialize access to the runlist @old_rl.
+ *
+ * Decompress the attribute @attr's mapping pairs array into a runlist. On
+ * success, return the decompressed runlist.
+ *
+ * If @old_rl is not NULL, decompressed runlist is inserted into the
+ * appropriate place in @old_rl and the resultant, combined runlist is
+ * returned. The original @old_rl is deallocated.
+ *
+ * On error, return -errno. @old_rl is left unmodified in that case.
+ *
+ * The following error codes are defined:
+ * -ENOMEM - Not enough memory to allocate runlist array.
+ * -EIO - Corrupt runlist.
+ * -EINVAL - Invalid parameters were passed in.
+ * -ERANGE - The two runlists overlap.
+ *
+ * FIXME: For now we take the conceptionally simplest approach of creating the
+ * new runlist disregarding the already existing one and then splicing the
+ * two into one, if that is possible (we check for overlap and discard the new
+ * runlist if overlap present before returning ERR_PTR(-ERANGE)).
+ */
+runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
+ const ATTR_RECORD *attr, runlist_element *old_rl)
+{
+ VCN vcn; /* Current vcn. */
+ LCN lcn; /* Current lcn. */
+ s64 deltaxcn; /* Change in [vl]cn. */
+ runlist_element *rl; /* The output runlist. */
+ u8 *buf; /* Current position in mapping pairs array. */
+ u8 *attr_end; /* End of attribute. */
+ int rlsize; /* Size of runlist buffer. */
+ u16 rlpos; /* Current runlist position in units of
+ runlist_elements. */
+ u8 b; /* Current byte offset in buf. */
+
+#ifdef DEBUG
+ /* Make sure attr exists and is non-resident. */
+ if (!attr || !attr->non_resident || sle64_to_cpu(
+ attr->data.non_resident.lowest_vcn) < (VCN)0) {
+ ntfs_error(vol->sb, "Invalid arguments.");
+ return ERR_PTR(-EINVAL);
+ }
+#endif
+ /* Start at vcn = lowest_vcn and lcn 0. */
+ vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
+ lcn = 0;
+ /* Get start of the mapping pairs array. */
+ buf = (u8*)attr + le16_to_cpu(
+ attr->data.non_resident.mapping_pairs_offset);
+ attr_end = (u8*)attr + le32_to_cpu(attr->length);
+ if (unlikely(buf < (u8*)attr || buf > attr_end)) {
+ ntfs_error(vol->sb, "Corrupt attribute.");
+ return ERR_PTR(-EIO);
+ }
+ /* If the mapping pairs array is valid but empty, nothing to do. */
+ if (!vcn && !*buf)
+ return old_rl;
+ /* Current position in runlist array. */
+ rlpos = 0;
+ /* Allocate first page and set current runlist size to one page. */
+ rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
+ if (unlikely(!rl))
+ return ERR_PTR(-ENOMEM);
+ /* Insert unmapped starting element if necessary. */
+ if (vcn) {
+ rl->vcn = 0;
+ rl->lcn = LCN_RL_NOT_MAPPED;
+ rl->length = vcn;
+ rlpos++;
+ }
+ while (buf < attr_end && *buf) {
+ /*
+ * Allocate more memory if needed, including space for the
+ * not-mapped and terminator elements. ntfs_malloc_nofs()
+ * operates on whole pages only.
+ */
+ if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
+ runlist_element *rl2;
+
+ rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
+ if (unlikely(!rl2)) {
+ ntfs_free(rl);
+ return ERR_PTR(-ENOMEM);
+ }
+ memcpy(rl2, rl, rlsize);
+ ntfs_free(rl);
+ rl = rl2;
+ rlsize += PAGE_SIZE;
+ }
+ /* Enter the current vcn into the current runlist element. */
+ rl[rlpos].vcn = vcn;
+ /*
+ * Get the change in vcn, i.e. the run length in clusters.
+ * Doing it this way ensures that we signextend negative values.
+ * A negative run length doesn't make any sense, but hey, I
+ * didn't make up the NTFS specs and Windows NT4 treats the run
+ * length as a signed value so that's how it is...
+ */
+ b = *buf & 0xf;
+ if (b) {
+ if (unlikely(buf + b > attr_end))
+ goto io_error;
+ for (deltaxcn = (s8)buf[b--]; b; b--)
+ deltaxcn = (deltaxcn << 8) + buf[b];
+ } else { /* The length entry is compulsory. */
+ ntfs_error(vol->sb, "Missing length entry in mapping "
+ "pairs array.");
+ deltaxcn = (s64)-1;
+ }
+ /*
+ * Assume a negative length to indicate data corruption and
+ * hence clean-up and return NULL.
+ */
+ if (unlikely(deltaxcn < 0)) {
+ ntfs_error(vol->sb, "Invalid length in mapping pairs "
+ "array.");
+ goto err_out;
+ }
+ /*
+ * Enter the current run length into the current runlist
+ * element.
+ */
+ rl[rlpos].length = deltaxcn;
+ /* Increment the current vcn by the current run length. */
+ vcn += deltaxcn;
+ /*
+ * There might be no lcn change at all, as is the case for
+ * sparse clusters on NTFS 3.0+, in which case we set the lcn
+ * to LCN_HOLE.
+ */
+ if (!(*buf & 0xf0))
+ rl[rlpos].lcn = LCN_HOLE;
+ else {
+ /* Get the lcn change which really can be negative. */
+ u8 b2 = *buf & 0xf;
+ b = b2 + ((*buf >> 4) & 0xf);
+ if (buf + b > attr_end)
+ goto io_error;
+ for (deltaxcn = (s8)buf[b--]; b > b2; b--)
+ deltaxcn = (deltaxcn << 8) + buf[b];
+ /* Change the current lcn to its new value. */
+ lcn += deltaxcn;
+#ifdef DEBUG
+ /*
+ * On NTFS 1.2-, apparently can have lcn == -1 to
+ * indicate a hole. But we haven't verified ourselves
+ * whether it is really the lcn or the deltaxcn that is
+ * -1. So if either is found give us a message so we
+ * can investigate it further!
+ */
+ if (vol->major_ver < 3) {
+ if (unlikely(deltaxcn == (LCN)-1))
+ ntfs_error(vol->sb, "lcn delta == -1");
+ if (unlikely(lcn == (LCN)-1))
+ ntfs_error(vol->sb, "lcn == -1");
+ }
+#endif
+ /* Check lcn is not below -1. */
+ if (unlikely(lcn < (LCN)-1)) {
+ ntfs_error(vol->sb, "Invalid LCN < -1 in "
+ "mapping pairs array.");
+ goto err_out;
+ }
+ /* Enter the current lcn into the runlist element. */
+ rl[rlpos].lcn = lcn;
+ }
+ /* Get to the next runlist element. */
+ rlpos++;
+ /* Increment the buffer position to the next mapping pair. */
+ buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1;
+ }
+ if (unlikely(buf >= attr_end))
+ goto io_error;
+ /*
+ * If there is a highest_vcn specified, it must be equal to the final
+ * vcn in the runlist - 1, or something has gone badly wrong.
+ */
+ deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
+ if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) {
+mpa_err:
+ ntfs_error(vol->sb, "Corrupt mapping pairs array in "
+ "non-resident attribute.");
+ goto err_out;
+ }
+ /* Setup not mapped runlist element if this is the base extent. */
+ if (!attr->data.non_resident.lowest_vcn) {
+ VCN max_cluster;
+
+ max_cluster = ((sle64_to_cpu(
+ attr->data.non_resident.allocated_size) +
+ vol->cluster_size - 1) >>
+ vol->cluster_size_bits) - 1;
+ /*
+ * A highest_vcn of zero means this is a single extent
+ * attribute so simply terminate the runlist with LCN_ENOENT).
+ */
+ if (deltaxcn) {
+ /*
+ * If there is a difference between the highest_vcn and
+ * the highest cluster, the runlist is either corrupt
+ * or, more likely, there are more extents following
+ * this one.
+ */
+ if (deltaxcn < max_cluster) {
+ ntfs_debug("More extents to follow; deltaxcn "
+ "= 0x%llx, max_cluster = "
+ "0x%llx",
+ (unsigned long long)deltaxcn,
+ (unsigned long long)
+ max_cluster);
+ rl[rlpos].vcn = vcn;
+ vcn += rl[rlpos].length = max_cluster -
+ deltaxcn;
+ rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+ rlpos++;
+ } else if (unlikely(deltaxcn > max_cluster)) {
+ ntfs_error(vol->sb, "Corrupt attribute. "
+ "deltaxcn = 0x%llx, "
+ "max_cluster = 0x%llx",
+ (unsigned long long)deltaxcn,
+ (unsigned long long)
+ max_cluster);
+ goto mpa_err;
+ }
+ }
+ rl[rlpos].lcn = LCN_ENOENT;
+ } else /* Not the base extent. There may be more extents to follow. */
+ rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
+
+ /* Setup terminating runlist element. */
+ rl[rlpos].vcn = vcn;
+ rl[rlpos].length = (s64)0;
+ /* If no existing runlist was specified, we are done. */
+ if (!old_rl) {
+ ntfs_debug("Mapping pairs array successfully decompressed:");
+ ntfs_debug_dump_runlist(rl);
+ return rl;
+ }
+ /* Now combine the new and old runlists checking for overlaps. */
+ old_rl = ntfs_runlists_merge(old_rl, rl);
+ if (!IS_ERR(old_rl))
+ return old_rl;
+ ntfs_free(rl);
+ ntfs_error(vol->sb, "Failed to merge runlists.");
+ return old_rl;
+io_error:
+ ntfs_error(vol->sb, "Corrupt attribute.");
+err_out:
+ ntfs_free(rl);
+ return ERR_PTR(-EIO);
+}
+
+/**
+ * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist
+ * @rl: runlist to use for conversion
+ * @vcn: vcn to convert
+ *
+ * Convert the virtual cluster number @vcn of an attribute into a logical
+ * cluster number (lcn) of a device using the runlist @rl to map vcns to their
+ * corresponding lcns.
+ *
+ * It is up to the caller to serialize access to the runlist @rl.
+ *
+ * Since lcns must be >= 0, we use negative return codes with special meaning:
+ *
+ * Return code Meaning / Description
+ * ==================================================
+ * LCN_HOLE Hole / not allocated on disk.
+ * LCN_RL_NOT_MAPPED This is part of the runlist which has not been
+ * inserted into the runlist yet.
+ * LCN_ENOENT There is no such vcn in the attribute.
+ *
+ * Locking: - The caller must have locked the runlist (for reading or writing).
+ * - This function does not touch the lock, nor does it modify the
+ * runlist.
+ */
+LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
+{
+ int i;
+
+ BUG_ON(vcn < 0);
+ /*
+ * If rl is NULL, assume that we have found an unmapped runlist. The
+ * caller can then attempt to map it and fail appropriately if
+ * necessary.
+ */
+ if (unlikely(!rl))
+ return LCN_RL_NOT_MAPPED;
+
+ /* Catch out of lower bounds vcn. */
+ if (unlikely(vcn < rl[0].vcn))
+ return LCN_ENOENT;
+
+ for (i = 0; likely(rl[i].length); i++) {
+ if (unlikely(vcn < rl[i+1].vcn)) {
+ if (likely(rl[i].lcn >= (LCN)0))
+ return rl[i].lcn + (vcn - rl[i].vcn);
+ return rl[i].lcn;
+ }
+ }
+ /*
+ * The terminator element is setup to the correct value, i.e. one of
+ * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT.
+ */
+ if (likely(rl[i].lcn < (LCN)0))
+ return rl[i].lcn;
+ /* Just in case... We could replace this with BUG() some day. */
+ return LCN_ENOENT;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_rl_find_vcn_nolock - find a vcn in a runlist
+ * @rl: runlist to search
+ * @vcn: vcn to find
+ *
+ * Find the virtual cluster number @vcn in the runlist @rl and return the
+ * address of the runlist element containing the @vcn on success.
+ *
+ * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of
+ * the runlist.
+ *
+ * Locking: The runlist must be locked on entry.
+ */
+runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn)
+{
+ BUG_ON(vcn < 0);
+ if (unlikely(!rl || vcn < rl[0].vcn))
+ return NULL;
+ while (likely(rl->length)) {
+ if (unlikely(vcn < rl[1].vcn)) {
+ if (likely(rl->lcn >= LCN_HOLE))
+ return rl;
+ return NULL;
+ }
+ rl++;
+ }
+ if (likely(rl->lcn == LCN_ENOENT))
+ return rl;
+ return NULL;
+}
+
+/**
+ * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
+ * @n: number for which to get the number of bytes for
+ *
+ * Return the number of bytes required to store @n unambiguously as
+ * a signed number.
+ *
+ * This is used in the context of the mapping pairs array to determine how
+ * many bytes will be needed in the array to store a given logical cluster
+ * number (lcn) or a specific run length.
+ *
+ * Return the number of bytes written. This function cannot fail.
+ */
+static inline int ntfs_get_nr_significant_bytes(const s64 n)
+{
+ s64 l = n;
+ int i;
+ s8 j;
+
+ i = 0;
+ do {
+ l >>= 8;
+ i++;
+ } while (l != 0 && l != -1);
+ j = (n >> 8 * (i - 1)) & 0xff;
+ /* If the sign bit is wrong, we need an extra byte. */
+ if ((n < 0 && j >= 0) || (n > 0 && j < 0))
+ i++;
+ return i;
+}
+
+/**
+ * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
+ * @vol: ntfs volume (needed for the ntfs version)
+ * @rl: locked runlist to determine the size of the mapping pairs of
+ * @first_vcn: first vcn which to include in the mapping pairs array
+ * @last_vcn: last vcn which to include in the mapping pairs array
+ *
+ * Walk the locked runlist @rl and calculate the size in bytes of the mapping
+ * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and
+ * finishing with vcn @last_vcn.
+ *
+ * A @last_vcn of -1 means end of runlist and in that case the size of the
+ * mapping pairs array corresponding to the runlist starting at vcn @first_vcn
+ * and finishing at the end of the runlist is determined.
+ *
+ * This for example allows us to allocate a buffer of the right size when
+ * building the mapping pairs array.
+ *
+ * If @rl is NULL, just return 1 (for the single terminator byte).
+ *
+ * Return the calculated size in bytes on success. On error, return -errno.
+ * The following error codes are defined:
+ * -EINVAL - Run list contains unmapped elements. Make sure to only pass
+ * fully mapped runlists to this function.
+ * -EIO - The runlist is corrupt.
+ *
+ * Locking: @rl must be locked on entry (either for reading or writing), it
+ * remains locked throughout, and is left locked upon return.
+ */
+int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
+ const runlist_element *rl, const VCN first_vcn,
+ const VCN last_vcn)
+{
+ LCN prev_lcn;
+ int rls;
+ bool the_end = false;
+
+ BUG_ON(first_vcn < 0);
+ BUG_ON(last_vcn < -1);
+ BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
+ if (!rl) {
+ BUG_ON(first_vcn);
+ BUG_ON(last_vcn > 0);
+ return 1;
+ }
+ /* Skip to runlist element containing @first_vcn. */
+ while (rl->length && first_vcn >= rl[1].vcn)
+ rl++;
+ if (unlikely((!rl->length && first_vcn > rl->vcn) ||
+ first_vcn < rl->vcn))
+ return -EINVAL;
+ prev_lcn = 0;
+ /* Always need the termining zero byte. */
+ rls = 1;
+ /* Do the first partial run if present. */
+ if (first_vcn > rl->vcn) {
+ s64 delta, length = rl->length;
+
+ /* We know rl->length != 0 already. */
+ if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
+ goto err_out;
+ /*
+ * If @stop_vcn is given and finishes inside this run, cap the
+ * run length.
+ */
+ if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+ s64 s1 = last_vcn + 1;
+ if (unlikely(rl[1].vcn > s1))
+ length = s1 - rl->vcn;
+ the_end = true;
+ }
+ delta = first_vcn - rl->vcn;
+ /* Header byte + length. */
+ rls += 1 + ntfs_get_nr_significant_bytes(length - delta);
+ /*
+ * If the logical cluster number (lcn) denotes a hole and we
+ * are on NTFS 3.0+, we don't store it at all, i.e. we need
+ * zero space. On earlier NTFS versions we just store the lcn.
+ * Note: this assumes that on NTFS 1.2-, holes are stored with
+ * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
+ */
+ if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
+ prev_lcn = rl->lcn;
+ if (likely(rl->lcn >= 0))
+ prev_lcn += delta;
+ /* Change in lcn. */
+ rls += ntfs_get_nr_significant_bytes(prev_lcn);
+ }
+ /* Go to next runlist element. */
+ rl++;
+ }
+ /* Do the full runs. */
+ for (; rl->length && !the_end; rl++) {
+ s64 length = rl->length;
+
+ if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
+ goto err_out;
+ /*
+ * If @stop_vcn is given and finishes inside this run, cap the
+ * run length.
+ */
+ if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+ s64 s1 = last_vcn + 1;
+ if (unlikely(rl[1].vcn > s1))
+ length = s1 - rl->vcn;
+ the_end = true;
+ }
+ /* Header byte + length. */
+ rls += 1 + ntfs_get_nr_significant_bytes(length);
+ /*
+ * If the logical cluster number (lcn) denotes a hole and we
+ * are on NTFS 3.0+, we don't store it at all, i.e. we need
+ * zero space. On earlier NTFS versions we just store the lcn.
+ * Note: this assumes that on NTFS 1.2-, holes are stored with
+ * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
+ */
+ if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
+ /* Change in lcn. */
+ rls += ntfs_get_nr_significant_bytes(rl->lcn -
+ prev_lcn);
+ prev_lcn = rl->lcn;
+ }
+ }
+ return rls;
+err_out:
+ if (rl->lcn == LCN_RL_NOT_MAPPED)
+ rls = -EINVAL;
+ else
+ rls = -EIO;
+ return rls;
+}
+
+/**
+ * ntfs_write_significant_bytes - write the significant bytes of a number
+ * @dst: destination buffer to write to
+ * @dst_max: pointer to last byte of destination buffer for bounds checking
+ * @n: number whose significant bytes to write
+ *
+ * Store in @dst, the minimum bytes of the number @n which are required to
+ * identify @n unambiguously as a signed number, taking care not to exceed
+ * @dest_max, the maximum position within @dst to which we are allowed to
+ * write.
+ *
+ * This is used when building the mapping pairs array of a runlist to compress
+ * a given logical cluster number (lcn) or a specific run length to the minimum
+ * size possible.
+ *
+ * Return the number of bytes written on success. On error, i.e. the
+ * destination buffer @dst is too small, return -ENOSPC.
+ */
+static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
+ const s64 n)
+{
+ s64 l = n;
+ int i;
+ s8 j;
+
+ i = 0;
+ do {
+ if (unlikely(dst > dst_max))
+ goto err_out;
+ *dst++ = l & 0xffll;
+ l >>= 8;
+ i++;
+ } while (l != 0 && l != -1);
+ j = (n >> 8 * (i - 1)) & 0xff;
+ /* If the sign bit is wrong, we need an extra byte. */
+ if (n < 0 && j >= 0) {
+ if (unlikely(dst > dst_max))
+ goto err_out;
+ i++;
+ *dst = (s8)-1;
+ } else if (n > 0 && j < 0) {
+ if (unlikely(dst > dst_max))
+ goto err_out;
+ i++;
+ *dst = (s8)0;
+ }
+ return i;
+err_out:
+ return -ENOSPC;
+}
+
+/**
+ * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist
+ * @vol: ntfs volume (needed for the ntfs version)
+ * @dst: destination buffer to which to write the mapping pairs array
+ * @dst_len: size of destination buffer @dst in bytes
+ * @rl: locked runlist for which to build the mapping pairs array
+ * @first_vcn: first vcn which to include in the mapping pairs array
+ * @last_vcn: last vcn which to include in the mapping pairs array
+ * @stop_vcn: first vcn outside destination buffer on success or -ENOSPC
+ *
+ * Create the mapping pairs array from the locked runlist @rl, starting at vcn
+ * @first_vcn and finishing with vcn @last_vcn and save the array in @dst.
+ * @dst_len is the size of @dst in bytes and it should be at least equal to the
+ * value obtained by calling ntfs_get_size_for_mapping_pairs().
+ *
+ * A @last_vcn of -1 means end of runlist and in that case the mapping pairs
+ * array corresponding to the runlist starting at vcn @first_vcn and finishing
+ * at the end of the runlist is created.
+ *
+ * If @rl is NULL, just write a single terminator byte to @dst.
+ *
+ * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to
+ * the first vcn outside the destination buffer. Note that on error, @dst has
+ * been filled with all the mapping pairs that will fit, thus it can be treated
+ * as partial success, in that a new attribute extent needs to be created or
+ * the next extent has to be used and the mapping pairs build has to be
+ * continued with @first_vcn set to *@stop_vcn.
+ *
+ * Return 0 on success and -errno on error. The following error codes are
+ * defined:
+ * -EINVAL - Run list contains unmapped elements. Make sure to only pass
+ * fully mapped runlists to this function.
+ * -EIO - The runlist is corrupt.
+ * -ENOSPC - The destination buffer is too small.
+ *
+ * Locking: @rl must be locked on entry (either for reading or writing), it
+ * remains locked throughout, and is left locked upon return.
+ */
+int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
+ const int dst_len, const runlist_element *rl,
+ const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn)
+{
+ LCN prev_lcn;
+ s8 *dst_max, *dst_next;
+ int err = -ENOSPC;
+ bool the_end = false;
+ s8 len_len, lcn_len;
+
+ BUG_ON(first_vcn < 0);
+ BUG_ON(last_vcn < -1);
+ BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
+ BUG_ON(dst_len < 1);
+ if (!rl) {
+ BUG_ON(first_vcn);
+ BUG_ON(last_vcn > 0);
+ if (stop_vcn)
+ *stop_vcn = 0;
+ /* Terminator byte. */
+ *dst = 0;
+ return 0;
+ }
+ /* Skip to runlist element containing @first_vcn. */
+ while (rl->length && first_vcn >= rl[1].vcn)
+ rl++;
+ if (unlikely((!rl->length && first_vcn > rl->vcn) ||
+ first_vcn < rl->vcn))
+ return -EINVAL;
+ /*
+ * @dst_max is used for bounds checking in
+ * ntfs_write_significant_bytes().
+ */
+ dst_max = dst + dst_len - 1;
+ prev_lcn = 0;
+ /* Do the first partial run if present. */
+ if (first_vcn > rl->vcn) {
+ s64 delta, length = rl->length;
+
+ /* We know rl->length != 0 already. */
+ if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
+ goto err_out;
+ /*
+ * If @stop_vcn is given and finishes inside this run, cap the
+ * run length.
+ */
+ if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+ s64 s1 = last_vcn + 1;
+ if (unlikely(rl[1].vcn > s1))
+ length = s1 - rl->vcn;
+ the_end = true;
+ }
+ delta = first_vcn - rl->vcn;
+ /* Write length. */
+ len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
+ length - delta);
+ if (unlikely(len_len < 0))
+ goto size_err;
+ /*
+ * If the logical cluster number (lcn) denotes a hole and we
+ * are on NTFS 3.0+, we don't store it at all, i.e. we need
+ * zero space. On earlier NTFS versions we just write the lcn
+ * change. FIXME: Do we need to write the lcn change or just
+ * the lcn in that case? Not sure as I have never seen this
+ * case on NT4. - We assume that we just need to write the lcn
+ * change until someone tells us otherwise... (AIA)
+ */
+ if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
+ prev_lcn = rl->lcn;
+ if (likely(rl->lcn >= 0))
+ prev_lcn += delta;
+ /* Write change in lcn. */
+ lcn_len = ntfs_write_significant_bytes(dst + 1 +
+ len_len, dst_max, prev_lcn);
+ if (unlikely(lcn_len < 0))
+ goto size_err;
+ } else
+ lcn_len = 0;
+ dst_next = dst + len_len + lcn_len + 1;
+ if (unlikely(dst_next > dst_max))
+ goto size_err;
+ /* Update header byte. */
+ *dst = lcn_len << 4 | len_len;
+ /* Position at next mapping pairs array element. */
+ dst = dst_next;
+ /* Go to next runlist element. */
+ rl++;
+ }
+ /* Do the full runs. */
+ for (; rl->length && !the_end; rl++) {
+ s64 length = rl->length;
+
+ if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
+ goto err_out;
+ /*
+ * If @stop_vcn is given and finishes inside this run, cap the
+ * run length.
+ */
+ if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
+ s64 s1 = last_vcn + 1;
+ if (unlikely(rl[1].vcn > s1))
+ length = s1 - rl->vcn;
+ the_end = true;
+ }
+ /* Write length. */
+ len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
+ length);
+ if (unlikely(len_len < 0))
+ goto size_err;
+ /*
+ * If the logical cluster number (lcn) denotes a hole and we
+ * are on NTFS 3.0+, we don't store it at all, i.e. we need
+ * zero space. On earlier NTFS versions we just write the lcn
+ * change. FIXME: Do we need to write the lcn change or just
+ * the lcn in that case? Not sure as I have never seen this
+ * case on NT4. - We assume that we just need to write the lcn
+ * change until someone tells us otherwise... (AIA)
+ */
+ if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
+ /* Write change in lcn. */
+ lcn_len = ntfs_write_significant_bytes(dst + 1 +
+ len_len, dst_max, rl->lcn - prev_lcn);
+ if (unlikely(lcn_len < 0))
+ goto size_err;
+ prev_lcn = rl->lcn;
+ } else
+ lcn_len = 0;
+ dst_next = dst + len_len + lcn_len + 1;
+ if (unlikely(dst_next > dst_max))
+ goto size_err;
+ /* Update header byte. */
+ *dst = lcn_len << 4 | len_len;
+ /* Position at next mapping pairs array element. */
+ dst = dst_next;
+ }
+ /* Success. */
+ err = 0;
+size_err:
+ /* Set stop vcn. */
+ if (stop_vcn)
+ *stop_vcn = rl->vcn;
+ /* Add terminator byte. */
+ *dst = 0;
+ return err;
+err_out:
+ if (rl->lcn == LCN_RL_NOT_MAPPED)
+ err = -EINVAL;
+ else
+ err = -EIO;
+ return err;
+}
+
+/**
+ * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
+ * @vol: ntfs volume (needed for error output)
+ * @runlist: runlist to truncate
+ * @new_length: the new length of the runlist in VCNs
+ *
+ * Truncate the runlist described by @runlist as well as the memory buffer
+ * holding the runlist elements to a length of @new_length VCNs.
+ *
+ * If @new_length lies within the runlist, the runlist elements with VCNs of
+ * @new_length and above are discarded. As a special case if @new_length is
+ * zero, the runlist is discarded and set to NULL.
+ *
+ * If @new_length lies beyond the runlist, a sparse runlist element is added to
+ * the end of the runlist @runlist or if the last runlist element is a sparse
+ * one already, this is extended.
+ *
+ * Note, no checking is done for unmapped runlist elements. It is assumed that
+ * the caller has mapped any elements that need to be mapped already.
+ *
+ * Return 0 on success and -errno on error.
+ *
+ * Locking: The caller must hold @runlist->lock for writing.
+ */
+int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
+ const s64 new_length)
+{
+ runlist_element *rl;
+ int old_size;
+
+ ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length);
+ BUG_ON(!runlist);
+ BUG_ON(new_length < 0);
+ rl = runlist->rl;
+ if (!new_length) {
+ ntfs_debug("Freeing runlist.");
+ runlist->rl = NULL;
+ if (rl)
+ ntfs_free(rl);
+ return 0;
+ }
+ if (unlikely(!rl)) {
+ /*
+ * Create a runlist consisting of a sparse runlist element of
+ * length @new_length followed by a terminator runlist element.
+ */
+ rl = ntfs_malloc_nofs(PAGE_SIZE);
+ if (unlikely(!rl)) {
+ ntfs_error(vol->sb, "Not enough memory to allocate "
+ "runlist element buffer.");
+ return -ENOMEM;
+ }
+ runlist->rl = rl;
+ rl[1].length = rl->vcn = 0;
+ rl->lcn = LCN_HOLE;
+ rl[1].vcn = rl->length = new_length;
+ rl[1].lcn = LCN_ENOENT;
+ return 0;
+ }
+ BUG_ON(new_length < rl->vcn);
+ /* Find @new_length in the runlist. */
+ while (likely(rl->length && new_length >= rl[1].vcn))
+ rl++;
+ /*
+ * If not at the end of the runlist we need to shrink it.
+ * If at the end of the runlist we need to expand it.
+ */
+ if (rl->length) {
+ runlist_element *trl;
+ bool is_end;
+
+ ntfs_debug("Shrinking runlist.");
+ /* Determine the runlist size. */
+ trl = rl + 1;
+ while (likely(trl->length))
+ trl++;
+ old_size = trl - runlist->rl + 1;
+ /* Truncate the run. */
+ rl->length = new_length - rl->vcn;
+ /*
+ * If a run was partially truncated, make the following runlist
+ * element a terminator.
+ */
+ is_end = false;
+ if (rl->length) {
+ rl++;
+ if (!rl->length)
+ is_end = true;
+ rl->vcn = new_length;
+ rl->length = 0;
+ }
+ rl->lcn = LCN_ENOENT;
+ /* Reallocate memory if necessary. */
+ if (!is_end) {
+ int new_size = rl - runlist->rl + 1;
+ rl = ntfs_rl_realloc(runlist->rl, old_size, new_size);
+ if (IS_ERR(rl))
+ ntfs_warning(vol->sb, "Failed to shrink "
+ "runlist buffer. This just "
+ "wastes a bit of memory "
+ "temporarily so we ignore it "
+ "and return success.");
+ else
+ runlist->rl = rl;
+ }
+ } else if (likely(/* !rl->length && */ new_length > rl->vcn)) {
+ ntfs_debug("Expanding runlist.");
+ /*
+ * If there is a previous runlist element and it is a sparse
+ * one, extend it. Otherwise need to add a new, sparse runlist
+ * element.
+ */
+ if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE))
+ (rl - 1)->length = new_length - (rl - 1)->vcn;
+ else {
+ /* Determine the runlist size. */
+ old_size = rl - runlist->rl + 1;
+ /* Reallocate memory if necessary. */
+ rl = ntfs_rl_realloc(runlist->rl, old_size,
+ old_size + 1);
+ if (IS_ERR(rl)) {
+ ntfs_error(vol->sb, "Failed to expand runlist "
+ "buffer, aborting.");
+ return PTR_ERR(rl);
+ }
+ runlist->rl = rl;
+ /*
+ * Set @rl to the same runlist element in the new
+ * runlist as before in the old runlist.
+ */
+ rl += old_size - 1;
+ /* Add a new, sparse runlist element. */
+ rl->lcn = LCN_HOLE;
+ rl->length = new_length - rl->vcn;
+ /* Add a new terminator runlist element. */
+ rl++;
+ rl->length = 0;
+ }
+ rl->vcn = new_length;
+ rl->lcn = LCN_ENOENT;
+ } else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ {
+ /* Runlist already has same size as requested. */
+ rl->lcn = LCN_ENOENT;
+ }
+ ntfs_debug("Done.");
+ return 0;
+}
+
+/**
+ * ntfs_rl_punch_nolock - punch a hole into a runlist
+ * @vol: ntfs volume (needed for error output)
+ * @runlist: runlist to punch a hole into
+ * @start: starting VCN of the hole to be created
+ * @length: size of the hole to be created in units of clusters
+ *
+ * Punch a hole into the runlist @runlist starting at VCN @start and of size
+ * @length clusters.
+ *
+ * Return 0 on success and -errno on error, in which case @runlist has not been
+ * modified.
+ *
+ * If @start and/or @start + @length are outside the runlist return error code
+ * -ENOENT.
+ *
+ * If the runlist contains unmapped or error elements between @start and @start
+ * + @length return error code -EINVAL.
+ *
+ * Locking: The caller must hold @runlist->lock for writing.
+ */
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+ const VCN start, const s64 length)
+{
+ const VCN end = start + length;
+ s64 delta;
+ runlist_element *rl, *rl_end, *rl_real_end, *trl;
+ int old_size;
+ bool lcn_fixup = false;
+
+ ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
+ (long long)start, (long long)length);
+ BUG_ON(!runlist);
+ BUG_ON(start < 0);
+ BUG_ON(length < 0);
+ BUG_ON(end < 0);
+ rl = runlist->rl;
+ if (unlikely(!rl)) {
+ if (likely(!start && !length))
+ return 0;
+ return -ENOENT;
+ }
+ /* Find @start in the runlist. */
+ while (likely(rl->length && start >= rl[1].vcn))
+ rl++;
+ rl_end = rl;
+ /* Find @end in the runlist. */
+ while (likely(rl_end->length && end >= rl_end[1].vcn)) {
+ /* Verify there are no unmapped or error elements. */
+ if (unlikely(rl_end->lcn < LCN_HOLE))
+ return -EINVAL;
+ rl_end++;
+ }
+ /* Check the last element. */
+ if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE))
+ return -EINVAL;
+ /* This covers @start being out of bounds, too. */
+ if (!rl_end->length && end > rl_end->vcn)
+ return -ENOENT;
+ if (!length)
+ return 0;
+ if (!rl->length)
+ return -ENOENT;
+ rl_real_end = rl_end;
+ /* Determine the runlist size. */
+ while (likely(rl_real_end->length))
+ rl_real_end++;
+ old_size = rl_real_end - runlist->rl + 1;
+ /* If @start is in a hole simply extend the hole. */
+ if (rl->lcn == LCN_HOLE) {
+ /*
+ * If both @start and @end are in the same sparse run, we are
+ * done.
+ */
+ if (end <= rl[1].vcn) {
+ ntfs_debug("Done (requested hole is already sparse).");
+ return 0;
+ }
+extend_hole:
+ /* Extend the hole. */
+ rl->length = end - rl->vcn;
+ /* If @end is in a hole, merge it with the current one. */
+ if (rl_end->lcn == LCN_HOLE) {
+ rl_end++;
+ rl->length = rl_end->vcn - rl->vcn;
+ }
+ /* We have done the hole. Now deal with the remaining tail. */
+ rl++;
+ /* Cut out all runlist elements up to @end. */
+ if (rl < rl_end)
+ memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+ sizeof(*rl));
+ /* Adjust the beginning of the tail if necessary. */
+ if (end > rl->vcn) {
+ delta = end - rl->vcn;
+ rl->vcn = end;
+ rl->length -= delta;
+ /* Only adjust the lcn if it is real. */
+ if (rl->lcn >= 0)
+ rl->lcn += delta;
+ }
+shrink_allocation:
+ /* Reallocate memory if the allocation changed. */
+ if (rl < rl_end) {
+ rl = ntfs_rl_realloc(runlist->rl, old_size,
+ old_size - (rl_end - rl));
+ if (IS_ERR(rl))
+ ntfs_warning(vol->sb, "Failed to shrink "
+ "runlist buffer. This just "
+ "wastes a bit of memory "
+ "temporarily so we ignore it "
+ "and return success.");
+ else
+ runlist->rl = rl;
+ }
+ ntfs_debug("Done (extend hole).");
+ return 0;
+ }
+ /*
+ * If @start is at the beginning of a run things are easier as there is
+ * no need to split the first run.
+ */
+ if (start == rl->vcn) {
+ /*
+ * @start is at the beginning of a run.
+ *
+ * If the previous run is sparse, extend its hole.
+ *
+ * If @end is not in the same run, switch the run to be sparse
+ * and extend the newly created hole.
+ *
+ * Thus both of these cases reduce the problem to the above
+ * case of "@start is in a hole".
+ */
+ if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) {
+ rl--;
+ goto extend_hole;
+ }
+ if (end >= rl[1].vcn) {
+ rl->lcn = LCN_HOLE;
+ goto extend_hole;
+ }
+ /*
+ * The final case is when @end is in the same run as @start.
+ * For this need to split the run into two. One run for the
+ * sparse region between the beginning of the old run, i.e.
+ * @start, and @end and one for the remaining non-sparse
+ * region, i.e. between @end and the end of the old run.
+ */
+ trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+ if (IS_ERR(trl))
+ goto enomem_out;
+ old_size++;
+ if (runlist->rl != trl) {
+ rl = trl + (rl - runlist->rl);
+ rl_end = trl + (rl_end - runlist->rl);
+ rl_real_end = trl + (rl_real_end - runlist->rl);
+ runlist->rl = trl;
+ }
+split_end:
+ /* Shift all the runs up by one. */
+ memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+ /* Finally, setup the two split runs. */
+ rl->lcn = LCN_HOLE;
+ rl->length = length;
+ rl++;
+ rl->vcn += length;
+ /* Only adjust the lcn if it is real. */
+ if (rl->lcn >= 0 || lcn_fixup)
+ rl->lcn += length;
+ rl->length -= length;
+ ntfs_debug("Done (split one).");
+ return 0;
+ }
+ /*
+ * @start is neither in a hole nor at the beginning of a run.
+ *
+ * If @end is in a hole, things are easier as simply truncating the run
+ * @start is in to end at @start - 1, deleting all runs after that up
+ * to @end, and finally extending the beginning of the run @end is in
+ * to be @start is all that is needed.
+ */
+ if (rl_end->lcn == LCN_HOLE) {
+ /* Truncate the run containing @start. */
+ rl->length = start - rl->vcn;
+ rl++;
+ /* Cut out all runlist elements up to @end. */
+ if (rl < rl_end)
+ memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
+ sizeof(*rl));
+ /* Extend the beginning of the run @end is in to be @start. */
+ rl->vcn = start;
+ rl->length = rl[1].vcn - start;
+ goto shrink_allocation;
+ }
+ /*
+ * If @end is not in a hole there are still two cases to distinguish.
+ * Either @end is or is not in the same run as @start.
+ *
+ * The second case is easier as it can be reduced to an already solved
+ * problem by truncating the run @start is in to end at @start - 1.
+ * Then, if @end is in the next run need to split the run into a sparse
+ * run followed by a non-sparse run (already covered above) and if @end
+ * is not in the next run switching it to be sparse, again reduces the
+ * problem to the already covered case of "@start is in a hole".
+ */
+ if (end >= rl[1].vcn) {
+ /*
+ * If @end is not in the next run, reduce the problem to the
+ * case of "@start is in a hole".
+ */
+ if (rl[1].length && end >= rl[2].vcn) {
+ /* Truncate the run containing @start. */
+ rl->length = start - rl->vcn;
+ rl++;
+ rl->vcn = start;
+ rl->lcn = LCN_HOLE;
+ goto extend_hole;
+ }
+ trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
+ if (IS_ERR(trl))
+ goto enomem_out;
+ old_size++;
+ if (runlist->rl != trl) {
+ rl = trl + (rl - runlist->rl);
+ rl_end = trl + (rl_end - runlist->rl);
+ rl_real_end = trl + (rl_real_end - runlist->rl);
+ runlist->rl = trl;
+ }
+ /* Truncate the run containing @start. */
+ rl->length = start - rl->vcn;
+ rl++;
+ /*
+ * @end is in the next run, reduce the problem to the case
+ * where "@start is at the beginning of a run and @end is in
+ * the same run as @start".
+ */
+ delta = rl->vcn - start;
+ rl->vcn = start;
+ if (rl->lcn >= 0) {
+ rl->lcn -= delta;
+ /* Need this in case the lcn just became negative. */
+ lcn_fixup = true;
+ }
+ rl->length += delta;
+ goto split_end;
+ }
+ /*
+ * The first case from above, i.e. @end is in the same run as @start.
+ * We need to split the run into three. One run for the non-sparse
+ * region between the beginning of the old run and @start, one for the
+ * sparse region between @start and @end, and one for the remaining
+ * non-sparse region, i.e. between @end and the end of the old run.
+ */
+ trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2);
+ if (IS_ERR(trl))
+ goto enomem_out;
+ old_size += 2;
+ if (runlist->rl != trl) {
+ rl = trl + (rl - runlist->rl);
+ rl_end = trl + (rl_end - runlist->rl);
+ rl_real_end = trl + (rl_real_end - runlist->rl);
+ runlist->rl = trl;
+ }
+ /* Shift all the runs up by two. */
+ memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl));
+ /* Finally, setup the three split runs. */
+ rl->length = start - rl->vcn;
+ rl++;
+ rl->vcn = start;
+ rl->lcn = LCN_HOLE;
+ rl->length = length;
+ rl++;
+ delta = end - rl->vcn;
+ rl->vcn = end;
+ rl->lcn += delta;
+ rl->length -= delta;
+ ntfs_debug("Done (split both).");
+ return 0;
+enomem_out:
+ ntfs_error(vol->sb, "Not enough memory to extend runlist buffer.");
+ return -ENOMEM;
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
new file mode 100644
index 000000000..38de0a375
--- /dev/null
+++ b/fs/ntfs/runlist.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
+ * Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#ifndef _LINUX_NTFS_RUNLIST_H
+#define _LINUX_NTFS_RUNLIST_H
+
+#include "types.h"
+#include "layout.h"
+#include "volume.h"
+
+/**
+ * runlist_element - in memory vcn to lcn mapping array element
+ * @vcn: starting vcn of the current array element
+ * @lcn: starting lcn of the current array element
+ * @length: length in clusters of the current array element
+ *
+ * The last vcn (in fact the last vcn + 1) is reached when length == 0.
+ *
+ * When lcn == -1 this means that the count vcns starting at vcn are not
+ * physically allocated (i.e. this is a hole / data is sparse).
+ */
+typedef struct { /* In memory vcn to lcn mapping structure element. */
+ VCN vcn; /* vcn = Starting virtual cluster number. */
+ LCN lcn; /* lcn = Starting logical cluster number. */
+ s64 length; /* Run length in clusters. */
+} runlist_element;
+
+/**
+ * runlist - in memory vcn to lcn mapping array including a read/write lock
+ * @rl: pointer to an array of runlist elements
+ * @lock: read/write spinlock for serializing access to @rl
+ *
+ */
+typedef struct {
+ runlist_element *rl;
+ struct rw_semaphore lock;
+} runlist;
+
+static inline void ntfs_init_runlist(runlist *rl)
+{
+ rl->rl = NULL;
+ init_rwsem(&rl->lock);
+}
+
+typedef enum {
+ LCN_HOLE = -1, /* Keep this as highest value or die! */
+ LCN_RL_NOT_MAPPED = -2,
+ LCN_ENOENT = -3,
+ LCN_ENOMEM = -4,
+ LCN_EIO = -5,
+} LCN_SPECIAL_VALUES;
+
+extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
+ runlist_element *srl);
+
+extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
+ const ATTR_RECORD *attr, runlist_element *old_rl);
+
+extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
+
+#ifdef NTFS_RW
+
+extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl,
+ const VCN vcn);
+
+extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
+ const runlist_element *rl, const VCN first_vcn,
+ const VCN last_vcn);
+
+extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
+ const int dst_len, const runlist_element *rl,
+ const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn);
+
+extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
+ runlist *const runlist, const s64 new_length);
+
+int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
+ const VCN start, const s64 length);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
new file mode 100644
index 000000000..001f4e053
--- /dev/null
+++ b/fs/ntfs/super.c
@@ -0,0 +1,3194 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
+ * Copyright (c) 2001,2002 Richard Russon
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/stddef.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h> /* For bdev_logical_block_size(). */
+#include <linux/backing-dev.h>
+#include <linux/buffer_head.h>
+#include <linux/vfs.h>
+#include <linux/moduleparam.h>
+#include <linux/bitmap.h>
+
+#include "sysctl.h"
+#include "logfile.h"
+#include "quota.h"
+#include "usnjrnl.h"
+#include "dir.h"
+#include "debug.h"
+#include "index.h"
+#include "inode.h"
+#include "aops.h"
+#include "layout.h"
+#include "malloc.h"
+#include "ntfs.h"
+
+/* Number of mounted filesystems which have compression enabled. */
+static unsigned long ntfs_nr_compression_users;
+
+/* A global default upcase table and a corresponding reference count. */
+static ntfschar *default_upcase;
+static unsigned long ntfs_nr_upcase_users;
+
+/* Error constants/strings used in inode.c::ntfs_show_options(). */
+typedef enum {
+ /* One of these must be present, default is ON_ERRORS_CONTINUE. */
+ ON_ERRORS_PANIC = 0x01,
+ ON_ERRORS_REMOUNT_RO = 0x02,
+ ON_ERRORS_CONTINUE = 0x04,
+ /* Optional, can be combined with any of the above. */
+ ON_ERRORS_RECOVER = 0x10,
+} ON_ERRORS_ACTIONS;
+
+const option_t on_errors_arr[] = {
+ { ON_ERRORS_PANIC, "panic" },
+ { ON_ERRORS_REMOUNT_RO, "remount-ro", },
+ { ON_ERRORS_CONTINUE, "continue", },
+ { ON_ERRORS_RECOVER, "recover" },
+ { 0, NULL }
+};
+
+/**
+ * simple_getbool -
+ *
+ * Copied from old ntfs driver (which copied from vfat driver).
+ */
+static int simple_getbool(char *s, bool *setval)
+{
+ if (s) {
+ if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
+ *setval = true;
+ else if (!strcmp(s, "0") || !strcmp(s, "no") ||
+ !strcmp(s, "false"))
+ *setval = false;
+ else
+ return 0;
+ } else
+ *setval = true;
+ return 1;
+}
+
+/**
+ * parse_options - parse the (re)mount options
+ * @vol: ntfs volume
+ * @opt: string containing the (re)mount options
+ *
+ * Parse the recognized options in @opt for the ntfs volume described by @vol.
+ */
+static bool parse_options(ntfs_volume *vol, char *opt)
+{
+ char *p, *v, *ov;
+ static char *utf8 = "utf8";
+ int errors = 0, sloppy = 0;
+ kuid_t uid = INVALID_UID;
+ kgid_t gid = INVALID_GID;
+ umode_t fmask = (umode_t)-1, dmask = (umode_t)-1;
+ int mft_zone_multiplier = -1, on_errors = -1;
+ int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1;
+ struct nls_table *nls_map = NULL, *old_nls;
+
+ /* I am lazy... (-8 */
+#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value) \
+ if (!strcmp(p, option)) { \
+ if (!v || !*v) \
+ variable = default_value; \
+ else { \
+ variable = simple_strtoul(ov = v, &v, 0); \
+ if (*v) \
+ goto needs_val; \
+ } \
+ }
+#define NTFS_GETOPT(option, variable) \
+ if (!strcmp(p, option)) { \
+ if (!v || !*v) \
+ goto needs_arg; \
+ variable = simple_strtoul(ov = v, &v, 0); \
+ if (*v) \
+ goto needs_val; \
+ }
+#define NTFS_GETOPT_UID(option, variable) \
+ if (!strcmp(p, option)) { \
+ uid_t uid_value; \
+ if (!v || !*v) \
+ goto needs_arg; \
+ uid_value = simple_strtoul(ov = v, &v, 0); \
+ if (*v) \
+ goto needs_val; \
+ variable = make_kuid(current_user_ns(), uid_value); \
+ if (!uid_valid(variable)) \
+ goto needs_val; \
+ }
+#define NTFS_GETOPT_GID(option, variable) \
+ if (!strcmp(p, option)) { \
+ gid_t gid_value; \
+ if (!v || !*v) \
+ goto needs_arg; \
+ gid_value = simple_strtoul(ov = v, &v, 0); \
+ if (*v) \
+ goto needs_val; \
+ variable = make_kgid(current_user_ns(), gid_value); \
+ if (!gid_valid(variable)) \
+ goto needs_val; \
+ }
+#define NTFS_GETOPT_OCTAL(option, variable) \
+ if (!strcmp(p, option)) { \
+ if (!v || !*v) \
+ goto needs_arg; \
+ variable = simple_strtoul(ov = v, &v, 8); \
+ if (*v) \
+ goto needs_val; \
+ }
+#define NTFS_GETOPT_BOOL(option, variable) \
+ if (!strcmp(p, option)) { \
+ bool val; \
+ if (!simple_getbool(v, &val)) \
+ goto needs_bool; \
+ variable = val; \
+ }
+#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array) \
+ if (!strcmp(p, option)) { \
+ int _i; \
+ if (!v || !*v) \
+ goto needs_arg; \
+ ov = v; \
+ if (variable == -1) \
+ variable = 0; \
+ for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \
+ if (!strcmp(opt_array[_i].str, v)) { \
+ variable |= opt_array[_i].val; \
+ break; \
+ } \
+ if (!opt_array[_i].str || !*opt_array[_i].str) \
+ goto needs_val; \
+ }
+ if (!opt || !*opt)
+ goto no_mount_options;
+ ntfs_debug("Entering with mount options string: %s", opt);
+ while ((p = strsep(&opt, ","))) {
+ if ((v = strchr(p, '=')))
+ *v++ = 0;
+ NTFS_GETOPT_UID("uid", uid)
+ else NTFS_GETOPT_GID("gid", gid)
+ else NTFS_GETOPT_OCTAL("umask", fmask = dmask)
+ else NTFS_GETOPT_OCTAL("fmask", fmask)
+ else NTFS_GETOPT_OCTAL("dmask", dmask)
+ else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
+ else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true)
+ else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
+ else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive)
+ else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse)
+ else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors,
+ on_errors_arr)
+ else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes"))
+ ntfs_warning(vol->sb, "Ignoring obsolete option %s.",
+ p);
+ else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) {
+ if (!strcmp(p, "iocharset"))
+ ntfs_warning(vol->sb, "Option iocharset is "
+ "deprecated. Please use "
+ "option nls=<charsetname> in "
+ "the future.");
+ if (!v || !*v)
+ goto needs_arg;
+use_utf8:
+ old_nls = nls_map;
+ nls_map = load_nls(v);
+ if (!nls_map) {
+ if (!old_nls) {
+ ntfs_error(vol->sb, "NLS character set "
+ "%s not found.", v);
+ return false;
+ }
+ ntfs_error(vol->sb, "NLS character set %s not "
+ "found. Using previous one %s.",
+ v, old_nls->charset);
+ nls_map = old_nls;
+ } else /* nls_map */ {
+ unload_nls(old_nls);
+ }
+ } else if (!strcmp(p, "utf8")) {
+ bool val = false;
+ ntfs_warning(vol->sb, "Option utf8 is no longer "
+ "supported, using option nls=utf8. Please "
+ "use option nls=utf8 in the future and "
+ "make sure utf8 is compiled either as a "
+ "module or into the kernel.");
+ if (!v || !*v)
+ val = true;
+ else if (!simple_getbool(v, &val))
+ goto needs_bool;
+ if (val) {
+ v = utf8;
+ goto use_utf8;
+ }
+ } else {
+ ntfs_error(vol->sb, "Unrecognized mount option %s.", p);
+ if (errors < INT_MAX)
+ errors++;
+ }
+#undef NTFS_GETOPT_OPTIONS_ARRAY
+#undef NTFS_GETOPT_BOOL
+#undef NTFS_GETOPT
+#undef NTFS_GETOPT_WITH_DEFAULT
+ }
+no_mount_options:
+ if (errors && !sloppy)
+ return false;
+ if (sloppy)
+ ntfs_warning(vol->sb, "Sloppy option given. Ignoring "
+ "unrecognized mount option(s) and continuing.");
+ /* Keep this first! */
+ if (on_errors != -1) {
+ if (!on_errors) {
+ ntfs_error(vol->sb, "Invalid errors option argument "
+ "or bug in options parser.");
+ return false;
+ }
+ }
+ if (nls_map) {
+ if (vol->nls_map && vol->nls_map != nls_map) {
+ ntfs_error(vol->sb, "Cannot change NLS character set "
+ "on remount.");
+ return false;
+ } /* else (!vol->nls_map) */
+ ntfs_debug("Using NLS character set %s.", nls_map->charset);
+ vol->nls_map = nls_map;
+ } else /* (!nls_map) */ {
+ if (!vol->nls_map) {
+ vol->nls_map = load_nls_default();
+ if (!vol->nls_map) {
+ ntfs_error(vol->sb, "Failed to load default "
+ "NLS character set.");
+ return false;
+ }
+ ntfs_debug("Using default NLS character set (%s).",
+ vol->nls_map->charset);
+ }
+ }
+ if (mft_zone_multiplier != -1) {
+ if (vol->mft_zone_multiplier && vol->mft_zone_multiplier !=
+ mft_zone_multiplier) {
+ ntfs_error(vol->sb, "Cannot change mft_zone_multiplier "
+ "on remount.");
+ return false;
+ }
+ if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) {
+ ntfs_error(vol->sb, "Invalid mft_zone_multiplier. "
+ "Using default value, i.e. 1.");
+ mft_zone_multiplier = 1;
+ }
+ vol->mft_zone_multiplier = mft_zone_multiplier;
+ }
+ if (!vol->mft_zone_multiplier)
+ vol->mft_zone_multiplier = 1;
+ if (on_errors != -1)
+ vol->on_errors = on_errors;
+ if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER)
+ vol->on_errors |= ON_ERRORS_CONTINUE;
+ if (uid_valid(uid))
+ vol->uid = uid;
+ if (gid_valid(gid))
+ vol->gid = gid;
+ if (fmask != (umode_t)-1)
+ vol->fmask = fmask;
+ if (dmask != (umode_t)-1)
+ vol->dmask = dmask;
+ if (show_sys_files != -1) {
+ if (show_sys_files)
+ NVolSetShowSystemFiles(vol);
+ else
+ NVolClearShowSystemFiles(vol);
+ }
+ if (case_sensitive != -1) {
+ if (case_sensitive)
+ NVolSetCaseSensitive(vol);
+ else
+ NVolClearCaseSensitive(vol);
+ }
+ if (disable_sparse != -1) {
+ if (disable_sparse)
+ NVolClearSparseEnabled(vol);
+ else {
+ if (!NVolSparseEnabled(vol) &&
+ vol->major_ver && vol->major_ver < 3)
+ ntfs_warning(vol->sb, "Not enabling sparse "
+ "support due to NTFS volume "
+ "version %i.%i (need at least "
+ "version 3.0).", vol->major_ver,
+ vol->minor_ver);
+ else
+ NVolSetSparseEnabled(vol);
+ }
+ }
+ return true;
+needs_arg:
+ ntfs_error(vol->sb, "The %s option requires an argument.", p);
+ return false;
+needs_bool:
+ ntfs_error(vol->sb, "The %s option requires a boolean argument.", p);
+ return false;
+needs_val:
+ ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov);
+ return false;
+}
+
+#ifdef NTFS_RW
+
+/**
+ * ntfs_write_volume_flags - write new flags to the volume information flags
+ * @vol: ntfs volume on which to modify the flags
+ * @flags: new flags value for the volume information flags
+ *
+ * Internal function. You probably want to use ntfs_{set,clear}_volume_flags()
+ * instead (see below).
+ *
+ * Replace the volume information flags on the volume @vol with the value
+ * supplied in @flags. Note, this overwrites the volume information flags, so
+ * make sure to combine the flags you want to modify with the old flags and use
+ * the result when calling ntfs_write_volume_flags().
+ *
+ * Return 0 on success and -errno on error.
+ */
+static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags)
+{
+ ntfs_inode *ni = NTFS_I(vol->vol_ino);
+ MFT_RECORD *m;
+ VOLUME_INFORMATION *vi;
+ ntfs_attr_search_ctx *ctx;
+ int err;
+
+ ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.",
+ le16_to_cpu(vol->vol_flags), le16_to_cpu(flags));
+ if (vol->vol_flags == flags)
+ goto done;
+ BUG_ON(!ni);
+ m = map_mft_record(ni);
+ if (IS_ERR(m)) {
+ err = PTR_ERR(m);
+ goto err_out;
+ }
+ ctx = ntfs_attr_get_search_ctx(ni, m);
+ if (!ctx) {
+ err = -ENOMEM;
+ goto put_unm_err_out;
+ }
+ err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
+ ctx);
+ if (err)
+ goto put_unm_err_out;
+ vi = (VOLUME_INFORMATION*)((u8*)ctx->attr +
+ le16_to_cpu(ctx->attr->data.resident.value_offset));
+ vol->vol_flags = vi->flags = flags;
+ flush_dcache_mft_record_page(ctx->ntfs_ino);
+ mark_mft_record_dirty(ctx->ntfs_ino);
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(ni);
+done:
+ ntfs_debug("Done.");
+ return 0;
+put_unm_err_out:
+ if (ctx)
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(ni);
+err_out:
+ ntfs_error(vol->sb, "Failed with error code %i.", -err);
+ return err;
+}
+
+/**
+ * ntfs_set_volume_flags - set bits in the volume information flags
+ * @vol: ntfs volume on which to modify the flags
+ * @flags: flags to set on the volume
+ *
+ * Set the bits in @flags in the volume information flags on the volume @vol.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
+{
+ flags &= VOLUME_FLAGS_MASK;
+ return ntfs_write_volume_flags(vol, vol->vol_flags | flags);
+}
+
+/**
+ * ntfs_clear_volume_flags - clear bits in the volume information flags
+ * @vol: ntfs volume on which to modify the flags
+ * @flags: flags to clear on the volume
+ *
+ * Clear the bits in @flags in the volume information flags on the volume @vol.
+ *
+ * Return 0 on success and -errno on error.
+ */
+static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
+{
+ flags &= VOLUME_FLAGS_MASK;
+ flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags));
+ return ntfs_write_volume_flags(vol, flags);
+}
+
+#endif /* NTFS_RW */
+
+/**
+ * ntfs_remount - change the mount options of a mounted ntfs filesystem
+ * @sb: superblock of mounted ntfs filesystem
+ * @flags: remount flags
+ * @opt: remount options string
+ *
+ * Change the mount options of an already mounted ntfs filesystem.
+ *
+ * NOTE: The VFS sets the @sb->s_flags remount flags to @flags after
+ * ntfs_remount() returns successfully (i.e. returns 0). Otherwise,
+ * @sb->s_flags are not changed.
+ */
+static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
+{
+ ntfs_volume *vol = NTFS_SB(sb);
+
+ ntfs_debug("Entering with remount options string: %s", opt);
+
+ sync_filesystem(sb);
+
+#ifndef NTFS_RW
+ /* For read-only compiled driver, enforce read-only flag. */
+ *flags |= SB_RDONLY;
+#else /* NTFS_RW */
+ /*
+ * For the read-write compiled driver, if we are remounting read-write,
+ * make sure there are no volume errors and that no unsupported volume
+ * flags are set. Also, empty the logfile journal as it would become
+ * stale as soon as something is written to the volume and mark the
+ * volume dirty so that chkdsk is run if the volume is not umounted
+ * cleanly. Finally, mark the quotas out of date so Windows rescans
+ * the volume on boot and updates them.
+ *
+ * When remounting read-only, mark the volume clean if no volume errors
+ * have occurred.
+ */
+ if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
+ static const char *es = ". Cannot remount read-write.";
+
+ /* Remounting read-write. */
+ if (NVolErrors(vol)) {
+ ntfs_error(sb, "Volume has errors and is read-only%s",
+ es);
+ return -EROFS;
+ }
+ if (vol->vol_flags & VOLUME_IS_DIRTY) {
+ ntfs_error(sb, "Volume is dirty and read-only%s", es);
+ return -EROFS;
+ }
+ if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
+ ntfs_error(sb, "Volume has been modified by chkdsk "
+ "and is read-only%s", es);
+ return -EROFS;
+ }
+ if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
+ ntfs_error(sb, "Volume has unsupported flags set "
+ "(0x%x) and is read-only%s",
+ (unsigned)le16_to_cpu(vol->vol_flags),
+ es);
+ return -EROFS;
+ }
+ if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
+ ntfs_error(sb, "Failed to set dirty bit in volume "
+ "information flags%s", es);
+ return -EROFS;
+ }
+#if 0
+ // TODO: Enable this code once we start modifying anything that
+ // is different between NTFS 1.2 and 3.x...
+ /* Set NT4 compatibility flag on newer NTFS version volumes. */
+ if ((vol->major_ver > 1)) {
+ if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
+ ntfs_error(sb, "Failed to set NT4 "
+ "compatibility flag%s", es);
+ NVolSetErrors(vol);
+ return -EROFS;
+ }
+ }
+#endif
+ if (!ntfs_empty_logfile(vol->logfile_ino)) {
+ ntfs_error(sb, "Failed to empty journal $LogFile%s",
+ es);
+ NVolSetErrors(vol);
+ return -EROFS;
+ }
+ if (!ntfs_mark_quotas_out_of_date(vol)) {
+ ntfs_error(sb, "Failed to mark quotas out of date%s",
+ es);
+ NVolSetErrors(vol);
+ return -EROFS;
+ }
+ if (!ntfs_stamp_usnjrnl(vol)) {
+ ntfs_error(sb, "Failed to stamp transaction log "
+ "($UsnJrnl)%s", es);
+ NVolSetErrors(vol);
+ return -EROFS;
+ }
+ } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
+ /* Remounting read-only. */
+ if (!NVolErrors(vol)) {
+ if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
+ ntfs_warning(sb, "Failed to clear dirty bit "
+ "in volume information "
+ "flags. Run chkdsk.");
+ }
+ }
+#endif /* NTFS_RW */
+
+ // TODO: Deal with *flags.
+
+ if (!parse_options(vol, opt))
+ return -EINVAL;
+
+ ntfs_debug("Done.");
+ return 0;
+}
+
+/**
+ * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector
+ * @sb: Super block of the device to which @b belongs.
+ * @b: Boot sector of device @sb to check.
+ * @silent: If 'true', all output will be silenced.
+ *
+ * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot
+ * sector. Returns 'true' if it is valid and 'false' if not.
+ *
+ * @sb is only needed for warning/error output, i.e. it can be NULL when silent
+ * is 'true'.
+ */
+static bool is_boot_sector_ntfs(const struct super_block *sb,
+ const NTFS_BOOT_SECTOR *b, const bool silent)
+{
+ /*
+ * Check that checksum == sum of u32 values from b to the checksum
+ * field. If checksum is zero, no checking is done. We will work when
+ * the checksum test fails, since some utilities update the boot sector
+ * ignoring the checksum which leaves the checksum out-of-date. We
+ * report a warning if this is the case.
+ */
+ if ((void*)b < (void*)&b->checksum && b->checksum && !silent) {
+ le32 *u;
+ u32 i;
+
+ for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u)
+ i += le32_to_cpup(u);
+ if (le32_to_cpu(b->checksum) != i)
+ ntfs_warning(sb, "Invalid boot sector checksum.");
+ }
+ /* Check OEMidentifier is "NTFS " */
+ if (b->oem_id != magicNTFS)
+ goto not_ntfs;
+ /* Check bytes per sector value is between 256 and 4096. */
+ if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 ||
+ le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000)
+ goto not_ntfs;
+ /* Check sectors per cluster value is valid. */
+ switch (b->bpb.sectors_per_cluster) {
+ case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128:
+ break;
+ default:
+ goto not_ntfs;
+ }
+ /* Check the cluster size is not above the maximum (64kiB). */
+ if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) *
+ b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE)
+ goto not_ntfs;
+ /* Check reserved/unused fields are really zero. */
+ if (le16_to_cpu(b->bpb.reserved_sectors) ||
+ le16_to_cpu(b->bpb.root_entries) ||
+ le16_to_cpu(b->bpb.sectors) ||
+ le16_to_cpu(b->bpb.sectors_per_fat) ||
+ le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats)
+ goto not_ntfs;
+ /* Check clusters per file mft record value is valid. */
+ if ((u8)b->clusters_per_mft_record < 0xe1 ||
+ (u8)b->clusters_per_mft_record > 0xf7)
+ switch (b->clusters_per_mft_record) {
+ case 1: case 2: case 4: case 8: case 16: case 32: case 64:
+ break;
+ default:
+ goto not_ntfs;
+ }
+ /* Check clusters per index block value is valid. */
+ if ((u8)b->clusters_per_index_record < 0xe1 ||
+ (u8)b->clusters_per_index_record > 0xf7)
+ switch (b->clusters_per_index_record) {
+ case 1: case 2: case 4: case 8: case 16: case 32: case 64:
+ break;
+ default:
+ goto not_ntfs;
+ }
+ /*
+ * Check for valid end of sector marker. We will work without it, but
+ * many BIOSes will refuse to boot from a bootsector if the magic is
+ * incorrect, so we emit a warning.
+ */
+ if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
+ ntfs_warning(sb, "Invalid end of sector marker.");
+ return true;
+not_ntfs:
+ return false;
+}
+
+/**
+ * read_ntfs_boot_sector - read the NTFS boot sector of a device
+ * @sb: super block of device to read the boot sector from
+ * @silent: if true, suppress all output
+ *
+ * Reads the boot sector from the device and validates it. If that fails, tries
+ * to read the backup boot sector, first from the end of the device a-la NT4 and
+ * later and then from the middle of the device a-la NT3.51 and before.
+ *
+ * If a valid boot sector is found but it is not the primary boot sector, we
+ * repair the primary boot sector silently (unless the device is read-only or
+ * the primary boot sector is not accessible).
+ *
+ * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super
+ * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized
+ * to their respective values.
+ *
+ * Return the unlocked buffer head containing the boot sector or NULL on error.
+ */
+static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
+ const int silent)
+{
+ const char *read_err_str = "Unable to read %s boot sector.";
+ struct buffer_head *bh_primary, *bh_backup;
+ sector_t nr_blocks = NTFS_SB(sb)->nr_blocks;
+
+ /* Try to read primary boot sector. */
+ if ((bh_primary = sb_bread(sb, 0))) {
+ if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
+ bh_primary->b_data, silent))
+ return bh_primary;
+ if (!silent)
+ ntfs_error(sb, "Primary boot sector is invalid.");
+ } else if (!silent)
+ ntfs_error(sb, read_err_str, "primary");
+ if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) {
+ if (bh_primary)
+ brelse(bh_primary);
+ if (!silent)
+ ntfs_error(sb, "Mount option errors=recover not used. "
+ "Aborting without trying to recover.");
+ return NULL;
+ }
+ /* Try to read NT4+ backup boot sector. */
+ if ((bh_backup = sb_bread(sb, nr_blocks - 1))) {
+ if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
+ bh_backup->b_data, silent))
+ goto hotfix_primary_boot_sector;
+ brelse(bh_backup);
+ } else if (!silent)
+ ntfs_error(sb, read_err_str, "backup");
+ /* Try to read NT3.51- backup boot sector. */
+ if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) {
+ if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
+ bh_backup->b_data, silent))
+ goto hotfix_primary_boot_sector;
+ if (!silent)
+ ntfs_error(sb, "Could not find a valid backup boot "
+ "sector.");
+ brelse(bh_backup);
+ } else if (!silent)
+ ntfs_error(sb, read_err_str, "backup");
+ /* We failed. Cleanup and return. */
+ if (bh_primary)
+ brelse(bh_primary);
+ return NULL;
+hotfix_primary_boot_sector:
+ if (bh_primary) {
+ /*
+ * If we managed to read sector zero and the volume is not
+ * read-only, copy the found, valid backup boot sector to the
+ * primary boot sector. Note we only copy the actual boot
+ * sector structure, not the actual whole device sector as that
+ * may be bigger and would potentially damage the $Boot system
+ * file (FIXME: Would be nice to know if the backup boot sector
+ * on a large sector device contains the whole boot loader or
+ * just the first 512 bytes).
+ */
+ if (!sb_rdonly(sb)) {
+ ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
+ "boot sector from backup copy.");
+ memcpy(bh_primary->b_data, bh_backup->b_data,
+ NTFS_BLOCK_SIZE);
+ mark_buffer_dirty(bh_primary);
+ sync_dirty_buffer(bh_primary);
+ if (buffer_uptodate(bh_primary)) {
+ brelse(bh_backup);
+ return bh_primary;
+ }
+ ntfs_error(sb, "Hot-fix: Device write error while "
+ "recovering primary boot sector.");
+ } else {
+ ntfs_warning(sb, "Hot-fix: Recovery of primary boot "
+ "sector failed: Read-only mount.");
+ }
+ brelse(bh_primary);
+ }
+ ntfs_warning(sb, "Using backup boot sector.");
+ return bh_backup;
+}
+
+/**
+ * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol
+ * @vol: volume structure to initialise with data from boot sector
+ * @b: boot sector to parse
+ *
+ * Parse the ntfs boot sector @b and store all imporant information therein in
+ * the ntfs super block @vol. Return 'true' on success and 'false' on error.
+ */
+static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
+{
+ unsigned int sectors_per_cluster_bits, nr_hidden_sects;
+ int clusters_per_mft_record, clusters_per_index_record;
+ s64 ll;
+
+ vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector);
+ vol->sector_size_bits = ffs(vol->sector_size) - 1;
+ ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size,
+ vol->sector_size);
+ ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
+ vol->sector_size_bits);
+ if (vol->sector_size < vol->sb->s_blocksize) {
+ ntfs_error(vol->sb, "Sector size (%i) is smaller than the "
+ "device block size (%lu). This is not "
+ "supported. Sorry.", vol->sector_size,
+ vol->sb->s_blocksize);
+ return false;
+ }
+ ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
+ sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
+ ntfs_debug("sectors_per_cluster_bits = 0x%x",
+ sectors_per_cluster_bits);
+ nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors);
+ ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects);
+ vol->cluster_size = vol->sector_size << sectors_per_cluster_bits;
+ vol->cluster_size_mask = vol->cluster_size - 1;
+ vol->cluster_size_bits = ffs(vol->cluster_size) - 1;
+ ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
+ vol->cluster_size);
+ ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
+ ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits);
+ if (vol->cluster_size < vol->sector_size) {
+ ntfs_error(vol->sb, "Cluster size (%i) is smaller than the "
+ "sector size (%i). This is not supported. "
+ "Sorry.", vol->cluster_size, vol->sector_size);
+ return false;
+ }
+ clusters_per_mft_record = b->clusters_per_mft_record;
+ ntfs_debug("clusters_per_mft_record = %i (0x%x)",
+ clusters_per_mft_record, clusters_per_mft_record);
+ if (clusters_per_mft_record > 0)
+ vol->mft_record_size = vol->cluster_size <<
+ (ffs(clusters_per_mft_record) - 1);
+ else
+ /*
+ * When mft_record_size < cluster_size, clusters_per_mft_record
+ * = -log2(mft_record_size) bytes. mft_record_size normaly is
+ * 1024 bytes, which is encoded as 0xF6 (-10 in decimal).
+ */
+ vol->mft_record_size = 1 << -clusters_per_mft_record;
+ vol->mft_record_size_mask = vol->mft_record_size - 1;
+ vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1;
+ ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size,
+ vol->mft_record_size);
+ ntfs_debug("vol->mft_record_size_mask = 0x%x",
+ vol->mft_record_size_mask);
+ ntfs_debug("vol->mft_record_size_bits = %i (0x%x)",
+ vol->mft_record_size_bits, vol->mft_record_size_bits);
+ /*
+ * We cannot support mft record sizes above the PAGE_SIZE since
+ * we store $MFT/$DATA, the table of mft records in the page cache.
+ */
+ if (vol->mft_record_size > PAGE_SIZE) {
+ ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
+ "PAGE_SIZE on your system (%lu). "
+ "This is not supported. Sorry.",
+ vol->mft_record_size, PAGE_SIZE);
+ return false;
+ }
+ /* We cannot support mft record sizes below the sector size. */
+ if (vol->mft_record_size < vol->sector_size) {
+ ntfs_error(vol->sb, "Mft record size (%i) is smaller than the "
+ "sector size (%i). This is not supported. "
+ "Sorry.", vol->mft_record_size,
+ vol->sector_size);
+ return false;
+ }
+ clusters_per_index_record = b->clusters_per_index_record;
+ ntfs_debug("clusters_per_index_record = %i (0x%x)",
+ clusters_per_index_record, clusters_per_index_record);
+ if (clusters_per_index_record > 0)
+ vol->index_record_size = vol->cluster_size <<
+ (ffs(clusters_per_index_record) - 1);
+ else
+ /*
+ * When index_record_size < cluster_size,
+ * clusters_per_index_record = -log2(index_record_size) bytes.
+ * index_record_size normaly equals 4096 bytes, which is
+ * encoded as 0xF4 (-12 in decimal).
+ */
+ vol->index_record_size = 1 << -clusters_per_index_record;
+ vol->index_record_size_mask = vol->index_record_size - 1;
+ vol->index_record_size_bits = ffs(vol->index_record_size) - 1;
+ ntfs_debug("vol->index_record_size = %i (0x%x)",
+ vol->index_record_size, vol->index_record_size);
+ ntfs_debug("vol->index_record_size_mask = 0x%x",
+ vol->index_record_size_mask);
+ ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
+ vol->index_record_size_bits,
+ vol->index_record_size_bits);
+ /* We cannot support index record sizes below the sector size. */
+ if (vol->index_record_size < vol->sector_size) {
+ ntfs_error(vol->sb, "Index record size (%i) is smaller than "
+ "the sector size (%i). This is not "
+ "supported. Sorry.", vol->index_record_size,
+ vol->sector_size);
+ return false;
+ }
+ /*
+ * Get the size of the volume in clusters and check for 64-bit-ness.
+ * Windows currently only uses 32 bits to save the clusters so we do
+ * the same as it is much faster on 32-bit CPUs.
+ */
+ ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits;
+ if ((u64)ll >= 1ULL << 32) {
+ ntfs_error(vol->sb, "Cannot handle 64-bit clusters. Sorry.");
+ return false;
+ }
+ vol->nr_clusters = ll;
+ ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters);
+ /*
+ * On an architecture where unsigned long is 32-bits, we restrict the
+ * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler
+ * will hopefully optimize the whole check away.
+ */
+ if (sizeof(unsigned long) < 8) {
+ if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) {
+ ntfs_error(vol->sb, "Volume size (%lluTiB) is too "
+ "large for this architecture. "
+ "Maximum supported is 2TiB. Sorry.",
+ (unsigned long long)ll >> (40 -
+ vol->cluster_size_bits));
+ return false;
+ }
+ }
+ ll = sle64_to_cpu(b->mft_lcn);
+ if (ll >= vol->nr_clusters) {
+ ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of "
+ "volume. Weird.", (unsigned long long)ll,
+ (unsigned long long)ll);
+ return false;
+ }
+ vol->mft_lcn = ll;
+ ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
+ ll = sle64_to_cpu(b->mftmirr_lcn);
+ if (ll >= vol->nr_clusters) {
+ ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end "
+ "of volume. Weird.", (unsigned long long)ll,
+ (unsigned long long)ll);
+ return false;
+ }
+ vol->mftmirr_lcn = ll;
+ ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn);
+#ifdef NTFS_RW
+ /*
+ * Work out the size of the mft mirror in number of mft records. If the
+ * cluster size is less than or equal to the size taken by four mft
+ * records, the mft mirror stores the first four mft records. If the
+ * cluster size is bigger than the size taken by four mft records, the
+ * mft mirror contains as many mft records as will fit into one
+ * cluster.
+ */
+ if (vol->cluster_size <= (4 << vol->mft_record_size_bits))
+ vol->mftmirr_size = 4;
+ else
+ vol->mftmirr_size = vol->cluster_size >>
+ vol->mft_record_size_bits;
+ ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size);
+#endif /* NTFS_RW */
+ vol->serial_no = le64_to_cpu(b->volume_serial_number);
+ ntfs_debug("vol->serial_no = 0x%llx",
+ (unsigned long long)vol->serial_no);
+ return true;
+}
+
+/**
+ * ntfs_setup_allocators - initialize the cluster and mft allocators
+ * @vol: volume structure for which to setup the allocators
+ *
+ * Setup the cluster (lcn) and mft allocators to the starting values.
+ */
+static void ntfs_setup_allocators(ntfs_volume *vol)
+{
+#ifdef NTFS_RW
+ LCN mft_zone_size, mft_lcn;
+#endif /* NTFS_RW */
+
+ ntfs_debug("vol->mft_zone_multiplier = 0x%x",
+ vol->mft_zone_multiplier);
+#ifdef NTFS_RW
+ /* Determine the size of the MFT zone. */
+ mft_zone_size = vol->nr_clusters;
+ switch (vol->mft_zone_multiplier) { /* % of volume size in clusters */
+ case 4:
+ mft_zone_size >>= 1; /* 50% */
+ break;
+ case 3:
+ mft_zone_size = (mft_zone_size +
+ (mft_zone_size >> 1)) >> 2; /* 37.5% */
+ break;
+ case 2:
+ mft_zone_size >>= 2; /* 25% */
+ break;
+ /* case 1: */
+ default:
+ mft_zone_size >>= 3; /* 12.5% */
+ break;
+ }
+ /* Setup the mft zone. */
+ vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn;
+ ntfs_debug("vol->mft_zone_pos = 0x%llx",
+ (unsigned long long)vol->mft_zone_pos);
+ /*
+ * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs
+ * source) and if the actual mft_lcn is in the expected place or even
+ * further to the front of the volume, extend the mft_zone to cover the
+ * beginning of the volume as well. This is in order to protect the
+ * area reserved for the mft bitmap as well within the mft_zone itself.
+ * On non-standard volumes we do not protect it as the overhead would
+ * be higher than the speed increase we would get by doing it.
+ */
+ mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size;
+ if (mft_lcn * vol->cluster_size < 16 * 1024)
+ mft_lcn = (16 * 1024 + vol->cluster_size - 1) /
+ vol->cluster_size;
+ if (vol->mft_zone_start <= mft_lcn)
+ vol->mft_zone_start = 0;
+ ntfs_debug("vol->mft_zone_start = 0x%llx",
+ (unsigned long long)vol->mft_zone_start);
+ /*
+ * Need to cap the mft zone on non-standard volumes so that it does
+ * not point outside the boundaries of the volume. We do this by
+ * halving the zone size until we are inside the volume.
+ */
+ vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
+ while (vol->mft_zone_end >= vol->nr_clusters) {
+ mft_zone_size >>= 1;
+ vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
+ }
+ ntfs_debug("vol->mft_zone_end = 0x%llx",
+ (unsigned long long)vol->mft_zone_end);
+ /*
+ * Set the current position within each data zone to the start of the
+ * respective zone.
+ */
+ vol->data1_zone_pos = vol->mft_zone_end;
+ ntfs_debug("vol->data1_zone_pos = 0x%llx",
+ (unsigned long long)vol->data1_zone_pos);
+ vol->data2_zone_pos = 0;
+ ntfs_debug("vol->data2_zone_pos = 0x%llx",
+ (unsigned long long)vol->data2_zone_pos);
+
+ /* Set the mft data allocation position to mft record 24. */
+ vol->mft_data_pos = 24;
+ ntfs_debug("vol->mft_data_pos = 0x%llx",
+ (unsigned long long)vol->mft_data_pos);
+#endif /* NTFS_RW */
+}
+
+#ifdef NTFS_RW
+
+/**
+ * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume
+ * @vol: ntfs super block describing device whose mft mirror to load
+ *
+ * Return 'true' on success or 'false' on error.
+ */
+static bool load_and_init_mft_mirror(ntfs_volume *vol)
+{
+ struct inode *tmp_ino;
+ ntfs_inode *tmp_ni;
+
+ ntfs_debug("Entering.");
+ /* Get mft mirror inode. */
+ tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr);
+ if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
+ if (!IS_ERR(tmp_ino))
+ iput(tmp_ino);
+ /* Caller will display error message. */
+ return false;
+ }
+ /*
+ * Re-initialize some specifics about $MFTMirr's inode as
+ * ntfs_read_inode() will have set up the default ones.
+ */
+ /* Set uid and gid to root. */
+ tmp_ino->i_uid = GLOBAL_ROOT_UID;
+ tmp_ino->i_gid = GLOBAL_ROOT_GID;
+ /* Regular file. No access for anyone. */
+ tmp_ino->i_mode = S_IFREG;
+ /* No VFS initiated operations allowed for $MFTMirr. */
+ tmp_ino->i_op = &ntfs_empty_inode_ops;
+ tmp_ino->i_fop = &ntfs_empty_file_ops;
+ /* Put in our special address space operations. */
+ tmp_ino->i_mapping->a_ops = &ntfs_mst_aops;
+ tmp_ni = NTFS_I(tmp_ino);
+ /* The $MFTMirr, like the $MFT is multi sector transfer protected. */
+ NInoSetMstProtected(tmp_ni);
+ NInoSetSparseDisabled(tmp_ni);
+ /*
+ * Set up our little cheat allowing us to reuse the async read io
+ * completion handler for directories.
+ */
+ tmp_ni->itype.index.block_size = vol->mft_record_size;
+ tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits;
+ vol->mftmirr_ino = tmp_ino;
+ ntfs_debug("Done.");
+ return true;
+}
+
+/**
+ * check_mft_mirror - compare contents of the mft mirror with the mft
+ * @vol: ntfs super block describing device whose mft mirror to check
+ *
+ * Return 'true' on success or 'false' on error.
+ *
+ * Note, this function also results in the mft mirror runlist being completely
+ * mapped into memory. The mft mirror write code requires this and will BUG()
+ * should it find an unmapped runlist element.
+ */
+static bool check_mft_mirror(ntfs_volume *vol)
+{
+ struct super_block *sb = vol->sb;
+ ntfs_inode *mirr_ni;
+ struct page *mft_page, *mirr_page;
+ u8 *kmft, *kmirr;
+ runlist_element *rl, rl2[2];
+ pgoff_t index;
+ int mrecs_per_page, i;
+
+ ntfs_debug("Entering.");
+ /* Compare contents of $MFT and $MFTMirr. */
+ mrecs_per_page = PAGE_SIZE / vol->mft_record_size;
+ BUG_ON(!mrecs_per_page);
+ BUG_ON(!vol->mftmirr_size);
+ mft_page = mirr_page = NULL;
+ kmft = kmirr = NULL;
+ index = i = 0;
+ do {
+ u32 bytes;
+
+ /* Switch pages if necessary. */
+ if (!(i % mrecs_per_page)) {
+ if (index) {
+ ntfs_unmap_page(mft_page);
+ ntfs_unmap_page(mirr_page);
+ }
+ /* Get the $MFT page. */
+ mft_page = ntfs_map_page(vol->mft_ino->i_mapping,
+ index);
+ if (IS_ERR(mft_page)) {
+ ntfs_error(sb, "Failed to read $MFT.");
+ return false;
+ }
+ kmft = page_address(mft_page);
+ /* Get the $MFTMirr page. */
+ mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping,
+ index);
+ if (IS_ERR(mirr_page)) {
+ ntfs_error(sb, "Failed to read $MFTMirr.");
+ goto mft_unmap_out;
+ }
+ kmirr = page_address(mirr_page);
+ ++index;
+ }
+ /* Do not check the record if it is not in use. */
+ if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) {
+ /* Make sure the record is ok. */
+ if (ntfs_is_baad_recordp((le32*)kmft)) {
+ ntfs_error(sb, "Incomplete multi sector "
+ "transfer detected in mft "
+ "record %i.", i);
+mm_unmap_out:
+ ntfs_unmap_page(mirr_page);
+mft_unmap_out:
+ ntfs_unmap_page(mft_page);
+ return false;
+ }
+ }
+ /* Do not check the mirror record if it is not in use. */
+ if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) {
+ if (ntfs_is_baad_recordp((le32*)kmirr)) {
+ ntfs_error(sb, "Incomplete multi sector "
+ "transfer detected in mft "
+ "mirror record %i.", i);
+ goto mm_unmap_out;
+ }
+ }
+ /* Get the amount of data in the current record. */
+ bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use);
+ if (bytes < sizeof(MFT_RECORD_OLD) ||
+ bytes > vol->mft_record_size ||
+ ntfs_is_baad_recordp((le32*)kmft)) {
+ bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use);
+ if (bytes < sizeof(MFT_RECORD_OLD) ||
+ bytes > vol->mft_record_size ||
+ ntfs_is_baad_recordp((le32*)kmirr))
+ bytes = vol->mft_record_size;
+ }
+ /* Compare the two records. */
+ if (memcmp(kmft, kmirr, bytes)) {
+ ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not "
+ "match. Run ntfsfix or chkdsk.", i);
+ goto mm_unmap_out;
+ }
+ kmft += vol->mft_record_size;
+ kmirr += vol->mft_record_size;
+ } while (++i < vol->mftmirr_size);
+ /* Release the last pages. */
+ ntfs_unmap_page(mft_page);
+ ntfs_unmap_page(mirr_page);
+
+ /* Construct the mft mirror runlist by hand. */
+ rl2[0].vcn = 0;
+ rl2[0].lcn = vol->mftmirr_lcn;
+ rl2[0].length = (vol->mftmirr_size * vol->mft_record_size +
+ vol->cluster_size - 1) / vol->cluster_size;
+ rl2[1].vcn = rl2[0].length;
+ rl2[1].lcn = LCN_ENOENT;
+ rl2[1].length = 0;
+ /*
+ * Because we have just read all of the mft mirror, we know we have
+ * mapped the full runlist for it.
+ */
+ mirr_ni = NTFS_I(vol->mftmirr_ino);
+ down_read(&mirr_ni->runlist.lock);
+ rl = mirr_ni->runlist.rl;
+ /* Compare the two runlists. They must be identical. */
+ i = 0;
+ do {
+ if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn ||
+ rl2[i].length != rl[i].length) {
+ ntfs_error(sb, "$MFTMirr location mismatch. "
+ "Run chkdsk.");
+ up_read(&mirr_ni->runlist.lock);
+ return false;
+ }
+ } while (rl2[i++].length);
+ up_read(&mirr_ni->runlist.lock);
+ ntfs_debug("Done.");
+ return true;
+}
+
+/**
+ * load_and_check_logfile - load and check the logfile inode for a volume
+ * @vol: ntfs super block describing device whose logfile to load
+ *
+ * Return 'true' on success or 'false' on error.
+ */
+static bool load_and_check_logfile(ntfs_volume *vol,
+ RESTART_PAGE_HEADER **rp)
+{
+ struct inode *tmp_ino;
+
+ ntfs_debug("Entering.");
+ tmp_ino = ntfs_iget(vol->sb, FILE_LogFile);
+ if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
+ if (!IS_ERR(tmp_ino))
+ iput(tmp_ino);
+ /* Caller will display error message. */
+ return false;
+ }
+ if (!ntfs_check_logfile(tmp_ino, rp)) {
+ iput(tmp_ino);
+ /* ntfs_check_logfile() will have displayed error output. */
+ return false;
+ }
+ NInoSetSparseDisabled(NTFS_I(tmp_ino));
+ vol->logfile_ino = tmp_ino;
+ ntfs_debug("Done.");
+ return true;
+}
+
+#define NTFS_HIBERFIL_HEADER_SIZE 4096
+
+/**
+ * check_windows_hibernation_status - check if Windows is suspended on a volume
+ * @vol: ntfs super block of device to check
+ *
+ * Check if Windows is hibernated on the ntfs volume @vol. This is done by
+ * looking for the file hiberfil.sys in the root directory of the volume. If
+ * the file is not present Windows is definitely not suspended.
+ *
+ * If hiberfil.sys exists and is less than 4kiB in size it means Windows is
+ * definitely suspended (this volume is not the system volume). Caveat: on a
+ * system with many volumes it is possible that the < 4kiB check is bogus but
+ * for now this should do fine.
+ *
+ * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the
+ * hiberfil header (which is the first 4kiB). If this begins with "hibr",
+ * Windows is definitely suspended. If it is completely full of zeroes,
+ * Windows is definitely not hibernated. Any other case is treated as if
+ * Windows is suspended. This caters for the above mentioned caveat of a
+ * system with many volumes where no "hibr" magic would be present and there is
+ * no zero header.
+ *
+ * Return 0 if Windows is not hibernated on the volume, >0 if Windows is
+ * hibernated on the volume, and -errno on error.
+ */
+static int check_windows_hibernation_status(ntfs_volume *vol)
+{
+ MFT_REF mref;
+ struct inode *vi;
+ struct page *page;
+ u32 *kaddr, *kend;
+ ntfs_name *name = NULL;
+ int ret = 1;
+ static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
+ cpu_to_le16('i'), cpu_to_le16('b'),
+ cpu_to_le16('e'), cpu_to_le16('r'),
+ cpu_to_le16('f'), cpu_to_le16('i'),
+ cpu_to_le16('l'), cpu_to_le16('.'),
+ cpu_to_le16('s'), cpu_to_le16('y'),
+ cpu_to_le16('s'), 0 };
+
+ ntfs_debug("Entering.");
+ /*
+ * Find the inode number for the hibernation file by looking up the
+ * filename hiberfil.sys in the root directory.
+ */
+ inode_lock(vol->root_ino);
+ mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
+ &name);
+ inode_unlock(vol->root_ino);
+ if (IS_ERR_MREF(mref)) {
+ ret = MREF_ERR(mref);
+ /* If the file does not exist, Windows is not hibernated. */
+ if (ret == -ENOENT) {
+ ntfs_debug("hiberfil.sys not present. Windows is not "
+ "hibernated on the volume.");
+ return 0;
+ }
+ /* A real error occurred. */
+ ntfs_error(vol->sb, "Failed to find inode number for "
+ "hiberfil.sys.");
+ return ret;
+ }
+ /* We do not care for the type of match that was found. */
+ kfree(name);
+ /* Get the inode. */
+ vi = ntfs_iget(vol->sb, MREF(mref));
+ if (IS_ERR(vi) || is_bad_inode(vi)) {
+ if (!IS_ERR(vi))
+ iput(vi);
+ ntfs_error(vol->sb, "Failed to load hiberfil.sys.");
+ return IS_ERR(vi) ? PTR_ERR(vi) : -EIO;
+ }
+ if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) {
+ ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx). "
+ "Windows is hibernated on the volume. This "
+ "is not the system volume.", i_size_read(vi));
+ goto iput_out;
+ }
+ page = ntfs_map_page(vi->i_mapping, 0);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to read from hiberfil.sys.");
+ ret = PTR_ERR(page);
+ goto iput_out;
+ }
+ kaddr = (u32*)page_address(page);
+ if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
+ ntfs_debug("Magic \"hibr\" found in hiberfil.sys. Windows is "
+ "hibernated on the volume. This is the "
+ "system volume.");
+ goto unm_iput_out;
+ }
+ kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr);
+ do {
+ if (unlikely(*kaddr)) {
+ ntfs_debug("hiberfil.sys is larger than 4kiB "
+ "(0x%llx), does not contain the "
+ "\"hibr\" magic, and does not have a "
+ "zero header. Windows is hibernated "
+ "on the volume. This is not the "
+ "system volume.", i_size_read(vi));
+ goto unm_iput_out;
+ }
+ } while (++kaddr < kend);
+ ntfs_debug("hiberfil.sys contains a zero header. Windows is not "
+ "hibernated on the volume. This is the system "
+ "volume.");
+ ret = 0;
+unm_iput_out:
+ ntfs_unmap_page(page);
+iput_out:
+ iput(vi);
+ return ret;
+}
+
+/**
+ * load_and_init_quota - load and setup the quota file for a volume if present
+ * @vol: ntfs super block describing device whose quota file to load
+ *
+ * Return 'true' on success or 'false' on error. If $Quota is not present, we
+ * leave vol->quota_ino as NULL and return success.
+ */
+static bool load_and_init_quota(ntfs_volume *vol)
+{
+ MFT_REF mref;
+ struct inode *tmp_ino;
+ ntfs_name *name = NULL;
+ static const ntfschar Quota[7] = { cpu_to_le16('$'),
+ cpu_to_le16('Q'), cpu_to_le16('u'),
+ cpu_to_le16('o'), cpu_to_le16('t'),
+ cpu_to_le16('a'), 0 };
+ static ntfschar Q[3] = { cpu_to_le16('$'),
+ cpu_to_le16('Q'), 0 };
+
+ ntfs_debug("Entering.");
+ /*
+ * Find the inode number for the quota file by looking up the filename
+ * $Quota in the extended system files directory $Extend.
+ */
+ inode_lock(vol->extend_ino);
+ mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
+ &name);
+ inode_unlock(vol->extend_ino);
+ if (IS_ERR_MREF(mref)) {
+ /*
+ * If the file does not exist, quotas are disabled and have
+ * never been enabled on this volume, just return success.
+ */
+ if (MREF_ERR(mref) == -ENOENT) {
+ ntfs_debug("$Quota not present. Volume does not have "
+ "quotas enabled.");
+ /*
+ * No need to try to set quotas out of date if they are
+ * not enabled.
+ */
+ NVolSetQuotaOutOfDate(vol);
+ return true;
+ }
+ /* A real error occurred. */
+ ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
+ return false;
+ }
+ /* We do not care for the type of match that was found. */
+ kfree(name);
+ /* Get the inode. */
+ tmp_ino = ntfs_iget(vol->sb, MREF(mref));
+ if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
+ if (!IS_ERR(tmp_ino))
+ iput(tmp_ino);
+ ntfs_error(vol->sb, "Failed to load $Quota.");
+ return false;
+ }
+ vol->quota_ino = tmp_ino;
+ /* Get the $Q index allocation attribute. */
+ tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2);
+ if (IS_ERR(tmp_ino)) {
+ ntfs_error(vol->sb, "Failed to load $Quota/$Q index.");
+ return false;
+ }
+ vol->quota_q_ino = tmp_ino;
+ ntfs_debug("Done.");
+ return true;
+}
+
+/**
+ * load_and_init_usnjrnl - load and setup the transaction log if present
+ * @vol: ntfs super block describing device whose usnjrnl file to load
+ *
+ * Return 'true' on success or 'false' on error.
+ *
+ * If $UsnJrnl is not present or in the process of being disabled, we set
+ * NVolUsnJrnlStamped() and return success.
+ *
+ * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn,
+ * i.e. transaction logging has only just been enabled or the journal has been
+ * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped()
+ * and return success.
+ */
+static bool load_and_init_usnjrnl(ntfs_volume *vol)
+{
+ MFT_REF mref;
+ struct inode *tmp_ino;
+ ntfs_inode *tmp_ni;
+ struct page *page;
+ ntfs_name *name = NULL;
+ USN_HEADER *uh;
+ static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
+ cpu_to_le16('U'), cpu_to_le16('s'),
+ cpu_to_le16('n'), cpu_to_le16('J'),
+ cpu_to_le16('r'), cpu_to_le16('n'),
+ cpu_to_le16('l'), 0 };
+ static ntfschar Max[5] = { cpu_to_le16('$'),
+ cpu_to_le16('M'), cpu_to_le16('a'),
+ cpu_to_le16('x'), 0 };
+ static ntfschar J[3] = { cpu_to_le16('$'),
+ cpu_to_le16('J'), 0 };
+
+ ntfs_debug("Entering.");
+ /*
+ * Find the inode number for the transaction log file by looking up the
+ * filename $UsnJrnl in the extended system files directory $Extend.
+ */
+ inode_lock(vol->extend_ino);
+ mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
+ &name);
+ inode_unlock(vol->extend_ino);
+ if (IS_ERR_MREF(mref)) {
+ /*
+ * If the file does not exist, transaction logging is disabled,
+ * just return success.
+ */
+ if (MREF_ERR(mref) == -ENOENT) {
+ ntfs_debug("$UsnJrnl not present. Volume does not "
+ "have transaction logging enabled.");
+not_enabled:
+ /*
+ * No need to try to stamp the transaction log if
+ * transaction logging is not enabled.
+ */
+ NVolSetUsnJrnlStamped(vol);
+ return true;
+ }
+ /* A real error occurred. */
+ ntfs_error(vol->sb, "Failed to find inode number for "
+ "$UsnJrnl.");
+ return false;
+ }
+ /* We do not care for the type of match that was found. */
+ kfree(name);
+ /* Get the inode. */
+ tmp_ino = ntfs_iget(vol->sb, MREF(mref));
+ if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) {
+ if (!IS_ERR(tmp_ino))
+ iput(tmp_ino);
+ ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
+ return false;
+ }
+ vol->usnjrnl_ino = tmp_ino;
+ /*
+ * If the transaction log is in the process of being deleted, we can
+ * ignore it.
+ */
+ if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) {
+ ntfs_debug("$UsnJrnl in the process of being disabled. "
+ "Volume does not have transaction logging "
+ "enabled.");
+ goto not_enabled;
+ }
+ /* Get the $DATA/$Max attribute. */
+ tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4);
+ if (IS_ERR(tmp_ino)) {
+ ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max "
+ "attribute.");
+ return false;
+ }
+ vol->usnjrnl_max_ino = tmp_ino;
+ if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
+ ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max "
+ "attribute (size is 0x%llx but should be at "
+ "least 0x%zx bytes).", i_size_read(tmp_ino),
+ sizeof(USN_HEADER));
+ return false;
+ }
+ /* Get the $DATA/$J attribute. */
+ tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2);
+ if (IS_ERR(tmp_ino)) {
+ ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J "
+ "attribute.");
+ return false;
+ }
+ vol->usnjrnl_j_ino = tmp_ino;
+ /* Verify $J is non-resident and sparse. */
+ tmp_ni = NTFS_I(vol->usnjrnl_j_ino);
+ if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) {
+ ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident "
+ "and/or not sparse.");
+ return false;
+ }
+ /* Read the USN_HEADER from $DATA/$Max. */
+ page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max "
+ "attribute.");
+ return false;
+ }
+ uh = (USN_HEADER*)page_address(page);
+ /* Sanity check the $Max. */
+ if (unlikely(sle64_to_cpu(uh->allocation_delta) >
+ sle64_to_cpu(uh->maximum_size))) {
+ ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds "
+ "maximum size (0x%llx). $UsnJrnl is corrupt.",
+ (long long)sle64_to_cpu(uh->allocation_delta),
+ (long long)sle64_to_cpu(uh->maximum_size));
+ ntfs_unmap_page(page);
+ return false;
+ }
+ /*
+ * If the transaction log has been stamped and nothing has been written
+ * to it since, we do not need to stamp it.
+ */
+ if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >=
+ i_size_read(vol->usnjrnl_j_ino))) {
+ if (likely(sle64_to_cpu(uh->lowest_valid_usn) ==
+ i_size_read(vol->usnjrnl_j_ino))) {
+ ntfs_unmap_page(page);
+ ntfs_debug("$UsnJrnl is enabled but nothing has been "
+ "logged since it was last stamped. "
+ "Treating this as if the volume does "
+ "not have transaction logging "
+ "enabled.");
+ goto not_enabled;
+ }
+ ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) "
+ "which is out of bounds (0x%llx). $UsnJrnl "
+ "is corrupt.",
+ (long long)sle64_to_cpu(uh->lowest_valid_usn),
+ i_size_read(vol->usnjrnl_j_ino));
+ ntfs_unmap_page(page);
+ return false;
+ }
+ ntfs_unmap_page(page);
+ ntfs_debug("Done.");
+ return true;
+}
+
+/**
+ * load_and_init_attrdef - load the attribute definitions table for a volume
+ * @vol: ntfs super block describing device whose attrdef to load
+ *
+ * Return 'true' on success or 'false' on error.
+ */
+static bool load_and_init_attrdef(ntfs_volume *vol)
+{
+ loff_t i_size;
+ struct super_block *sb = vol->sb;
+ struct inode *ino;
+ struct page *page;
+ pgoff_t index, max_index;
+ unsigned int size;
+
+ ntfs_debug("Entering.");
+ /* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */
+ ino = ntfs_iget(sb, FILE_AttrDef);
+ if (IS_ERR(ino) || is_bad_inode(ino)) {
+ if (!IS_ERR(ino))
+ iput(ino);
+ goto failed;
+ }
+ NInoSetSparseDisabled(NTFS_I(ino));
+ /* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */
+ i_size = i_size_read(ino);
+ if (i_size <= 0 || i_size > 0x7fffffff)
+ goto iput_failed;
+ vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size);
+ if (!vol->attrdef)
+ goto iput_failed;
+ index = 0;
+ max_index = i_size >> PAGE_SHIFT;
+ size = PAGE_SIZE;
+ while (index < max_index) {
+ /* Read the attrdef table and copy it into the linear buffer. */
+read_partial_attrdef_page:
+ page = ntfs_map_page(ino->i_mapping, index);
+ if (IS_ERR(page))
+ goto free_iput_failed;
+ memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT),
+ page_address(page), size);
+ ntfs_unmap_page(page);
+ };
+ if (size == PAGE_SIZE) {
+ size = i_size & ~PAGE_MASK;
+ if (size)
+ goto read_partial_attrdef_page;
+ }
+ vol->attrdef_size = i_size;
+ ntfs_debug("Read %llu bytes from $AttrDef.", i_size);
+ iput(ino);
+ return true;
+free_iput_failed:
+ ntfs_free(vol->attrdef);
+ vol->attrdef = NULL;
+iput_failed:
+ iput(ino);
+failed:
+ ntfs_error(sb, "Failed to initialize attribute definition table.");
+ return false;
+}
+
+#endif /* NTFS_RW */
+
+/**
+ * load_and_init_upcase - load the upcase table for an ntfs volume
+ * @vol: ntfs super block describing device whose upcase to load
+ *
+ * Return 'true' on success or 'false' on error.
+ */
+static bool load_and_init_upcase(ntfs_volume *vol)
+{
+ loff_t i_size;
+ struct super_block *sb = vol->sb;
+ struct inode *ino;
+ struct page *page;
+ pgoff_t index, max_index;
+ unsigned int size;
+ int i, max;
+
+ ntfs_debug("Entering.");
+ /* Read upcase table and setup vol->upcase and vol->upcase_len. */
+ ino = ntfs_iget(sb, FILE_UpCase);
+ if (IS_ERR(ino) || is_bad_inode(ino)) {
+ if (!IS_ERR(ino))
+ iput(ino);
+ goto upcase_failed;
+ }
+ /*
+ * The upcase size must not be above 64k Unicode characters, must not
+ * be zero and must be a multiple of sizeof(ntfschar).
+ */
+ i_size = i_size_read(ino);
+ if (!i_size || i_size & (sizeof(ntfschar) - 1) ||
+ i_size > 64ULL * 1024 * sizeof(ntfschar))
+ goto iput_upcase_failed;
+ vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size);
+ if (!vol->upcase)
+ goto iput_upcase_failed;
+ index = 0;
+ max_index = i_size >> PAGE_SHIFT;
+ size = PAGE_SIZE;
+ while (index < max_index) {
+ /* Read the upcase table and copy it into the linear buffer. */
+read_partial_upcase_page:
+ page = ntfs_map_page(ino->i_mapping, index);
+ if (IS_ERR(page))
+ goto iput_upcase_failed;
+ memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT),
+ page_address(page), size);
+ ntfs_unmap_page(page);
+ };
+ if (size == PAGE_SIZE) {
+ size = i_size & ~PAGE_MASK;
+ if (size)
+ goto read_partial_upcase_page;
+ }
+ vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS;
+ ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).",
+ i_size, 64 * 1024 * sizeof(ntfschar));
+ iput(ino);
+ mutex_lock(&ntfs_lock);
+ if (!default_upcase) {
+ ntfs_debug("Using volume specified $UpCase since default is "
+ "not present.");
+ mutex_unlock(&ntfs_lock);
+ return true;
+ }
+ max = default_upcase_len;
+ if (max > vol->upcase_len)
+ max = vol->upcase_len;
+ for (i = 0; i < max; i++)
+ if (vol->upcase[i] != default_upcase[i])
+ break;
+ if (i == max) {
+ ntfs_free(vol->upcase);
+ vol->upcase = default_upcase;
+ vol->upcase_len = max;
+ ntfs_nr_upcase_users++;
+ mutex_unlock(&ntfs_lock);
+ ntfs_debug("Volume specified $UpCase matches default. Using "
+ "default.");
+ return true;
+ }
+ mutex_unlock(&ntfs_lock);
+ ntfs_debug("Using volume specified $UpCase since it does not match "
+ "the default.");
+ return true;
+iput_upcase_failed:
+ iput(ino);
+ ntfs_free(vol->upcase);
+ vol->upcase = NULL;
+upcase_failed:
+ mutex_lock(&ntfs_lock);
+ if (default_upcase) {
+ vol->upcase = default_upcase;
+ vol->upcase_len = default_upcase_len;
+ ntfs_nr_upcase_users++;
+ mutex_unlock(&ntfs_lock);
+ ntfs_error(sb, "Failed to load $UpCase from the volume. Using "
+ "default.");
+ return true;
+ }
+ mutex_unlock(&ntfs_lock);
+ ntfs_error(sb, "Failed to initialize upcase table.");
+ return false;
+}
+
+/*
+ * The lcn and mft bitmap inodes are NTFS-internal inodes with
+ * their own special locking rules:
+ */
+static struct lock_class_key
+ lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key,
+ mftbmp_runlist_lock_key, mftbmp_mrec_lock_key;
+
+/**
+ * load_system_files - open the system files using normal functions
+ * @vol: ntfs super block describing device whose system files to load
+ *
+ * Open the system files with normal access functions and complete setting up
+ * the ntfs super block @vol.
+ *
+ * Return 'true' on success or 'false' on error.
+ */
+static bool load_system_files(ntfs_volume *vol)
+{
+ struct super_block *sb = vol->sb;
+ MFT_RECORD *m;
+ VOLUME_INFORMATION *vi;
+ ntfs_attr_search_ctx *ctx;
+#ifdef NTFS_RW
+ RESTART_PAGE_HEADER *rp;
+ int err;
+#endif /* NTFS_RW */
+
+ ntfs_debug("Entering.");
+#ifdef NTFS_RW
+ /* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */
+ if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) {
+ static const char *es1 = "Failed to load $MFTMirr";
+ static const char *es2 = "$MFTMirr does not match $MFT";
+ static const char *es3 = ". Run ntfsfix and/or chkdsk.";
+
+ /* If a read-write mount, convert it to a read-only mount. */
+ if (!sb_rdonly(sb)) {
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors="
+ "continue nor on_errors="
+ "remount-ro was specified%s",
+ !vol->mftmirr_ino ? es1 : es2,
+ es3);
+ goto iput_mirr_err_out;
+ }
+ sb->s_flags |= SB_RDONLY;
+ ntfs_error(sb, "%s. Mounting read-only%s",
+ !vol->mftmirr_ino ? es1 : es2, es3);
+ } else
+ ntfs_warning(sb, "%s. Will not be able to remount "
+ "read-write%s",
+ !vol->mftmirr_ino ? es1 : es2, es3);
+ /* This will prevent a read-write remount. */
+ NVolSetErrors(vol);
+ }
+#endif /* NTFS_RW */
+ /* Get mft bitmap attribute inode. */
+ vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0);
+ if (IS_ERR(vol->mftbmp_ino)) {
+ ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute.");
+ goto iput_mirr_err_out;
+ }
+ lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock,
+ &mftbmp_runlist_lock_key);
+ lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock,
+ &mftbmp_mrec_lock_key);
+ /* Read upcase table and setup @vol->upcase and @vol->upcase_len. */
+ if (!load_and_init_upcase(vol))
+ goto iput_mftbmp_err_out;
+#ifdef NTFS_RW
+ /*
+ * Read attribute definitions table and setup @vol->attrdef and
+ * @vol->attrdef_size.
+ */
+ if (!load_and_init_attrdef(vol))
+ goto iput_upcase_err_out;
+#endif /* NTFS_RW */
+ /*
+ * Get the cluster allocation bitmap inode and verify the size, no
+ * need for any locking at this stage as we are already running
+ * exclusively as we are mount in progress task.
+ */
+ vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap);
+ if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) {
+ if (!IS_ERR(vol->lcnbmp_ino))
+ iput(vol->lcnbmp_ino);
+ goto bitmap_failed;
+ }
+ lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock,
+ &lcnbmp_runlist_lock_key);
+ lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock,
+ &lcnbmp_mrec_lock_key);
+
+ NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino));
+ if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) {
+ iput(vol->lcnbmp_ino);
+bitmap_failed:
+ ntfs_error(sb, "Failed to load $Bitmap.");
+ goto iput_attrdef_err_out;
+ }
+ /*
+ * Get the volume inode and setup our cache of the volume flags and
+ * version.
+ */
+ vol->vol_ino = ntfs_iget(sb, FILE_Volume);
+ if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) {
+ if (!IS_ERR(vol->vol_ino))
+ iput(vol->vol_ino);
+volume_failed:
+ ntfs_error(sb, "Failed to load $Volume.");
+ goto iput_lcnbmp_err_out;
+ }
+ m = map_mft_record(NTFS_I(vol->vol_ino));
+ if (IS_ERR(m)) {
+iput_volume_failed:
+ iput(vol->vol_ino);
+ goto volume_failed;
+ }
+ if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) {
+ ntfs_error(sb, "Failed to get attribute search context.");
+ goto get_ctx_vol_failed;
+ }
+ if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
+ ctx) || ctx->attr->non_resident || ctx->attr->flags) {
+err_put_vol:
+ ntfs_attr_put_search_ctx(ctx);
+get_ctx_vol_failed:
+ unmap_mft_record(NTFS_I(vol->vol_ino));
+ goto iput_volume_failed;
+ }
+ vi = (VOLUME_INFORMATION*)((char*)ctx->attr +
+ le16_to_cpu(ctx->attr->data.resident.value_offset));
+ /* Some bounds checks. */
+ if ((u8*)vi < (u8*)ctx->attr || (u8*)vi +
+ le32_to_cpu(ctx->attr->data.resident.value_length) >
+ (u8*)ctx->attr + le32_to_cpu(ctx->attr->length))
+ goto err_put_vol;
+ /* Copy the volume flags and version to the ntfs_volume structure. */
+ vol->vol_flags = vi->flags;
+ vol->major_ver = vi->major_ver;
+ vol->minor_ver = vi->minor_ver;
+ ntfs_attr_put_search_ctx(ctx);
+ unmap_mft_record(NTFS_I(vol->vol_ino));
+ pr_info("volume version %i.%i.\n", vol->major_ver,
+ vol->minor_ver);
+ if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
+ ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
+ "volume version %i.%i (need at least version "
+ "3.0).", vol->major_ver, vol->minor_ver);
+ NVolClearSparseEnabled(vol);
+ }
+#ifdef NTFS_RW
+ /* Make sure that no unsupported volume flags are set. */
+ if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
+ static const char *es1a = "Volume is dirty";
+ static const char *es1b = "Volume has been modified by chkdsk";
+ static const char *es1c = "Volume has unsupported flags set";
+ static const char *es2a = ". Run chkdsk and mount in Windows.";
+ static const char *es2b = ". Mount in Windows.";
+ const char *es1, *es2;
+
+ es2 = es2a;
+ if (vol->vol_flags & VOLUME_IS_DIRTY)
+ es1 = es1a;
+ else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
+ es1 = es1b;
+ es2 = es2b;
+ } else {
+ es1 = es1c;
+ ntfs_warning(sb, "Unsupported volume flags 0x%x "
+ "encountered.",
+ (unsigned)le16_to_cpu(vol->vol_flags));
+ }
+ /* If a read-write mount, convert it to a read-only mount. */
+ if (!sb_rdonly(sb)) {
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors="
+ "continue nor on_errors="
+ "remount-ro was specified%s",
+ es1, es2);
+ goto iput_vol_err_out;
+ }
+ sb->s_flags |= SB_RDONLY;
+ ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
+ } else
+ ntfs_warning(sb, "%s. Will not be able to remount "
+ "read-write%s", es1, es2);
+ /*
+ * Do not set NVolErrors() because ntfs_remount() re-checks the
+ * flags which we need to do in case any flags have changed.
+ */
+ }
+ /*
+ * Get the inode for the logfile, check it and determine if the volume
+ * was shutdown cleanly.
+ */
+ rp = NULL;
+ if (!load_and_check_logfile(vol, &rp) ||
+ !ntfs_is_logfile_clean(vol->logfile_ino, rp)) {
+ static const char *es1a = "Failed to load $LogFile";
+ static const char *es1b = "$LogFile is not clean";
+ static const char *es2 = ". Mount in Windows.";
+ const char *es1;
+
+ es1 = !vol->logfile_ino ? es1a : es1b;
+ /* If a read-write mount, convert it to a read-only mount. */
+ if (!sb_rdonly(sb)) {
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors="
+ "continue nor on_errors="
+ "remount-ro was specified%s",
+ es1, es2);
+ if (vol->logfile_ino) {
+ BUG_ON(!rp);
+ ntfs_free(rp);
+ }
+ goto iput_logfile_err_out;
+ }
+ sb->s_flags |= SB_RDONLY;
+ ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
+ } else
+ ntfs_warning(sb, "%s. Will not be able to remount "
+ "read-write%s", es1, es2);
+ /* This will prevent a read-write remount. */
+ NVolSetErrors(vol);
+ }
+ ntfs_free(rp);
+#endif /* NTFS_RW */
+ /* Get the root directory inode so we can do path lookups. */
+ vol->root_ino = ntfs_iget(sb, FILE_root);
+ if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) {
+ if (!IS_ERR(vol->root_ino))
+ iput(vol->root_ino);
+ ntfs_error(sb, "Failed to load root directory.");
+ goto iput_logfile_err_out;
+ }
+#ifdef NTFS_RW
+ /*
+ * Check if Windows is suspended to disk on the target volume. If it
+ * is hibernated, we must not write *anything* to the disk so set
+ * NVolErrors() without setting the dirty volume flag and mount
+ * read-only. This will prevent read-write remounting and it will also
+ * prevent all writes.
+ */
+ err = check_windows_hibernation_status(vol);
+ if (unlikely(err)) {
+ static const char *es1a = "Failed to determine if Windows is "
+ "hibernated";
+ static const char *es1b = "Windows is hibernated";
+ static const char *es2 = ". Run chkdsk.";
+ const char *es1;
+
+ es1 = err < 0 ? es1a : es1b;
+ /* If a read-write mount, convert it to a read-only mount. */
+ if (!sb_rdonly(sb)) {
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors="
+ "continue nor on_errors="
+ "remount-ro was specified%s",
+ es1, es2);
+ goto iput_root_err_out;
+ }
+ sb->s_flags |= SB_RDONLY;
+ ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
+ } else
+ ntfs_warning(sb, "%s. Will not be able to remount "
+ "read-write%s", es1, es2);
+ /* This will prevent a read-write remount. */
+ NVolSetErrors(vol);
+ }
+ /* If (still) a read-write mount, mark the volume dirty. */
+ if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
+ static const char *es1 = "Failed to set dirty bit in volume "
+ "information flags";
+ static const char *es2 = ". Run chkdsk.";
+
+ /* Convert to a read-only mount. */
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors=continue nor "
+ "on_errors=remount-ro was specified%s",
+ es1, es2);
+ goto iput_root_err_out;
+ }
+ ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
+ sb->s_flags |= SB_RDONLY;
+ /*
+ * Do not set NVolErrors() because ntfs_remount() might manage
+ * to set the dirty flag in which case all would be well.
+ */
+ }
+#if 0
+ // TODO: Enable this code once we start modifying anything that is
+ // different between NTFS 1.2 and 3.x...
+ /*
+ * If (still) a read-write mount, set the NT4 compatibility flag on
+ * newer NTFS version volumes.
+ */
+ if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) &&
+ ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
+ static const char *es1 = "Failed to set NT4 compatibility flag";
+ static const char *es2 = ". Run chkdsk.";
+
+ /* Convert to a read-only mount. */
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors=continue nor "
+ "on_errors=remount-ro was specified%s",
+ es1, es2);
+ goto iput_root_err_out;
+ }
+ ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
+ sb->s_flags |= SB_RDONLY;
+ NVolSetErrors(vol);
+ }
+#endif
+ /* If (still) a read-write mount, empty the logfile. */
+ if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) {
+ static const char *es1 = "Failed to empty $LogFile";
+ static const char *es2 = ". Mount in Windows.";
+
+ /* Convert to a read-only mount. */
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors=continue nor "
+ "on_errors=remount-ro was specified%s",
+ es1, es2);
+ goto iput_root_err_out;
+ }
+ ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
+ sb->s_flags |= SB_RDONLY;
+ NVolSetErrors(vol);
+ }
+#endif /* NTFS_RW */
+ /* If on NTFS versions before 3.0, we are done. */
+ if (unlikely(vol->major_ver < 3))
+ return true;
+ /* NTFS 3.0+ specific initialization. */
+ /* Get the security descriptors inode. */
+ vol->secure_ino = ntfs_iget(sb, FILE_Secure);
+ if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) {
+ if (!IS_ERR(vol->secure_ino))
+ iput(vol->secure_ino);
+ ntfs_error(sb, "Failed to load $Secure.");
+ goto iput_root_err_out;
+ }
+ // TODO: Initialize security.
+ /* Get the extended system files' directory inode. */
+ vol->extend_ino = ntfs_iget(sb, FILE_Extend);
+ if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
+ !S_ISDIR(vol->extend_ino->i_mode)) {
+ if (!IS_ERR(vol->extend_ino))
+ iput(vol->extend_ino);
+ ntfs_error(sb, "Failed to load $Extend.");
+ goto iput_sec_err_out;
+ }
+#ifdef NTFS_RW
+ /* Find the quota file, load it if present, and set it up. */
+ if (!load_and_init_quota(vol)) {
+ static const char *es1 = "Failed to load $Quota";
+ static const char *es2 = ". Run chkdsk.";
+
+ /* If a read-write mount, convert it to a read-only mount. */
+ if (!sb_rdonly(sb)) {
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors="
+ "continue nor on_errors="
+ "remount-ro was specified%s",
+ es1, es2);
+ goto iput_quota_err_out;
+ }
+ sb->s_flags |= SB_RDONLY;
+ ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
+ } else
+ ntfs_warning(sb, "%s. Will not be able to remount "
+ "read-write%s", es1, es2);
+ /* This will prevent a read-write remount. */
+ NVolSetErrors(vol);
+ }
+ /* If (still) a read-write mount, mark the quotas out of date. */
+ if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) {
+ static const char *es1 = "Failed to mark quotas out of date";
+ static const char *es2 = ". Run chkdsk.";
+
+ /* Convert to a read-only mount. */
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors=continue nor "
+ "on_errors=remount-ro was specified%s",
+ es1, es2);
+ goto iput_quota_err_out;
+ }
+ ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
+ sb->s_flags |= SB_RDONLY;
+ NVolSetErrors(vol);
+ }
+ /*
+ * Find the transaction log file ($UsnJrnl), load it if present, check
+ * it, and set it up.
+ */
+ if (!load_and_init_usnjrnl(vol)) {
+ static const char *es1 = "Failed to load $UsnJrnl";
+ static const char *es2 = ". Run chkdsk.";
+
+ /* If a read-write mount, convert it to a read-only mount. */
+ if (!sb_rdonly(sb)) {
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors="
+ "continue nor on_errors="
+ "remount-ro was specified%s",
+ es1, es2);
+ goto iput_usnjrnl_err_out;
+ }
+ sb->s_flags |= SB_RDONLY;
+ ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
+ } else
+ ntfs_warning(sb, "%s. Will not be able to remount "
+ "read-write%s", es1, es2);
+ /* This will prevent a read-write remount. */
+ NVolSetErrors(vol);
+ }
+ /* If (still) a read-write mount, stamp the transaction log. */
+ if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) {
+ static const char *es1 = "Failed to stamp transaction log "
+ "($UsnJrnl)";
+ static const char *es2 = ". Run chkdsk.";
+
+ /* Convert to a read-only mount. */
+ if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
+ ON_ERRORS_CONTINUE))) {
+ ntfs_error(sb, "%s and neither on_errors=continue nor "
+ "on_errors=remount-ro was specified%s",
+ es1, es2);
+ goto iput_usnjrnl_err_out;
+ }
+ ntfs_error(sb, "%s. Mounting read-only%s", es1, es2);
+ sb->s_flags |= SB_RDONLY;
+ NVolSetErrors(vol);
+ }
+#endif /* NTFS_RW */
+ return true;
+#ifdef NTFS_RW
+iput_usnjrnl_err_out:
+ iput(vol->usnjrnl_j_ino);
+ iput(vol->usnjrnl_max_ino);
+ iput(vol->usnjrnl_ino);
+iput_quota_err_out:
+ iput(vol->quota_q_ino);
+ iput(vol->quota_ino);
+ iput(vol->extend_ino);
+#endif /* NTFS_RW */
+iput_sec_err_out:
+ iput(vol->secure_ino);
+iput_root_err_out:
+ iput(vol->root_ino);
+iput_logfile_err_out:
+#ifdef NTFS_RW
+ iput(vol->logfile_ino);
+iput_vol_err_out:
+#endif /* NTFS_RW */
+ iput(vol->vol_ino);
+iput_lcnbmp_err_out:
+ iput(vol->lcnbmp_ino);
+iput_attrdef_err_out:
+ vol->attrdef_size = 0;
+ if (vol->attrdef) {
+ ntfs_free(vol->attrdef);
+ vol->attrdef = NULL;
+ }
+#ifdef NTFS_RW
+iput_upcase_err_out:
+#endif /* NTFS_RW */
+ vol->upcase_len = 0;
+ mutex_lock(&ntfs_lock);
+ if (vol->upcase == default_upcase) {
+ ntfs_nr_upcase_users--;
+ vol->upcase = NULL;
+ }
+ mutex_unlock(&ntfs_lock);
+ if (vol->upcase) {
+ ntfs_free(vol->upcase);
+ vol->upcase = NULL;
+ }
+iput_mftbmp_err_out:
+ iput(vol->mftbmp_ino);
+iput_mirr_err_out:
+#ifdef NTFS_RW
+ iput(vol->mftmirr_ino);
+#endif /* NTFS_RW */
+ return false;
+}
+
+/**
+ * ntfs_put_super - called by the vfs to unmount a volume
+ * @sb: vfs superblock of volume to unmount
+ *
+ * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when
+ * the volume is being unmounted (umount system call has been invoked) and it
+ * releases all inodes and memory belonging to the NTFS specific part of the
+ * super block.
+ */
+static void ntfs_put_super(struct super_block *sb)
+{
+ ntfs_volume *vol = NTFS_SB(sb);
+
+ ntfs_debug("Entering.");
+
+#ifdef NTFS_RW
+ /*
+ * Commit all inodes while they are still open in case some of them
+ * cause others to be dirtied.
+ */
+ ntfs_commit_inode(vol->vol_ino);
+
+ /* NTFS 3.0+ specific. */
+ if (vol->major_ver >= 3) {
+ if (vol->usnjrnl_j_ino)
+ ntfs_commit_inode(vol->usnjrnl_j_ino);
+ if (vol->usnjrnl_max_ino)
+ ntfs_commit_inode(vol->usnjrnl_max_ino);
+ if (vol->usnjrnl_ino)
+ ntfs_commit_inode(vol->usnjrnl_ino);
+ if (vol->quota_q_ino)
+ ntfs_commit_inode(vol->quota_q_ino);
+ if (vol->quota_ino)
+ ntfs_commit_inode(vol->quota_ino);
+ if (vol->extend_ino)
+ ntfs_commit_inode(vol->extend_ino);
+ if (vol->secure_ino)
+ ntfs_commit_inode(vol->secure_ino);
+ }
+
+ ntfs_commit_inode(vol->root_ino);
+
+ down_write(&vol->lcnbmp_lock);
+ ntfs_commit_inode(vol->lcnbmp_ino);
+ up_write(&vol->lcnbmp_lock);
+
+ down_write(&vol->mftbmp_lock);
+ ntfs_commit_inode(vol->mftbmp_ino);
+ up_write(&vol->mftbmp_lock);
+
+ if (vol->logfile_ino)
+ ntfs_commit_inode(vol->logfile_ino);
+
+ if (vol->mftmirr_ino)
+ ntfs_commit_inode(vol->mftmirr_ino);
+ ntfs_commit_inode(vol->mft_ino);
+
+ /*
+ * If a read-write mount and no volume errors have occurred, mark the
+ * volume clean. Also, re-commit all affected inodes.
+ */
+ if (!sb_rdonly(sb)) {
+ if (!NVolErrors(vol)) {
+ if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
+ ntfs_warning(sb, "Failed to clear dirty bit "
+ "in volume information "
+ "flags. Run chkdsk.");
+ ntfs_commit_inode(vol->vol_ino);
+ ntfs_commit_inode(vol->root_ino);
+ if (vol->mftmirr_ino)
+ ntfs_commit_inode(vol->mftmirr_ino);
+ ntfs_commit_inode(vol->mft_ino);
+ } else {
+ ntfs_warning(sb, "Volume has errors. Leaving volume "
+ "marked dirty. Run chkdsk.");
+ }
+ }
+#endif /* NTFS_RW */
+
+ iput(vol->vol_ino);
+ vol->vol_ino = NULL;
+
+ /* NTFS 3.0+ specific clean up. */
+ if (vol->major_ver >= 3) {
+#ifdef NTFS_RW
+ if (vol->usnjrnl_j_ino) {
+ iput(vol->usnjrnl_j_ino);
+ vol->usnjrnl_j_ino = NULL;
+ }
+ if (vol->usnjrnl_max_ino) {
+ iput(vol->usnjrnl_max_ino);
+ vol->usnjrnl_max_ino = NULL;
+ }
+ if (vol->usnjrnl_ino) {
+ iput(vol->usnjrnl_ino);
+ vol->usnjrnl_ino = NULL;
+ }
+ if (vol->quota_q_ino) {
+ iput(vol->quota_q_ino);
+ vol->quota_q_ino = NULL;
+ }
+ if (vol->quota_ino) {
+ iput(vol->quota_ino);
+ vol->quota_ino = NULL;
+ }
+#endif /* NTFS_RW */
+ if (vol->extend_ino) {
+ iput(vol->extend_ino);
+ vol->extend_ino = NULL;
+ }
+ if (vol->secure_ino) {
+ iput(vol->secure_ino);
+ vol->secure_ino = NULL;
+ }
+ }
+
+ iput(vol->root_ino);
+ vol->root_ino = NULL;
+
+ down_write(&vol->lcnbmp_lock);
+ iput(vol->lcnbmp_ino);
+ vol->lcnbmp_ino = NULL;
+ up_write(&vol->lcnbmp_lock);
+
+ down_write(&vol->mftbmp_lock);
+ iput(vol->mftbmp_ino);
+ vol->mftbmp_ino = NULL;
+ up_write(&vol->mftbmp_lock);
+
+#ifdef NTFS_RW
+ if (vol->logfile_ino) {
+ iput(vol->logfile_ino);
+ vol->logfile_ino = NULL;
+ }
+ if (vol->mftmirr_ino) {
+ /* Re-commit the mft mirror and mft just in case. */
+ ntfs_commit_inode(vol->mftmirr_ino);
+ ntfs_commit_inode(vol->mft_ino);
+ iput(vol->mftmirr_ino);
+ vol->mftmirr_ino = NULL;
+ }
+ /*
+ * We should have no dirty inodes left, due to
+ * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
+ * the underlying mft records are written out and cleaned.
+ */
+ ntfs_commit_inode(vol->mft_ino);
+ write_inode_now(vol->mft_ino, 1);
+#endif /* NTFS_RW */
+
+ iput(vol->mft_ino);
+ vol->mft_ino = NULL;
+
+ /* Throw away the table of attribute definitions. */
+ vol->attrdef_size = 0;
+ if (vol->attrdef) {
+ ntfs_free(vol->attrdef);
+ vol->attrdef = NULL;
+ }
+ vol->upcase_len = 0;
+ /*
+ * Destroy the global default upcase table if necessary. Also decrease
+ * the number of upcase users if we are a user.
+ */
+ mutex_lock(&ntfs_lock);
+ if (vol->upcase == default_upcase) {
+ ntfs_nr_upcase_users--;
+ vol->upcase = NULL;
+ }
+ if (!ntfs_nr_upcase_users && default_upcase) {
+ ntfs_free(default_upcase);
+ default_upcase = NULL;
+ }
+ if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
+ free_compression_buffers();
+ mutex_unlock(&ntfs_lock);
+ if (vol->upcase) {
+ ntfs_free(vol->upcase);
+ vol->upcase = NULL;
+ }
+
+ unload_nls(vol->nls_map);
+
+ sb->s_fs_info = NULL;
+ kfree(vol);
+}
+
+/**
+ * get_nr_free_clusters - return the number of free clusters on a volume
+ * @vol: ntfs volume for which to obtain free cluster count
+ *
+ * Calculate the number of free clusters on the mounted NTFS volume @vol. We
+ * actually calculate the number of clusters in use instead because this
+ * allows us to not care about partial pages as these will be just zero filled
+ * and hence not be counted as allocated clusters.
+ *
+ * The only particularity is that clusters beyond the end of the logical ntfs
+ * volume will be marked as allocated to prevent errors which means we have to
+ * discount those at the end. This is important as the cluster bitmap always
+ * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside
+ * the logical volume and marked in use when they are not as they do not exist.
+ *
+ * If any pages cannot be read we assume all clusters in the erroring pages are
+ * in use. This means we return an underestimate on errors which is better than
+ * an overestimate.
+ */
+static s64 get_nr_free_clusters(ntfs_volume *vol)
+{
+ s64 nr_free = vol->nr_clusters;
+ struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
+ struct page *page;
+ pgoff_t index, max_index;
+
+ ntfs_debug("Entering.");
+ /* Serialize accesses to the cluster bitmap. */
+ down_read(&vol->lcnbmp_lock);
+ /*
+ * Convert the number of bits into bytes rounded up, then convert into
+ * multiples of PAGE_SIZE, rounding up so that if we have one
+ * full and one partial page max_index = 2.
+ */
+ max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+ /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
+ ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
+ max_index, PAGE_SIZE / 4);
+ for (index = 0; index < max_index; index++) {
+ unsigned long *kaddr;
+
+ /*
+ * Read the page from page cache, getting it from backing store
+ * if necessary, and increment the use count.
+ */
+ page = read_mapping_page(mapping, index, NULL);
+ /* Ignore pages which errored synchronously. */
+ if (IS_ERR(page)) {
+ ntfs_debug("read_mapping_page() error. Skipping "
+ "page (index 0x%lx).", index);
+ nr_free -= PAGE_SIZE * 8;
+ continue;
+ }
+ kaddr = kmap_atomic(page);
+ /*
+ * Subtract the number of set bits. If this
+ * is the last page and it is partial we don't really care as
+ * it just means we do a little extra work but it won't affect
+ * the result as all out of range bytes are set to zero by
+ * ntfs_readpage().
+ */
+ nr_free -= bitmap_weight(kaddr,
+ PAGE_SIZE * BITS_PER_BYTE);
+ kunmap_atomic(kaddr);
+ put_page(page);
+ }
+ ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
+ /*
+ * Fixup for eventual bits outside logical ntfs volume (see function
+ * description above).
+ */
+ if (vol->nr_clusters & 63)
+ nr_free += 64 - (vol->nr_clusters & 63);
+ up_read(&vol->lcnbmp_lock);
+ /* If errors occurred we may well have gone below zero, fix this. */
+ if (nr_free < 0)
+ nr_free = 0;
+ ntfs_debug("Exiting.");
+ return nr_free;
+}
+
+/**
+ * __get_nr_free_mft_records - return the number of free inodes on a volume
+ * @vol: ntfs volume for which to obtain free inode count
+ * @nr_free: number of mft records in filesystem
+ * @max_index: maximum number of pages containing set bits
+ *
+ * Calculate the number of free mft records (inodes) on the mounted NTFS
+ * volume @vol. We actually calculate the number of mft records in use instead
+ * because this allows us to not care about partial pages as these will be just
+ * zero filled and hence not be counted as allocated mft record.
+ *
+ * If any pages cannot be read we assume all mft records in the erroring pages
+ * are in use. This means we return an underestimate on errors which is better
+ * than an overestimate.
+ *
+ * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing.
+ */
+static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
+ s64 nr_free, const pgoff_t max_index)
+{
+ struct address_space *mapping = vol->mftbmp_ino->i_mapping;
+ struct page *page;
+ pgoff_t index;
+
+ ntfs_debug("Entering.");
+ /* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
+ ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
+ "0x%lx.", max_index, PAGE_SIZE / 4);
+ for (index = 0; index < max_index; index++) {
+ unsigned long *kaddr;
+
+ /*
+ * Read the page from page cache, getting it from backing store
+ * if necessary, and increment the use count.
+ */
+ page = read_mapping_page(mapping, index, NULL);
+ /* Ignore pages which errored synchronously. */
+ if (IS_ERR(page)) {
+ ntfs_debug("read_mapping_page() error. Skipping "
+ "page (index 0x%lx).", index);
+ nr_free -= PAGE_SIZE * 8;
+ continue;
+ }
+ kaddr = kmap_atomic(page);
+ /*
+ * Subtract the number of set bits. If this
+ * is the last page and it is partial we don't really care as
+ * it just means we do a little extra work but it won't affect
+ * the result as all out of range bytes are set to zero by
+ * ntfs_readpage().
+ */
+ nr_free -= bitmap_weight(kaddr,
+ PAGE_SIZE * BITS_PER_BYTE);
+ kunmap_atomic(kaddr);
+ put_page(page);
+ }
+ ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
+ index - 1);
+ /* If errors occurred we may well have gone below zero, fix this. */
+ if (nr_free < 0)
+ nr_free = 0;
+ ntfs_debug("Exiting.");
+ return nr_free;
+}
+
+/**
+ * ntfs_statfs - return information about mounted NTFS volume
+ * @dentry: dentry from mounted volume
+ * @sfs: statfs structure in which to return the information
+ *
+ * Return information about the mounted NTFS volume @dentry in the statfs structure
+ * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
+ * called). We interpret the values to be correct of the moment in time at
+ * which we are called. Most values are variable otherwise and this isn't just
+ * the free values but the totals as well. For example we can increase the
+ * total number of file nodes if we run out and we can keep doing this until
+ * there is no more space on the volume left at all.
+ *
+ * Called from vfs_statfs which is used to handle the statfs, fstatfs, and
+ * ustat system calls.
+ *
+ * Return 0 on success or -errno on error.
+ */
+static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
+{
+ struct super_block *sb = dentry->d_sb;
+ s64 size;
+ ntfs_volume *vol = NTFS_SB(sb);
+ ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
+ pgoff_t max_index;
+ unsigned long flags;
+
+ ntfs_debug("Entering.");
+ /* Type of filesystem. */
+ sfs->f_type = NTFS_SB_MAGIC;
+ /* Optimal transfer block size. */
+ sfs->f_bsize = PAGE_SIZE;
+ /*
+ * Total data blocks in filesystem in units of f_bsize and since
+ * inodes are also stored in data blocs ($MFT is a file) this is just
+ * the total clusters.
+ */
+ sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
+ PAGE_SHIFT;
+ /* Free data blocks in filesystem in units of f_bsize. */
+ size = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
+ PAGE_SHIFT;
+ if (size < 0LL)
+ size = 0LL;
+ /* Free blocks avail to non-superuser, same as above on NTFS. */
+ sfs->f_bavail = sfs->f_bfree = size;
+ /* Serialize accesses to the inode bitmap. */
+ down_read(&vol->mftbmp_lock);
+ read_lock_irqsave(&mft_ni->size_lock, flags);
+ size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits;
+ /*
+ * Convert the maximum number of set bits into bytes rounded up, then
+ * convert into multiples of PAGE_SIZE, rounding up so that if we
+ * have one full and one partial page max_index = 2.
+ */
+ max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits)
+ + 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ read_unlock_irqrestore(&mft_ni->size_lock, flags);
+ /* Number of inodes in filesystem (at this point in time). */
+ sfs->f_files = size;
+ /* Free inodes in fs (based on current total count). */
+ sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index);
+ up_read(&vol->mftbmp_lock);
+ /*
+ * File system id. This is extremely *nix flavour dependent and even
+ * within Linux itself all fs do their own thing. I interpret this to
+ * mean a unique id associated with the mounted fs and not the id
+ * associated with the filesystem driver, the latter is already given
+ * by the filesystem type in sfs->f_type. Thus we use the 64-bit
+ * volume serial number splitting it into two 32-bit parts. We enter
+ * the least significant 32-bits in f_fsid[0] and the most significant
+ * 32-bits in f_fsid[1].
+ */
+ sfs->f_fsid = u64_to_fsid(vol->serial_no);
+ /* Maximum length of filenames. */
+ sfs->f_namelen = NTFS_MAX_NAME_LEN;
+ return 0;
+}
+
+#ifdef NTFS_RW
+static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
+{
+ return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
+}
+#endif
+
+/**
+ * The complete super operations.
+ */
+static const struct super_operations ntfs_sops = {
+ .alloc_inode = ntfs_alloc_big_inode, /* VFS: Allocate new inode. */
+ .free_inode = ntfs_free_big_inode, /* VFS: Deallocate inode. */
+#ifdef NTFS_RW
+ .write_inode = ntfs_write_inode, /* VFS: Write dirty inode to
+ disk. */
+#endif /* NTFS_RW */
+ .put_super = ntfs_put_super, /* Syscall: umount. */
+ .statfs = ntfs_statfs, /* Syscall: statfs */
+ .remount_fs = ntfs_remount, /* Syscall: mount -o remount. */
+ .evict_inode = ntfs_evict_big_inode, /* VFS: Called when an inode is
+ removed from memory. */
+ .show_options = ntfs_show_options, /* Show mount options in
+ proc. */
+};
+
+/**
+ * ntfs_fill_super - mount an ntfs filesystem
+ * @sb: super block of ntfs filesystem to mount
+ * @opt: string containing the mount options
+ * @silent: silence error output
+ *
+ * ntfs_fill_super() is called by the VFS to mount the device described by @sb
+ * with the mount otions in @data with the NTFS filesystem.
+ *
+ * If @silent is true, remain silent even if errors are detected. This is used
+ * during bootup, when the kernel tries to mount the root filesystem with all
+ * registered filesystems one after the other until one succeeds. This implies
+ * that all filesystems except the correct one will quite correctly and
+ * expectedly return an error, but nobody wants to see error messages when in
+ * fact this is what is supposed to happen.
+ *
+ * NOTE: @sb->s_flags contains the mount options flags.
+ */
+static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
+{
+ ntfs_volume *vol;
+ struct buffer_head *bh;
+ struct inode *tmp_ino;
+ int blocksize, result;
+
+ /*
+ * We do a pretty difficult piece of bootstrap by reading the
+ * MFT (and other metadata) from disk into memory. We'll only
+ * release this metadata during umount, so the locking patterns
+ * observed during bootstrap do not count. So turn off the
+ * observation of locking patterns (strictly for this context
+ * only) while mounting NTFS. [The validator is still active
+ * otherwise, even for this context: it will for example record
+ * lock class registrations.]
+ */
+ lockdep_off();
+ ntfs_debug("Entering.");
+#ifndef NTFS_RW
+ sb->s_flags |= SB_RDONLY;
+#endif /* ! NTFS_RW */
+ /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
+ sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
+ vol = NTFS_SB(sb);
+ if (!vol) {
+ if (!silent)
+ ntfs_error(sb, "Allocation of NTFS volume structure "
+ "failed. Aborting mount...");
+ lockdep_on();
+ return -ENOMEM;
+ }
+ /* Initialize ntfs_volume structure. */
+ *vol = (ntfs_volume) {
+ .sb = sb,
+ /*
+ * Default is group and other don't have any access to files or
+ * directories while owner has full access. Further, files by
+ * default are not executable but directories are of course
+ * browseable.
+ */
+ .fmask = 0177,
+ .dmask = 0077,
+ };
+ init_rwsem(&vol->mftbmp_lock);
+ init_rwsem(&vol->lcnbmp_lock);
+
+ /* By default, enable sparse support. */
+ NVolSetSparseEnabled(vol);
+
+ /* Important to get the mount options dealt with now. */
+ if (!parse_options(vol, (char*)opt))
+ goto err_out_now;
+
+ /* We support sector sizes up to the PAGE_SIZE. */
+ if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) {
+ if (!silent)
+ ntfs_error(sb, "Device has unsupported sector size "
+ "(%i). The maximum supported sector "
+ "size on this architecture is %lu "
+ "bytes.",
+ bdev_logical_block_size(sb->s_bdev),
+ PAGE_SIZE);
+ goto err_out_now;
+ }
+ /*
+ * Setup the device access block size to NTFS_BLOCK_SIZE or the hard
+ * sector size, whichever is bigger.
+ */
+ blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE);
+ if (blocksize < NTFS_BLOCK_SIZE) {
+ if (!silent)
+ ntfs_error(sb, "Unable to set device block size.");
+ goto err_out_now;
+ }
+ BUG_ON(blocksize != sb->s_blocksize);
+ ntfs_debug("Set device block size to %i bytes (block size bits %i).",
+ blocksize, sb->s_blocksize_bits);
+ /* Determine the size of the device in units of block_size bytes. */
+ vol->nr_blocks = sb_bdev_nr_blocks(sb);
+ if (!vol->nr_blocks) {
+ if (!silent)
+ ntfs_error(sb, "Unable to determine device size.");
+ goto err_out_now;
+ }
+ /* Read the boot sector and return unlocked buffer head to it. */
+ if (!(bh = read_ntfs_boot_sector(sb, silent))) {
+ if (!silent)
+ ntfs_error(sb, "Not an NTFS volume.");
+ goto err_out_now;
+ }
+ /*
+ * Extract the data from the boot sector and setup the ntfs volume
+ * using it.
+ */
+ result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
+ brelse(bh);
+ if (!result) {
+ if (!silent)
+ ntfs_error(sb, "Unsupported NTFS filesystem.");
+ goto err_out_now;
+ }
+ /*
+ * If the boot sector indicates a sector size bigger than the current
+ * device block size, switch the device block size to the sector size.
+ * TODO: It may be possible to support this case even when the set
+ * below fails, we would just be breaking up the i/o for each sector
+ * into multiple blocks for i/o purposes but otherwise it should just
+ * work. However it is safer to leave disabled until someone hits this
+ * error message and then we can get them to try it without the setting
+ * so we know for sure that it works.
+ */
+ if (vol->sector_size > blocksize) {
+ blocksize = sb_set_blocksize(sb, vol->sector_size);
+ if (blocksize != vol->sector_size) {
+ if (!silent)
+ ntfs_error(sb, "Unable to set device block "
+ "size to sector size (%i).",
+ vol->sector_size);
+ goto err_out_now;
+ }
+ BUG_ON(blocksize != sb->s_blocksize);
+ vol->nr_blocks = sb_bdev_nr_blocks(sb);
+ ntfs_debug("Changed device block size to %i bytes (block size "
+ "bits %i) to match volume sector size.",
+ blocksize, sb->s_blocksize_bits);
+ }
+ /* Initialize the cluster and mft allocators. */
+ ntfs_setup_allocators(vol);
+ /* Setup remaining fields in the super block. */
+ sb->s_magic = NTFS_SB_MAGIC;
+ /*
+ * Ntfs allows 63 bits for the file size, i.e. correct would be:
+ * sb->s_maxbytes = ~0ULL >> 1;
+ * But the kernel uses a long as the page cache page index which on
+ * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel
+ * defined to the maximum the page cache page index can cope with
+ * without overflowing the index or to 2^63 - 1, whichever is smaller.
+ */
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ /* Ntfs measures time in 100ns intervals. */
+ sb->s_time_gran = 100;
+ /*
+ * Now load the metadata required for the page cache and our address
+ * space operations to function. We do this by setting up a specialised
+ * read_inode method and then just calling the normal iget() to obtain
+ * the inode for $MFT which is sufficient to allow our normal inode
+ * operations and associated address space operations to function.
+ */
+ sb->s_op = &ntfs_sops;
+ tmp_ino = new_inode(sb);
+ if (!tmp_ino) {
+ if (!silent)
+ ntfs_error(sb, "Failed to load essential metadata.");
+ goto err_out_now;
+ }
+ tmp_ino->i_ino = FILE_MFT;
+ insert_inode_hash(tmp_ino);
+ if (ntfs_read_inode_mount(tmp_ino) < 0) {
+ if (!silent)
+ ntfs_error(sb, "Failed to load essential metadata.");
+ goto iput_tmp_ino_err_out_now;
+ }
+ mutex_lock(&ntfs_lock);
+ /*
+ * The current mount is a compression user if the cluster size is
+ * less than or equal 4kiB.
+ */
+ if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) {
+ result = allocate_compression_buffers();
+ if (result) {
+ ntfs_error(NULL, "Failed to allocate buffers "
+ "for compression engine.");
+ ntfs_nr_compression_users--;
+ mutex_unlock(&ntfs_lock);
+ goto iput_tmp_ino_err_out_now;
+ }
+ }
+ /*
+ * Generate the global default upcase table if necessary. Also
+ * temporarily increment the number of upcase users to avoid race
+ * conditions with concurrent (u)mounts.
+ */
+ if (!default_upcase)
+ default_upcase = generate_default_upcase();
+ ntfs_nr_upcase_users++;
+ mutex_unlock(&ntfs_lock);
+ /*
+ * From now on, ignore @silent parameter. If we fail below this line,
+ * it will be due to a corrupt fs or a system error, so we report it.
+ */
+ /*
+ * Open the system files with normal access functions and complete
+ * setting up the ntfs super block.
+ */
+ if (!load_system_files(vol)) {
+ ntfs_error(sb, "Failed to load system files.");
+ goto unl_upcase_iput_tmp_ino_err_out_now;
+ }
+
+ /* We grab a reference, simulating an ntfs_iget(). */
+ ihold(vol->root_ino);
+ if ((sb->s_root = d_make_root(vol->root_ino))) {
+ ntfs_debug("Exiting, status successful.");
+ /* Release the default upcase if it has no users. */
+ mutex_lock(&ntfs_lock);
+ if (!--ntfs_nr_upcase_users && default_upcase) {
+ ntfs_free(default_upcase);
+ default_upcase = NULL;
+ }
+ mutex_unlock(&ntfs_lock);
+ sb->s_export_op = &ntfs_export_ops;
+ lockdep_on();
+ return 0;
+ }
+ ntfs_error(sb, "Failed to allocate root directory.");
+ /* Clean up after the successful load_system_files() call from above. */
+ // TODO: Use ntfs_put_super() instead of repeating all this code...
+ // FIXME: Should mark the volume clean as the error is most likely
+ // -ENOMEM.
+ iput(vol->vol_ino);
+ vol->vol_ino = NULL;
+ /* NTFS 3.0+ specific clean up. */
+ if (vol->major_ver >= 3) {
+#ifdef NTFS_RW
+ if (vol->usnjrnl_j_ino) {
+ iput(vol->usnjrnl_j_ino);
+ vol->usnjrnl_j_ino = NULL;
+ }
+ if (vol->usnjrnl_max_ino) {
+ iput(vol->usnjrnl_max_ino);
+ vol->usnjrnl_max_ino = NULL;
+ }
+ if (vol->usnjrnl_ino) {
+ iput(vol->usnjrnl_ino);
+ vol->usnjrnl_ino = NULL;
+ }
+ if (vol->quota_q_ino) {
+ iput(vol->quota_q_ino);
+ vol->quota_q_ino = NULL;
+ }
+ if (vol->quota_ino) {
+ iput(vol->quota_ino);
+ vol->quota_ino = NULL;
+ }
+#endif /* NTFS_RW */
+ if (vol->extend_ino) {
+ iput(vol->extend_ino);
+ vol->extend_ino = NULL;
+ }
+ if (vol->secure_ino) {
+ iput(vol->secure_ino);
+ vol->secure_ino = NULL;
+ }
+ }
+ iput(vol->root_ino);
+ vol->root_ino = NULL;
+ iput(vol->lcnbmp_ino);
+ vol->lcnbmp_ino = NULL;
+ iput(vol->mftbmp_ino);
+ vol->mftbmp_ino = NULL;
+#ifdef NTFS_RW
+ if (vol->logfile_ino) {
+ iput(vol->logfile_ino);
+ vol->logfile_ino = NULL;
+ }
+ if (vol->mftmirr_ino) {
+ iput(vol->mftmirr_ino);
+ vol->mftmirr_ino = NULL;
+ }
+#endif /* NTFS_RW */
+ /* Throw away the table of attribute definitions. */
+ vol->attrdef_size = 0;
+ if (vol->attrdef) {
+ ntfs_free(vol->attrdef);
+ vol->attrdef = NULL;
+ }
+ vol->upcase_len = 0;
+ mutex_lock(&ntfs_lock);
+ if (vol->upcase == default_upcase) {
+ ntfs_nr_upcase_users--;
+ vol->upcase = NULL;
+ }
+ mutex_unlock(&ntfs_lock);
+ if (vol->upcase) {
+ ntfs_free(vol->upcase);
+ vol->upcase = NULL;
+ }
+ if (vol->nls_map) {
+ unload_nls(vol->nls_map);
+ vol->nls_map = NULL;
+ }
+ /* Error exit code path. */
+unl_upcase_iput_tmp_ino_err_out_now:
+ /*
+ * Decrease the number of upcase users and destroy the global default
+ * upcase table if necessary.
+ */
+ mutex_lock(&ntfs_lock);
+ if (!--ntfs_nr_upcase_users && default_upcase) {
+ ntfs_free(default_upcase);
+ default_upcase = NULL;
+ }
+ if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
+ free_compression_buffers();
+ mutex_unlock(&ntfs_lock);
+iput_tmp_ino_err_out_now:
+ iput(tmp_ino);
+ if (vol->mft_ino && vol->mft_ino != tmp_ino)
+ iput(vol->mft_ino);
+ vol->mft_ino = NULL;
+ /* Errors at this stage are irrelevant. */
+err_out_now:
+ sb->s_fs_info = NULL;
+ kfree(vol);
+ ntfs_debug("Failed, returning -EINVAL.");
+ lockdep_on();
+ return -EINVAL;
+}
+
+/*
+ * This is a slab cache to optimize allocations and deallocations of Unicode
+ * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
+ * (255) Unicode characters + a terminating NULL Unicode character.
+ */
+struct kmem_cache *ntfs_name_cache;
+
+/* Slab caches for efficient allocation/deallocation of inodes. */
+struct kmem_cache *ntfs_inode_cache;
+struct kmem_cache *ntfs_big_inode_cache;
+
+/* Init once constructor for the inode slab cache. */
+static void ntfs_big_inode_init_once(void *foo)
+{
+ ntfs_inode *ni = (ntfs_inode *)foo;
+
+ inode_init_once(VFS_I(ni));
+}
+
+/*
+ * Slab caches to optimize allocations and deallocations of attribute search
+ * contexts and index contexts, respectively.
+ */
+struct kmem_cache *ntfs_attr_ctx_cache;
+struct kmem_cache *ntfs_index_ctx_cache;
+
+/* Driver wide mutex. */
+DEFINE_MUTEX(ntfs_lock);
+
+static struct dentry *ntfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+}
+
+static struct file_system_type ntfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ntfs",
+ .mount = ntfs_mount,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV,
+};
+MODULE_ALIAS_FS("ntfs");
+
+/* Stable names for the slab caches. */
+static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache";
+static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache";
+static const char ntfs_name_cache_name[] = "ntfs_name_cache";
+static const char ntfs_inode_cache_name[] = "ntfs_inode_cache";
+static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache";
+
+static int __init init_ntfs_fs(void)
+{
+ int err = 0;
+
+ /* This may be ugly but it results in pretty output so who cares. (-8 */
+ pr_info("driver " NTFS_VERSION " [Flags: R/"
+#ifdef NTFS_RW
+ "W"
+#else
+ "O"
+#endif
+#ifdef DEBUG
+ " DEBUG"
+#endif
+#ifdef MODULE
+ " MODULE"
+#endif
+ "].\n");
+
+ ntfs_debug("Debug messages are enabled.");
+
+ ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name,
+ sizeof(ntfs_index_context), 0 /* offset */,
+ SLAB_HWCACHE_ALIGN, NULL /* ctor */);
+ if (!ntfs_index_ctx_cache) {
+ pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
+ goto ictx_err_out;
+ }
+ ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
+ sizeof(ntfs_attr_search_ctx), 0 /* offset */,
+ SLAB_HWCACHE_ALIGN, NULL /* ctor */);
+ if (!ntfs_attr_ctx_cache) {
+ pr_crit("NTFS: Failed to create %s!\n",
+ ntfs_attr_ctx_cache_name);
+ goto actx_err_out;
+ }
+
+ ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name,
+ (NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!ntfs_name_cache) {
+ pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
+ goto name_err_out;
+ }
+
+ ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name,
+ sizeof(ntfs_inode), 0,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+ if (!ntfs_inode_cache) {
+ pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
+ goto inode_err_out;
+ }
+
+ ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
+ sizeof(big_ntfs_inode), 0,
+ SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+ SLAB_ACCOUNT, ntfs_big_inode_init_once);
+ if (!ntfs_big_inode_cache) {
+ pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
+ goto big_inode_err_out;
+ }
+
+ /* Register the ntfs sysctls. */
+ err = ntfs_sysctl(1);
+ if (err) {
+ pr_crit("Failed to register NTFS sysctls!\n");
+ goto sysctl_err_out;
+ }
+
+ err = register_filesystem(&ntfs_fs_type);
+ if (!err) {
+ ntfs_debug("NTFS driver registered successfully.");
+ return 0; /* Success! */
+ }
+ pr_crit("Failed to register NTFS filesystem driver!\n");
+
+ /* Unregister the ntfs sysctls. */
+ ntfs_sysctl(0);
+sysctl_err_out:
+ kmem_cache_destroy(ntfs_big_inode_cache);
+big_inode_err_out:
+ kmem_cache_destroy(ntfs_inode_cache);
+inode_err_out:
+ kmem_cache_destroy(ntfs_name_cache);
+name_err_out:
+ kmem_cache_destroy(ntfs_attr_ctx_cache);
+actx_err_out:
+ kmem_cache_destroy(ntfs_index_ctx_cache);
+ictx_err_out:
+ if (!err) {
+ pr_crit("Aborting NTFS filesystem driver registration...\n");
+ err = -ENOMEM;
+ }
+ return err;
+}
+
+static void __exit exit_ntfs_fs(void)
+{
+ ntfs_debug("Unregistering NTFS driver.");
+
+ unregister_filesystem(&ntfs_fs_type);
+
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(ntfs_big_inode_cache);
+ kmem_cache_destroy(ntfs_inode_cache);
+ kmem_cache_destroy(ntfs_name_cache);
+ kmem_cache_destroy(ntfs_attr_ctx_cache);
+ kmem_cache_destroy(ntfs_index_ctx_cache);
+ /* Unregister the ntfs sysctls. */
+ ntfs_sysctl(0);
+}
+
+MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
+MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
+MODULE_VERSION(NTFS_VERSION);
+MODULE_LICENSE("GPL");
+#ifdef DEBUG
+module_param(debug_msgs, bint, 0);
+MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
+#endif
+
+module_init(init_ntfs_fs)
+module_exit(exit_ntfs_fs)
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
new file mode 100644
index 000000000..a030d00af
--- /dev/null
+++ b/fs/ntfs/sysctl.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of
+ * the Linux-NTFS project. Adapted from the old NTFS driver,
+ * Copyright (C) 1997 Martin von Löwis, Régis Duchesne
+ *
+ * Copyright (c) 2002-2005 Anton Altaparmakov
+ */
+
+#ifdef DEBUG
+
+#include <linux/module.h>
+
+#ifdef CONFIG_SYSCTL
+
+#include <linux/proc_fs.h>
+#include <linux/sysctl.h>
+
+#include "sysctl.h"
+#include "debug.h"
+
+/* Definition of the ntfs sysctl. */
+static struct ctl_table ntfs_sysctls[] = {
+ {
+ .procname = "ntfs-debug",
+ .data = &debug_msgs, /* Data pointer and size. */
+ .maxlen = sizeof(debug_msgs),
+ .mode = 0644, /* Mode, proc handler. */
+ .proc_handler = proc_dointvec
+ },
+ {}
+};
+
+/* Define the parent directory /proc/sys/fs. */
+static struct ctl_table sysctls_root[] = {
+ {
+ .procname = "fs",
+ .mode = 0555,
+ .child = ntfs_sysctls
+ },
+ {}
+};
+
+/* Storage for the sysctls header. */
+static struct ctl_table_header *sysctls_root_table;
+
+/**
+ * ntfs_sysctl - add or remove the debug sysctl
+ * @add: add (1) or remove (0) the sysctl
+ *
+ * Add or remove the debug sysctl. Return 0 on success or -errno on error.
+ */
+int ntfs_sysctl(int add)
+{
+ if (add) {
+ BUG_ON(sysctls_root_table);
+ sysctls_root_table = register_sysctl_table(sysctls_root);
+ if (!sysctls_root_table)
+ return -ENOMEM;
+ } else {
+ BUG_ON(!sysctls_root_table);
+ unregister_sysctl_table(sysctls_root_table);
+ sysctls_root_table = NULL;
+ }
+ return 0;
+}
+
+#endif /* CONFIG_SYSCTL */
+#endif /* DEBUG */
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
new file mode 100644
index 000000000..96bb2299d
--- /dev/null
+++ b/fs/ntfs/sysctl.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of
+ * the Linux-NTFS project. Adapted from the old NTFS driver,
+ * Copyright (C) 1997 Martin von Löwis, Régis Duchesne
+ *
+ * Copyright (c) 2002-2004 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_SYSCTL_H
+#define _LINUX_NTFS_SYSCTL_H
+
+
+#if defined(DEBUG) && defined(CONFIG_SYSCTL)
+
+extern int ntfs_sysctl(int add);
+
+#else
+
+/* Just return success. */
+static inline int ntfs_sysctl(int add)
+{
+ return 0;
+}
+
+#endif /* DEBUG && CONFIG_SYSCTL */
+#endif /* _LINUX_NTFS_SYSCTL_H */
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
new file mode 100644
index 000000000..6b6326130
--- /dev/null
+++ b/fs/ntfs/time.h
@@ -0,0 +1,89 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * time.h - NTFS time conversion functions. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_TIME_H
+#define _LINUX_NTFS_TIME_H
+
+#include <linux/time.h> /* For current_kernel_time(). */
+#include <asm/div64.h> /* For do_div(). */
+
+#include "endian.h"
+
+#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)
+
+/**
+ * utc2ntfs - convert Linux UTC time to NTFS time
+ * @ts: Linux UTC time to convert to NTFS time
+ *
+ * Convert the Linux UTC time @ts to its corresponding NTFS time and return
+ * that in little endian format.
+ *
+ * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
+ * and a long tv_nsec where tv_sec is the number of 1-second intervals since
+ * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
+ * intervals since the value of tv_sec.
+ *
+ * NTFS uses Microsoft's standard time format which is stored in a s64 and is
+ * measured as the number of 100-nano-second intervals since 1st January 1601,
+ * 00:00:00 UTC.
+ */
+static inline sle64 utc2ntfs(const struct timespec64 ts)
+{
+ /*
+ * Convert the seconds to 100ns intervals, add the nano-seconds
+ * converted to 100ns intervals, and then add the NTFS time offset.
+ */
+ return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 +
+ NTFS_TIME_OFFSET);
+}
+
+/**
+ * get_current_ntfs_time - get the current time in little endian NTFS format
+ *
+ * Get the current time from the Linux kernel, convert it to its corresponding
+ * NTFS time and return that in little endian format.
+ */
+static inline sle64 get_current_ntfs_time(void)
+{
+ struct timespec64 ts;
+
+ ktime_get_coarse_real_ts64(&ts);
+ return utc2ntfs(ts);
+}
+
+/**
+ * ntfs2utc - convert NTFS time to Linux time
+ * @time: NTFS time (little endian) to convert to Linux UTC
+ *
+ * Convert the little endian NTFS time @time to its corresponding Linux UTC
+ * time and return that in cpu format.
+ *
+ * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
+ * and a long tv_nsec where tv_sec is the number of 1-second intervals since
+ * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
+ * intervals since the value of tv_sec.
+ *
+ * NTFS uses Microsoft's standard time format which is stored in a s64 and is
+ * measured as the number of 100 nano-second intervals since 1st January 1601,
+ * 00:00:00 UTC.
+ */
+static inline struct timespec64 ntfs2utc(const sle64 time)
+{
+ struct timespec64 ts;
+
+ /* Subtract the NTFS time offset. */
+ u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
+ /*
+ * Convert the time to 1-second intervals and the remainder to
+ * 1-nano-second intervals.
+ */
+ ts.tv_nsec = do_div(t, 10000000) * 100;
+ ts.tv_sec = t;
+ return ts;
+}
+
+#endif /* _LINUX_NTFS_TIME_H */
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
new file mode 100644
index 000000000..9a47859e7
--- /dev/null
+++ b/fs/ntfs/types.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * types.h - Defines for NTFS Linux kernel driver specific types.
+ * Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2005 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_TYPES_H
+#define _LINUX_NTFS_TYPES_H
+
+#include <linux/types.h>
+
+typedef __le16 le16;
+typedef __le32 le32;
+typedef __le64 le64;
+typedef __u16 __bitwise sle16;
+typedef __u32 __bitwise sle32;
+typedef __u64 __bitwise sle64;
+
+/* 2-byte Unicode character type. */
+typedef le16 ntfschar;
+#define UCHAR_T_SIZE_BITS 1
+
+/*
+ * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN
+ * and VCN, to allow for type checking and better code readability.
+ */
+typedef s64 VCN;
+typedef sle64 leVCN;
+typedef s64 LCN;
+typedef sle64 leLCN;
+
+/*
+ * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit
+ * values. We define our own type LSN, to allow for type checking and better
+ * code readability.
+ */
+typedef s64 LSN;
+typedef sle64 leLSN;
+
+/*
+ * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values.
+ * We define our own type USN, to allow for type checking and better code
+ * readability.
+ */
+typedef s64 USN;
+typedef sle64 leUSN;
+
+typedef enum {
+ CASE_SENSITIVE = 0,
+ IGNORE_CASE = 1,
+} IGNORE_CASE_BOOL;
+
+#endif /* _LINUX_NTFS_TYPES_H */
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
new file mode 100644
index 000000000..a6b6c64f1
--- /dev/null
+++ b/fs/ntfs/unistr.c
@@ -0,0 +1,384 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2006 Anton Altaparmakov
+ */
+
+#include <linux/slab.h>
+
+#include "types.h"
+#include "debug.h"
+#include "ntfs.h"
+
+/*
+ * IMPORTANT
+ * =========
+ *
+ * All these routines assume that the Unicode characters are in little endian
+ * encoding inside the strings!!!
+ */
+
+/*
+ * This is used by the name collation functions to quickly determine what
+ * characters are (in)valid.
+ */
+static const u8 legal_ansi_char_array[0x40] = {
+ 0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+ 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+
+ 0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
+ 0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
+
+ 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
+ 0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
+};
+
+/**
+ * ntfs_are_names_equal - compare two Unicode names for equality
+ * @s1: name to compare to @s2
+ * @s1_len: length in Unicode characters of @s1
+ * @s2: name to compare to @s1
+ * @s2_len: length in Unicode characters of @s2
+ * @ic: ignore case bool
+ * @upcase: upcase table (only if @ic == IGNORE_CASE)
+ * @upcase_size: length in Unicode characters of @upcase (if present)
+ *
+ * Compare the names @s1 and @s2 and return 'true' (1) if the names are
+ * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE,
+ * the @upcase table is used to performa a case insensitive comparison.
+ */
+bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
+ const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
+ const ntfschar *upcase, const u32 upcase_size)
+{
+ if (s1_len != s2_len)
+ return false;
+ if (ic == CASE_SENSITIVE)
+ return !ntfs_ucsncmp(s1, s2, s1_len);
+ return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
+}
+
+/**
+ * ntfs_collate_names - collate two Unicode names
+ * @name1: first Unicode name to compare
+ * @name2: second Unicode name to compare
+ * @err_val: if @name1 contains an invalid character return this value
+ * @ic: either CASE_SENSITIVE or IGNORE_CASE
+ * @upcase: upcase table (ignored if @ic is CASE_SENSITIVE)
+ * @upcase_len: upcase table size (ignored if @ic is CASE_SENSITIVE)
+ *
+ * ntfs_collate_names collates two Unicode names and returns:
+ *
+ * -1 if the first name collates before the second one,
+ * 0 if the names match,
+ * 1 if the second name collates before the first one, or
+ * @err_val if an invalid character is found in @name1 during the comparison.
+ *
+ * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
+ */
+int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
+ const ntfschar *name2, const u32 name2_len,
+ const int err_val, const IGNORE_CASE_BOOL ic,
+ const ntfschar *upcase, const u32 upcase_len)
+{
+ u32 cnt, min_len;
+ u16 c1, c2;
+
+ min_len = name1_len;
+ if (name1_len > name2_len)
+ min_len = name2_len;
+ for (cnt = 0; cnt < min_len; ++cnt) {
+ c1 = le16_to_cpu(*name1++);
+ c2 = le16_to_cpu(*name2++);
+ if (ic) {
+ if (c1 < upcase_len)
+ c1 = le16_to_cpu(upcase[c1]);
+ if (c2 < upcase_len)
+ c2 = le16_to_cpu(upcase[c2]);
+ }
+ if (c1 < 64 && legal_ansi_char_array[c1] & 8)
+ return err_val;
+ if (c1 < c2)
+ return -1;
+ if (c1 > c2)
+ return 1;
+ }
+ if (name1_len < name2_len)
+ return -1;
+ if (name1_len == name2_len)
+ return 0;
+ /* name1_len > name2_len */
+ c1 = le16_to_cpu(*name1);
+ if (c1 < 64 && legal_ansi_char_array[c1] & 8)
+ return err_val;
+ return 1;
+}
+
+/**
+ * ntfs_ucsncmp - compare two little endian Unicode strings
+ * @s1: first string
+ * @s2: second string
+ * @n: maximum unicode characters to compare
+ *
+ * Compare the first @n characters of the Unicode strings @s1 and @s2,
+ * The strings in little endian format and appropriate le16_to_cpu()
+ * conversion is performed on non-little endian machines.
+ *
+ * The function returns an integer less than, equal to, or greater than zero
+ * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
+ * to be less than, to match, or be greater than @s2.
+ */
+int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
+{
+ u16 c1, c2;
+ size_t i;
+
+ for (i = 0; i < n; ++i) {
+ c1 = le16_to_cpu(s1[i]);
+ c2 = le16_to_cpu(s2[i]);
+ if (c1 < c2)
+ return -1;
+ if (c1 > c2)
+ return 1;
+ if (!c1)
+ break;
+ }
+ return 0;
+}
+
+/**
+ * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
+ * @s1: first string
+ * @s2: second string
+ * @n: maximum unicode characters to compare
+ * @upcase: upcase table
+ * @upcase_size: upcase table size in Unicode characters
+ *
+ * Compare the first @n characters of the Unicode strings @s1 and @s2,
+ * ignoring case. The strings in little endian format and appropriate
+ * le16_to_cpu() conversion is performed on non-little endian machines.
+ *
+ * Each character is uppercased using the @upcase table before the comparison.
+ *
+ * The function returns an integer less than, equal to, or greater than zero
+ * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
+ * to be less than, to match, or be greater than @s2.
+ */
+int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
+ const ntfschar *upcase, const u32 upcase_size)
+{
+ size_t i;
+ u16 c1, c2;
+
+ for (i = 0; i < n; ++i) {
+ if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
+ c1 = le16_to_cpu(upcase[c1]);
+ if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
+ c2 = le16_to_cpu(upcase[c2]);
+ if (c1 < c2)
+ return -1;
+ if (c1 > c2)
+ return 1;
+ if (!c1)
+ break;
+ }
+ return 0;
+}
+
+void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase,
+ const u32 upcase_len)
+{
+ u32 i;
+ u16 u;
+
+ for (i = 0; i < name_len; i++)
+ if ((u = le16_to_cpu(name[i])) < upcase_len)
+ name[i] = upcase[u];
+}
+
+void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
+ const ntfschar *upcase, const u32 upcase_len)
+{
+ ntfs_upcase_name((ntfschar*)&file_name_attr->file_name,
+ file_name_attr->file_name_length, upcase, upcase_len);
+}
+
+int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
+ FILE_NAME_ATTR *file_name_attr2,
+ const int err_val, const IGNORE_CASE_BOOL ic,
+ const ntfschar *upcase, const u32 upcase_len)
+{
+ return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name,
+ file_name_attr1->file_name_length,
+ (ntfschar*)&file_name_attr2->file_name,
+ file_name_attr2->file_name_length,
+ err_val, ic, upcase, upcase_len);
+}
+
+/**
+ * ntfs_nlstoucs - convert NLS string to little endian Unicode string
+ * @vol: ntfs volume which we are working with
+ * @ins: input NLS string buffer
+ * @ins_len: length of input string in bytes
+ * @outs: on return contains the allocated output Unicode string buffer
+ *
+ * Convert the input string @ins, which is in whatever format the loaded NLS
+ * map dictates, into a little endian, 2-byte Unicode string.
+ *
+ * This function allocates the string and the caller is responsible for
+ * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it.
+ *
+ * On success the function returns the number of Unicode characters written to
+ * the output string *@outs (>= 0), not counting the terminating Unicode NULL
+ * character. *@outs is set to the allocated output string buffer.
+ *
+ * On error, a negative number corresponding to the error code is returned. In
+ * that case the output string is not allocated. Both *@outs and *@outs_len
+ * are then undefined.
+ *
+ * This might look a bit odd due to fast path optimization...
+ */
+int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
+ const int ins_len, ntfschar **outs)
+{
+ struct nls_table *nls = vol->nls_map;
+ ntfschar *ucs;
+ wchar_t wc;
+ int i, o, wc_len;
+
+ /* We do not trust outside sources. */
+ if (likely(ins)) {
+ ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS);
+ if (likely(ucs)) {
+ for (i = o = 0; i < ins_len; i += wc_len) {
+ wc_len = nls->char2uni(ins + i, ins_len - i,
+ &wc);
+ if (likely(wc_len >= 0 &&
+ o < NTFS_MAX_NAME_LEN)) {
+ if (likely(wc)) {
+ ucs[o++] = cpu_to_le16(wc);
+ continue;
+ } /* else if (!wc) */
+ break;
+ } /* else if (wc_len < 0 ||
+ o >= NTFS_MAX_NAME_LEN) */
+ goto name_err;
+ }
+ ucs[o] = 0;
+ *outs = ucs;
+ return o;
+ } /* else if (!ucs) */
+ ntfs_error(vol->sb, "Failed to allocate buffer for converted "
+ "name from ntfs_name_cache.");
+ return -ENOMEM;
+ } /* else if (!ins) */
+ ntfs_error(vol->sb, "Received NULL pointer.");
+ return -EINVAL;
+name_err:
+ kmem_cache_free(ntfs_name_cache, ucs);
+ if (wc_len < 0) {
+ ntfs_error(vol->sb, "Name using character set %s contains "
+ "characters that cannot be converted to "
+ "Unicode.", nls->charset);
+ i = -EILSEQ;
+ } else /* if (o >= NTFS_MAX_NAME_LEN) */ {
+ ntfs_error(vol->sb, "Name is too long (maximum length for a "
+ "name on NTFS is %d Unicode characters.",
+ NTFS_MAX_NAME_LEN);
+ i = -ENAMETOOLONG;
+ }
+ return i;
+}
+
+/**
+ * ntfs_ucstonls - convert little endian Unicode string to NLS string
+ * @vol: ntfs volume which we are working with
+ * @ins: input Unicode string buffer
+ * @ins_len: length of input string in Unicode characters
+ * @outs: on return contains the (allocated) output NLS string buffer
+ * @outs_len: length of output string buffer in bytes
+ *
+ * Convert the input little endian, 2-byte Unicode string @ins, of length
+ * @ins_len into the string format dictated by the loaded NLS.
+ *
+ * If *@outs is NULL, this function allocates the string and the caller is
+ * responsible for calling kfree(*@outs); when finished with it. In this case
+ * @outs_len is ignored and can be 0.
+ *
+ * On success the function returns the number of bytes written to the output
+ * string *@outs (>= 0), not counting the terminating NULL byte. If the output
+ * string buffer was allocated, *@outs is set to it.
+ *
+ * On error, a negative number corresponding to the error code is returned. In
+ * that case the output string is not allocated. The contents of *@outs are
+ * then undefined.
+ *
+ * This might look a bit odd due to fast path optimization...
+ */
+int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
+ const int ins_len, unsigned char **outs, int outs_len)
+{
+ struct nls_table *nls = vol->nls_map;
+ unsigned char *ns;
+ int i, o, ns_len, wc;
+
+ /* We don't trust outside sources. */
+ if (ins) {
+ ns = *outs;
+ ns_len = outs_len;
+ if (ns && !ns_len) {
+ wc = -ENAMETOOLONG;
+ goto conversion_err;
+ }
+ if (!ns) {
+ ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
+ ns = kmalloc(ns_len + 1, GFP_NOFS);
+ if (!ns)
+ goto mem_err_out;
+ }
+ for (i = o = 0; i < ins_len; i++) {
+retry: wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
+ ns_len - o);
+ if (wc > 0) {
+ o += wc;
+ continue;
+ } else if (!wc)
+ break;
+ else if (wc == -ENAMETOOLONG && ns != *outs) {
+ unsigned char *tc;
+ /* Grow in multiples of 64 bytes. */
+ tc = kmalloc((ns_len + 64) &
+ ~63, GFP_NOFS);
+ if (tc) {
+ memcpy(tc, ns, ns_len);
+ ns_len = ((ns_len + 64) & ~63) - 1;
+ kfree(ns);
+ ns = tc;
+ goto retry;
+ } /* No memory so goto conversion_error; */
+ } /* wc < 0, real error. */
+ goto conversion_err;
+ }
+ ns[o] = 0;
+ *outs = ns;
+ return o;
+ } /* else (!ins) */
+ ntfs_error(vol->sb, "Received NULL pointer.");
+ return -EINVAL;
+conversion_err:
+ ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
+ "converted to character set %s. You might want to "
+ "try to use the mount option nls=utf8.", nls->charset);
+ if (ns != *outs)
+ kfree(ns);
+ if (wc != -ENAMETOOLONG)
+ wc = -EILSEQ;
+ return wc;
+mem_err_out:
+ ntfs_error(vol->sb, "Failed to allocate name!");
+ return -ENOMEM;
+}
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
new file mode 100644
index 000000000..4ebe84a78
--- /dev/null
+++ b/fs/ntfs/upcase.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
+ * Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
+ * Copyright (c) 2001-2006 Anton Altaparmakov
+ */
+
+#include "malloc.h"
+#include "ntfs.h"
+
+ntfschar *generate_default_upcase(void)
+{
+ static const int uc_run_table[][3] = { /* Start, End, Add */
+ {0x0061, 0x007B, -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72, 74},
+ {0x00E0, 0x00F7, -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76, 86},
+ {0x00F8, 0x00FF, -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
+ {0x0256, 0x0258, -205}, {0x1F00, 0x1F08, 8}, {0x1F78, 0x1F7A, 128},
+ {0x028A, 0x028C, -217}, {0x1F10, 0x1F16, 8}, {0x1F7A, 0x1F7C, 112},
+ {0x03AC, 0x03AD, -38}, {0x1F20, 0x1F28, 8}, {0x1F7C, 0x1F7E, 126},
+ {0x03AD, 0x03B0, -37}, {0x1F30, 0x1F38, 8}, {0x1FB0, 0x1FB2, 8},
+ {0x03B1, 0x03C2, -32}, {0x1F40, 0x1F46, 8}, {0x1FD0, 0x1FD2, 8},
+ {0x03C2, 0x03C3, -31}, {0x1F51, 0x1F52, 8}, {0x1FE0, 0x1FE2, 8},
+ {0x03C3, 0x03CC, -32}, {0x1F53, 0x1F54, 8}, {0x1FE5, 0x1FE6, 7},
+ {0x03CC, 0x03CD, -64}, {0x1F55, 0x1F56, 8}, {0x2170, 0x2180, -16},
+ {0x03CD, 0x03CF, -63}, {0x1F57, 0x1F58, 8}, {0x24D0, 0x24EA, -26},
+ {0x0430, 0x0450, -32}, {0x1F60, 0x1F68, 8}, {0xFF41, 0xFF5B, -32},
+ {0}
+ };
+
+ static const int uc_dup_table[][2] = { /* Start, End */
+ {0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
+ {0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
+ {0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
+ {0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
+ {0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
+ {0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
+ {0}
+ };
+
+ static const int uc_word_table[][2] = { /* Offset, Value */
+ {0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
+ {0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
+ {0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
+ {0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
+ {0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
+ {0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
+ {0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
+ {0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
+ {0}
+ };
+
+ int i, r;
+ ntfschar *uc;
+
+ uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar));
+ if (!uc)
+ return uc;
+ memset(uc, 0, default_upcase_len * sizeof(ntfschar));
+ /* Generate the little endian Unicode upcase table used by ntfs. */
+ for (i = 0; i < default_upcase_len; i++)
+ uc[i] = cpu_to_le16(i);
+ for (r = 0; uc_run_table[r][0]; r++)
+ for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
+ le16_add_cpu(&uc[i], uc_run_table[r][2]);
+ for (r = 0; uc_dup_table[r][0]; r++)
+ for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
+ le16_add_cpu(&uc[i + 1], -1);
+ for (r = 0; uc_word_table[r][0]; r++)
+ uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
+ return uc;
+}
diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c
new file mode 100644
index 000000000..9097a0b4e
--- /dev/null
+++ b/fs/ntfs/usnjrnl.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling. Part of the
+ * Linux-NTFS project.
+ *
+ * Copyright (c) 2005 Anton Altaparmakov
+ */
+
+#ifdef NTFS_RW
+
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+
+#include "aops.h"
+#include "debug.h"
+#include "endian.h"
+#include "time.h"
+#include "types.h"
+#include "usnjrnl.h"
+#include "volume.h"
+
+/**
+ * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume
+ * @vol: ntfs volume on which to stamp the transaction log
+ *
+ * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return
+ * 'true' on success and 'false' on error.
+ *
+ * This function assumes that the transaction log has already been loaded and
+ * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl().
+ */
+bool ntfs_stamp_usnjrnl(ntfs_volume *vol)
+{
+ ntfs_debug("Entering.");
+ if (likely(!NVolUsnJrnlStamped(vol))) {
+ sle64 stamp;
+ struct page *page;
+ USN_HEADER *uh;
+
+ page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
+ if (IS_ERR(page)) {
+ ntfs_error(vol->sb, "Failed to read from "
+ "$UsnJrnl/$DATA/$Max attribute.");
+ return false;
+ }
+ uh = (USN_HEADER*)page_address(page);
+ stamp = get_current_ntfs_time();
+ ntfs_debug("Stamping transaction log ($UsnJrnl): old "
+ "journal_id 0x%llx, old lowest_valid_usn "
+ "0x%llx, new journal_id 0x%llx, new "
+ "lowest_valid_usn 0x%llx.",
+ (long long)sle64_to_cpu(uh->journal_id),
+ (long long)sle64_to_cpu(uh->lowest_valid_usn),
+ (long long)sle64_to_cpu(stamp),
+ i_size_read(vol->usnjrnl_j_ino));
+ uh->lowest_valid_usn =
+ cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino));
+ uh->journal_id = stamp;
+ flush_dcache_page(page);
+ set_page_dirty(page);
+ ntfs_unmap_page(page);
+ /* Set the flag so we do not have to do it again on remount. */
+ NVolSetUsnJrnlStamped(vol);
+ }
+ ntfs_debug("Done.");
+ return true;
+}
+
+#endif /* NTFS_RW */
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
new file mode 100644
index 000000000..85f531b59
--- /dev/null
+++ b/fs/ntfs/usnjrnl.h
@@ -0,0 +1,191 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling.
+ * Part of the Linux-NTFS project.
+ *
+ * Copyright (c) 2005 Anton Altaparmakov
+ */
+
+#ifndef _LINUX_NTFS_USNJRNL_H
+#define _LINUX_NTFS_USNJRNL_H
+
+#ifdef NTFS_RW
+
+#include "types.h"
+#include "endian.h"
+#include "layout.h"
+#include "volume.h"
+
+/*
+ * Transaction log ($UsnJrnl) organization:
+ *
+ * The transaction log records whenever a file is modified in any way. So for
+ * example it will record that file "blah" was written to at a particular time
+ * but not what was written. If will record that a file was deleted or
+ * created, that a file was truncated, etc. See below for all the reason
+ * codes used.
+ *
+ * The transaction log is in the $Extend directory which is in the root
+ * directory of each volume. If it is not present it means transaction
+ * logging is disabled. If it is present it means transaction logging is
+ * either enabled or in the process of being disabled in which case we can
+ * ignore it as it will go away as soon as Windows gets its hands on it.
+ *
+ * To determine whether the transaction logging is enabled or in the process
+ * of being disabled, need to check the volume flags in the
+ * $VOLUME_INFORMATION attribute in the $Volume system file (which is present
+ * in the root directory and has a fixed mft record number, see layout.h).
+ * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log
+ * is in the process of being disabled and if this flag is clear it means the
+ * transaction log is enabled.
+ *
+ * The transaction log consists of two parts; the $DATA/$Max attribute as well
+ * as the $DATA/$J attribute. $Max is a header describing the transaction
+ * log whilst $J is the transaction log data itself as a sequence of variable
+ * sized USN_RECORDs (see below for all the structures).
+ *
+ * We do not care about transaction logging at this point in time but we still
+ * need to let windows know that the transaction log is out of date. To do
+ * this we need to stamp the transaction log. This involves setting the
+ * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used
+ * for the next added USN_RECORD to the $DATA/$J attribute as well as
+ * generating a new journal_id in $DATA/$Max.
+ *
+ * The journal_id is as of the current version (2.0) of the transaction log
+ * simply the 64-bit timestamp of when the journal was either created or last
+ * stamped.
+ *
+ * To determine the next usn there are two ways. The first is to parse
+ * $DATA/$J and to find the last USN_RECORD in it and to add its record_length
+ * to its usn (which is the byte offset in the $DATA/$J attribute). The
+ * second is simply to take the data size of the attribute. Since the usns
+ * are simply byte offsets into $DATA/$J, this is exactly the next usn. For
+ * obvious reasons we use the second method as it is much simpler and faster.
+ *
+ * As an aside, note that to actually disable the transaction log, one would
+ * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go
+ * through all the mft records on the volume and set the usn field in their
+ * $STANDARD_INFORMATION attribute to zero. Once that is done, one would need
+ * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally,
+ * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag.
+ *
+ * Note that if a volume is unmounted whilst the transaction log is being
+ * disabled, the process will continue the next time the volume is mounted.
+ * This is why we can safely mount read-write when we see a transaction log
+ * in the process of being deleted.
+ */
+
+/* Some $UsnJrnl related constants. */
+#define UsnJrnlMajorVer 2
+#define UsnJrnlMinorVer 0
+
+/*
+ * $DATA/$Max attribute. This is (always?) resident and has a fixed size of
+ * 32 bytes. It contains the header describing the transaction log.
+ */
+typedef struct {
+/*Ofs*/
+/* 0*/sle64 maximum_size; /* The maximum on-disk size of the $DATA/$J
+ attribute. */
+/* 8*/sle64 allocation_delta; /* Number of bytes by which to increase the
+ size of the $DATA/$J attribute. */
+/*0x10*/sle64 journal_id; /* Current id of the transaction log. */
+/*0x18*/leUSN lowest_valid_usn; /* Lowest valid usn in $DATA/$J for the
+ current journal_id. */
+/* sizeof() = 32 (0x20) bytes */
+} __attribute__ ((__packed__)) USN_HEADER;
+
+/*
+ * Reason flags (32-bit). Cumulative flags describing the change(s) to the
+ * file since it was last opened. I think the names speak for themselves but
+ * if you disagree check out the descriptions in the Linux NTFS project NTFS
+ * documentation: http://www.linux-ntfs.org/
+ */
+enum {
+ USN_REASON_DATA_OVERWRITE = cpu_to_le32(0x00000001),
+ USN_REASON_DATA_EXTEND = cpu_to_le32(0x00000002),
+ USN_REASON_DATA_TRUNCATION = cpu_to_le32(0x00000004),
+ USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010),
+ USN_REASON_NAMED_DATA_EXTEND = cpu_to_le32(0x00000020),
+ USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
+ USN_REASON_FILE_CREATE = cpu_to_le32(0x00000100),
+ USN_REASON_FILE_DELETE = cpu_to_le32(0x00000200),
+ USN_REASON_EA_CHANGE = cpu_to_le32(0x00000400),
+ USN_REASON_SECURITY_CHANGE = cpu_to_le32(0x00000800),
+ USN_REASON_RENAME_OLD_NAME = cpu_to_le32(0x00001000),
+ USN_REASON_RENAME_NEW_NAME = cpu_to_le32(0x00002000),
+ USN_REASON_INDEXABLE_CHANGE = cpu_to_le32(0x00004000),
+ USN_REASON_BASIC_INFO_CHANGE = cpu_to_le32(0x00008000),
+ USN_REASON_HARD_LINK_CHANGE = cpu_to_le32(0x00010000),
+ USN_REASON_COMPRESSION_CHANGE = cpu_to_le32(0x00020000),
+ USN_REASON_ENCRYPTION_CHANGE = cpu_to_le32(0x00040000),
+ USN_REASON_OBJECT_ID_CHANGE = cpu_to_le32(0x00080000),
+ USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000),
+ USN_REASON_STREAM_CHANGE = cpu_to_le32(0x00200000),
+ USN_REASON_CLOSE = cpu_to_le32(0x80000000),
+};
+
+typedef le32 USN_REASON_FLAGS;
+
+/*
+ * Source info flags (32-bit). Information about the source of the change(s)
+ * to the file. For detailed descriptions of what these mean, see the Linux
+ * NTFS project NTFS documentation:
+ * http://www.linux-ntfs.org/
+ */
+enum {
+ USN_SOURCE_DATA_MANAGEMENT = cpu_to_le32(0x00000001),
+ USN_SOURCE_AUXILIARY_DATA = cpu_to_le32(0x00000002),
+ USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
+};
+
+typedef le32 USN_SOURCE_INFO_FLAGS;
+
+/*
+ * $DATA/$J attribute. This is always non-resident, is marked as sparse, and
+ * is of variabled size. It consists of a sequence of variable size
+ * USN_RECORDS. The minimum allocated_size is allocation_delta as
+ * specified in $DATA/$Max. When the maximum_size specified in $DATA/$Max is
+ * exceeded by more than allocation_delta bytes, allocation_delta bytes are
+ * allocated and appended to the $DATA/$J attribute and an equal number of
+ * bytes at the beginning of the attribute are freed and made sparse. Note the
+ * making sparse only happens at volume checkpoints and hence the actual
+ * $DATA/$J size can exceed maximum_size + allocation_delta temporarily.
+ */
+typedef struct {
+/*Ofs*/
+/* 0*/le32 length; /* Byte size of this record (8-byte
+ aligned). */
+/* 4*/le16 major_ver; /* Major version of the transaction log used
+ for this record. */
+/* 6*/le16 minor_ver; /* Minor version of the transaction log used
+ for this record. */
+/* 8*/leMFT_REF mft_reference;/* The mft reference of the file (or
+ directory) described by this record. */
+/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent
+ directory of the file described by this
+ record. */
+/*0x18*/leUSN usn; /* The usn of this record. Equals the offset
+ within the $DATA/$J attribute. */
+/*0x20*/sle64 time; /* Time when this record was created. */
+/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */
+/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */
+/*0x30*/le32 security_id; /* File security_id copied from
+ $STANDARD_INFORMATION. */
+/*0x34*/FILE_ATTR_FLAGS file_attributes; /* File attributes copied from
+ $STANDARD_INFORMATION or $FILE_NAME (not
+ sure which). */
+/*0x38*/le16 file_name_size; /* Size of the file name in bytes. */
+/*0x3a*/le16 file_name_offset; /* Offset to the file name in bytes from the
+ start of this record. */
+/*0x3c*/ntfschar file_name[0]; /* Use when creating only. When reading use
+ file_name_offset to determine the location
+ of the name. */
+/* sizeof() = 60 (0x3c) bytes */
+} __attribute__ ((__packed__)) USN_RECORD;
+
+extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol);
+
+#endif /* NTFS_RW */
+
+#endif /* _LINUX_NTFS_USNJRNL_H */
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
new file mode 100644
index 000000000..930a9ae8a
--- /dev/null
+++ b/fs/ntfs/volume.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
+ * of the Linux-NTFS project.
+ *
+ * Copyright (c) 2001-2006 Anton Altaparmakov
+ * Copyright (c) 2002 Richard Russon
+ */
+
+#ifndef _LINUX_NTFS_VOLUME_H
+#define _LINUX_NTFS_VOLUME_H
+
+#include <linux/rwsem.h>
+#include <linux/uidgid.h>
+
+#include "types.h"
+#include "layout.h"
+
+/*
+ * The NTFS in memory super block structure.
+ */
+typedef struct {
+ /*
+ * FIXME: Reorder to have commonly used together element within the
+ * same cache line, aiming at a cache line size of 32 bytes. Aim for
+ * 64 bytes for less commonly used together elements. Put most commonly
+ * used elements to front of structure. Obviously do this only when the
+ * structure has stabilized... (AIA)
+ */
+ /* Device specifics. */
+ struct super_block *sb; /* Pointer back to the super_block. */
+ LCN nr_blocks; /* Number of sb->s_blocksize bytes
+ sized blocks on the device. */
+ /* Configuration provided by user at mount time. */
+ unsigned long flags; /* Miscellaneous flags, see below. */
+ kuid_t uid; /* uid that files will be mounted as. */
+ kgid_t gid; /* gid that files will be mounted as. */
+ umode_t fmask; /* The mask for file permissions. */
+ umode_t dmask; /* The mask for directory
+ permissions. */
+ u8 mft_zone_multiplier; /* Initial mft zone multiplier. */
+ u8 on_errors; /* What to do on filesystem errors. */
+ /* NTFS bootsector provided information. */
+ u16 sector_size; /* in bytes */
+ u8 sector_size_bits; /* log2(sector_size) */
+ u32 cluster_size; /* in bytes */
+ u32 cluster_size_mask; /* cluster_size - 1 */
+ u8 cluster_size_bits; /* log2(cluster_size) */
+ u32 mft_record_size; /* in bytes */
+ u32 mft_record_size_mask; /* mft_record_size - 1 */
+ u8 mft_record_size_bits; /* log2(mft_record_size) */
+ u32 index_record_size; /* in bytes */
+ u32 index_record_size_mask; /* index_record_size - 1 */
+ u8 index_record_size_bits; /* log2(index_record_size) */
+ LCN nr_clusters; /* Volume size in clusters == number of
+ bits in lcn bitmap. */
+ LCN mft_lcn; /* Cluster location of mft data. */
+ LCN mftmirr_lcn; /* Cluster location of copy of mft. */
+ u64 serial_no; /* The volume serial number. */
+ /* Mount specific NTFS information. */
+ u32 upcase_len; /* Number of entries in upcase[]. */
+ ntfschar *upcase; /* The upcase table. */
+
+ s32 attrdef_size; /* Size of the attribute definition
+ table in bytes. */
+ ATTR_DEF *attrdef; /* Table of attribute definitions.
+ Obtained from FILE_AttrDef. */
+
+#ifdef NTFS_RW
+ /* Variables used by the cluster and mft allocators. */
+ s64 mft_data_pos; /* Mft record number at which to
+ allocate the next mft record. */
+ LCN mft_zone_start; /* First cluster of the mft zone. */
+ LCN mft_zone_end; /* First cluster beyond the mft zone. */
+ LCN mft_zone_pos; /* Current position in the mft zone. */
+ LCN data1_zone_pos; /* Current position in the first data
+ zone. */
+ LCN data2_zone_pos; /* Current position in the second data
+ zone. */
+#endif /* NTFS_RW */
+
+ struct inode *mft_ino; /* The VFS inode of $MFT. */
+
+ struct inode *mftbmp_ino; /* Attribute inode for $MFT/$BITMAP. */
+ struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the
+ mft record bitmap ($MFT/$BITMAP). */
+#ifdef NTFS_RW
+ struct inode *mftmirr_ino; /* The VFS inode of $MFTMirr. */
+ int mftmirr_size; /* Size of mft mirror in mft records. */
+
+ struct inode *logfile_ino; /* The VFS inode of $LogFile. */
+#endif /* NTFS_RW */
+
+ struct inode *lcnbmp_ino; /* The VFS inode of $Bitmap. */
+ struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the
+ cluster bitmap ($Bitmap/$DATA). */
+
+ struct inode *vol_ino; /* The VFS inode of $Volume. */
+ VOLUME_FLAGS vol_flags; /* Volume flags. */
+ u8 major_ver; /* Ntfs major version of volume. */
+ u8 minor_ver; /* Ntfs minor version of volume. */
+
+ struct inode *root_ino; /* The VFS inode of the root
+ directory. */
+ struct inode *secure_ino; /* The VFS inode of $Secure (NTFS3.0+
+ only, otherwise NULL). */
+ struct inode *extend_ino; /* The VFS inode of $Extend (NTFS3.0+
+ only, otherwise NULL). */
+#ifdef NTFS_RW
+ /* $Quota stuff is NTFS3.0+ specific. Unused/NULL otherwise. */
+ struct inode *quota_ino; /* The VFS inode of $Quota. */
+ struct inode *quota_q_ino; /* Attribute inode for $Quota/$Q. */
+ /* $UsnJrnl stuff is NTFS3.0+ specific. Unused/NULL otherwise. */
+ struct inode *usnjrnl_ino; /* The VFS inode of $UsnJrnl. */
+ struct inode *usnjrnl_max_ino; /* Attribute inode for $UsnJrnl/$Max. */
+ struct inode *usnjrnl_j_ino; /* Attribute inode for $UsnJrnl/$J. */
+#endif /* NTFS_RW */
+ struct nls_table *nls_map;
+} ntfs_volume;
+
+/*
+ * Defined bits for the flags field in the ntfs_volume structure.
+ */
+typedef enum {
+ NV_Errors, /* 1: Volume has errors, prevent remount rw. */
+ NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */
+ NV_CaseSensitive, /* 1: Treat file names as case sensitive and
+ create filenames in the POSIX namespace.
+ Otherwise be case insensitive but still
+ create file names in POSIX namespace. */
+ NV_LogFileEmpty, /* 1: $LogFile journal is empty. */
+ NV_QuotaOutOfDate, /* 1: $Quota is out of date. */
+ NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */
+ NV_SparseEnabled, /* 1: May create sparse files. */
+} ntfs_volume_flags;
+
+/*
+ * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo()
+ * functions.
+ */
+#define DEFINE_NVOL_BIT_OPS(flag) \
+static inline int NVol##flag(ntfs_volume *vol) \
+{ \
+ return test_bit(NV_##flag, &(vol)->flags); \
+} \
+static inline void NVolSet##flag(ntfs_volume *vol) \
+{ \
+ set_bit(NV_##flag, &(vol)->flags); \
+} \
+static inline void NVolClear##flag(ntfs_volume *vol) \
+{ \
+ clear_bit(NV_##flag, &(vol)->flags); \
+}
+
+/* Emit the ntfs volume bitops functions. */
+DEFINE_NVOL_BIT_OPS(Errors)
+DEFINE_NVOL_BIT_OPS(ShowSystemFiles)
+DEFINE_NVOL_BIT_OPS(CaseSensitive)
+DEFINE_NVOL_BIT_OPS(LogFileEmpty)
+DEFINE_NVOL_BIT_OPS(QuotaOutOfDate)
+DEFINE_NVOL_BIT_OPS(UsnJrnlStamped)
+DEFINE_NVOL_BIT_OPS(SparseEnabled)
+
+#endif /* _LINUX_NTFS_VOLUME_H */
diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig
new file mode 100644
index 000000000..6e4cbc48a
--- /dev/null
+++ b/fs/ntfs3/Kconfig
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: GPL-2.0-only
+config NTFS3_FS
+ tristate "NTFS Read-Write file system support"
+ select NLS
+ help
+ Windows OS native file system (NTFS) support up to NTFS version 3.1.
+
+ Y or M enables the NTFS3 driver with full features enabled (read,
+ write, journal replaying, sparse/compressed files support).
+ File system type to use on mount is "ntfs3". Module name (M option)
+ is also "ntfs3".
+
+ Documentation: <file:Documentation/filesystems/ntfs3.rst>
+
+config NTFS3_64BIT_CLUSTER
+ bool "64 bits per NTFS clusters"
+ depends on NTFS3_FS && 64BIT
+ help
+ Windows implementation of ntfs.sys uses 32 bits per clusters.
+ If activated 64 bits per clusters you will be able to use 4k cluster
+ for 16T+ volumes. Windows will not be able to mount such volumes.
+
+ It is recommended to say N here.
+
+config NTFS3_LZX_XPRESS
+ bool "activate support of external compressions lzx/xpress"
+ depends on NTFS3_FS
+ help
+ In Windows 10 one can use command "compact" to compress any files.
+ 4 possible variants of compression are: xpress4k, xpress8k, xpress16k and lzx.
+ If activated you will be able to read such files correctly.
+
+ It is recommended to say Y here.
+
+config NTFS3_FS_POSIX_ACL
+ bool "NTFS POSIX Access Control Lists"
+ depends on NTFS3_FS
+ select FS_POSIX_ACL
+ help
+ POSIX Access Control Lists (ACLs) support additional access rights
+ for users and groups beyond the standard owner/group/world scheme,
+ and this option selects support for ACLs specifically for ntfs
+ filesystems.
+ NOTE: this is linux only feature. Windows will ignore these ACLs.
+
+ If you don't know what Access Control Lists are, say N.
diff --git a/fs/ntfs3/Makefile b/fs/ntfs3/Makefile
new file mode 100644
index 000000000..279701b62
--- /dev/null
+++ b/fs/ntfs3/Makefile
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the ntfs3 filesystem support.
+#
+
+# to check robot warnings
+ccflags-y += -Wint-to-pointer-cast \
+ $(call cc-option,-Wunused-but-set-variable,-Wunused-const-variable) \
+ $(call cc-option,-Wold-style-declaration,-Wout-of-line-declaration)
+
+obj-$(CONFIG_NTFS3_FS) += ntfs3.o
+
+ntfs3-y := attrib.o \
+ attrlist.o \
+ bitfunc.o \
+ bitmap.o \
+ dir.o \
+ fsntfs.o \
+ frecord.o \
+ file.o \
+ fslog.o \
+ inode.o \
+ index.o \
+ lznt.o \
+ namei.o \
+ record.o \
+ run.o \
+ super.o \
+ upcase.o \
+ xattr.o
+
+ntfs3-$(CONFIG_NTFS3_LZX_XPRESS) += $(addprefix lib/,\
+ decompress_common.o \
+ lzx_decompress.o \
+ xpress_decompress.o \
+ ) \ No newline at end of file
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
new file mode 100644
index 000000000..2215179c9
--- /dev/null
+++ b/fs/ntfs3/attrib.c
@@ -0,0 +1,2470 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ * TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame?
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+/*
+ * You can set external NTFS_MIN_LOG2_OF_CLUMP/NTFS_MAX_LOG2_OF_CLUMP to manage
+ * preallocate algorithm.
+ */
+#ifndef NTFS_MIN_LOG2_OF_CLUMP
+#define NTFS_MIN_LOG2_OF_CLUMP 16
+#endif
+
+#ifndef NTFS_MAX_LOG2_OF_CLUMP
+#define NTFS_MAX_LOG2_OF_CLUMP 26
+#endif
+
+// 16M
+#define NTFS_CLUMP_MIN (1 << (NTFS_MIN_LOG2_OF_CLUMP + 8))
+// 16G
+#define NTFS_CLUMP_MAX (1ull << (NTFS_MAX_LOG2_OF_CLUMP + 8))
+
+static inline u64 get_pre_allocated(u64 size)
+{
+ u32 clump;
+ u8 align_shift;
+ u64 ret;
+
+ if (size <= NTFS_CLUMP_MIN) {
+ clump = 1 << NTFS_MIN_LOG2_OF_CLUMP;
+ align_shift = NTFS_MIN_LOG2_OF_CLUMP;
+ } else if (size >= NTFS_CLUMP_MAX) {
+ clump = 1 << NTFS_MAX_LOG2_OF_CLUMP;
+ align_shift = NTFS_MAX_LOG2_OF_CLUMP;
+ } else {
+ align_shift = NTFS_MIN_LOG2_OF_CLUMP - 1 +
+ __ffs(size >> (8 + NTFS_MIN_LOG2_OF_CLUMP));
+ clump = 1u << align_shift;
+ }
+
+ ret = (((size + clump - 1) >> align_shift)) << align_shift;
+
+ return ret;
+}
+
+/*
+ * attr_must_be_resident
+ *
+ * Return: True if attribute must be resident.
+ */
+static inline bool attr_must_be_resident(struct ntfs_sb_info *sbi,
+ enum ATTR_TYPE type)
+{
+ const struct ATTR_DEF_ENTRY *de;
+
+ switch (type) {
+ case ATTR_STD:
+ case ATTR_NAME:
+ case ATTR_ID:
+ case ATTR_LABEL:
+ case ATTR_VOL_INFO:
+ case ATTR_ROOT:
+ case ATTR_EA_INFO:
+ return true;
+ default:
+ de = ntfs_query_def(sbi, type);
+ if (de && (de->flags & NTFS_ATTR_MUST_BE_RESIDENT))
+ return true;
+ return false;
+ }
+}
+
+/*
+ * attr_load_runs - Load all runs stored in @attr.
+ */
+static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
+ struct runs_tree *run, const CLST *vcn)
+{
+ int err;
+ CLST svcn = le64_to_cpu(attr->nres.svcn);
+ CLST evcn = le64_to_cpu(attr->nres.evcn);
+ u32 asize;
+ u16 run_off;
+
+ if (svcn >= evcn + 1 || run_is_mapped_full(run, svcn, evcn))
+ return 0;
+
+ if (vcn && (evcn < *vcn || *vcn < svcn))
+ return -EINVAL;
+
+ asize = le32_to_cpu(attr->size);
+ run_off = le16_to_cpu(attr->nres.run_off);
+
+ if (run_off > asize)
+ return -EINVAL;
+
+ err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn,
+ vcn ? *vcn : svcn, Add2Ptr(attr, run_off),
+ asize - run_off);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+/*
+ * run_deallocate_ex - Deallocate clusters.
+ */
+static int run_deallocate_ex(struct ntfs_sb_info *sbi, struct runs_tree *run,
+ CLST vcn, CLST len, CLST *done, bool trim)
+{
+ int err = 0;
+ CLST vcn_next, vcn0 = vcn, lcn, clen, dn = 0;
+ size_t idx;
+
+ if (!len)
+ goto out;
+
+ if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) {
+failed:
+ run_truncate(run, vcn0);
+ err = -EINVAL;
+ goto out;
+ }
+
+ for (;;) {
+ if (clen > len)
+ clen = len;
+
+ if (!clen) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (lcn != SPARSE_LCN) {
+ if (sbi) {
+ /* mark bitmap range [lcn + clen) as free and trim clusters. */
+ mark_as_free_ex(sbi, lcn, clen, trim);
+ }
+ dn += clen;
+ }
+
+ len -= clen;
+ if (!len)
+ break;
+
+ vcn_next = vcn + clen;
+ if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) ||
+ vcn != vcn_next) {
+ /* Save memory - don't load entire run. */
+ goto failed;
+ }
+ }
+
+out:
+ if (done)
+ *done += dn;
+
+ return err;
+}
+
+/*
+ * attr_allocate_clusters - Find free space, mark it as used and store in @run.
+ */
+int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
+ CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
+ enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
+ CLST *new_lcn)
+{
+ int err;
+ CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0;
+ size_t cnt = run->count;
+
+ for (;;) {
+ err = ntfs_look_for_free_space(sbi, lcn, len + pre, &lcn, &flen,
+ opt);
+
+ if (err == -ENOSPC && pre) {
+ pre = 0;
+ if (*pre_alloc)
+ *pre_alloc = 0;
+ continue;
+ }
+
+ if (err)
+ goto out;
+
+ if (new_lcn && vcn == vcn0)
+ *new_lcn = lcn;
+
+ /* Add new fragment into run storage. */
+ if (!run_add_entry(run, vcn, lcn, flen, opt == ALLOCATE_MFT)) {
+ /* Undo last 'ntfs_look_for_free_space' */
+ mark_as_free_ex(sbi, lcn, len, false);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ vcn += flen;
+
+ if (flen >= len || opt == ALLOCATE_MFT ||
+ (fr && run->count - cnt >= fr)) {
+ *alen = vcn - vcn0;
+ return 0;
+ }
+
+ len -= flen;
+ }
+
+out:
+ /* Undo 'ntfs_look_for_free_space' */
+ if (vcn - vcn0) {
+ run_deallocate_ex(sbi, run, vcn0, vcn - vcn0, NULL, false);
+ run_truncate(run, vcn0);
+ }
+
+ return err;
+}
+
+/*
+ * attr_make_nonresident
+ *
+ * If page is not NULL - it is already contains resident data
+ * and locked (called from ni_write_frame()).
+ */
+int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct ATTR_LIST_ENTRY *le, struct mft_inode *mi,
+ u64 new_size, struct runs_tree *run,
+ struct ATTRIB **ins_attr, struct page *page)
+{
+ struct ntfs_sb_info *sbi;
+ struct ATTRIB *attr_s;
+ struct MFT_REC *rec;
+ u32 used, asize, rsize, aoff, align;
+ bool is_data;
+ CLST len, alen;
+ char *next;
+ int err;
+
+ if (attr->non_res) {
+ *ins_attr = attr;
+ return 0;
+ }
+
+ sbi = mi->sbi;
+ rec = mi->mrec;
+ attr_s = NULL;
+ used = le32_to_cpu(rec->used);
+ asize = le32_to_cpu(attr->size);
+ next = Add2Ptr(attr, asize);
+ aoff = PtrOffset(rec, attr);
+ rsize = le32_to_cpu(attr->res.data_size);
+ is_data = attr->type == ATTR_DATA && !attr->name_len;
+
+ align = sbi->cluster_size;
+ if (is_attr_compressed(attr))
+ align <<= COMPRESSION_UNIT;
+ len = (rsize + align - 1) >> sbi->cluster_bits;
+
+ run_init(run);
+
+ /* Make a copy of original attribute. */
+ attr_s = kmemdup(attr, asize, GFP_NOFS);
+ if (!attr_s) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (!len) {
+ /* Empty resident -> Empty nonresident. */
+ alen = 0;
+ } else {
+ const char *data = resident_data(attr);
+
+ err = attr_allocate_clusters(sbi, run, 0, 0, len, NULL,
+ ALLOCATE_DEF, &alen, 0, NULL);
+ if (err)
+ goto out1;
+
+ if (!rsize) {
+ /* Empty resident -> Non empty nonresident. */
+ } else if (!is_data) {
+ err = ntfs_sb_write_run(sbi, run, 0, data, rsize, 0);
+ if (err)
+ goto out2;
+ } else if (!page) {
+ char *kaddr;
+
+ page = grab_cache_page(ni->vfs_inode.i_mapping, 0);
+ if (!page) {
+ err = -ENOMEM;
+ goto out2;
+ }
+ kaddr = kmap_atomic(page);
+ memcpy(kaddr, data, rsize);
+ memset(kaddr + rsize, 0, PAGE_SIZE - rsize);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ set_page_dirty(page);
+ unlock_page(page);
+ put_page(page);
+ }
+ }
+
+ /* Remove original attribute. */
+ used -= asize;
+ memmove(attr, Add2Ptr(attr, asize), used - aoff);
+ rec->used = cpu_to_le32(used);
+ mi->dirty = true;
+ if (le)
+ al_remove_le(ni, le);
+
+ err = ni_insert_nonresident(ni, attr_s->type, attr_name(attr_s),
+ attr_s->name_len, run, 0, alen,
+ attr_s->flags, &attr, NULL, NULL);
+ if (err)
+ goto out3;
+
+ kfree(attr_s);
+ attr->nres.data_size = cpu_to_le64(rsize);
+ attr->nres.valid_size = attr->nres.data_size;
+
+ *ins_attr = attr;
+
+ if (is_data)
+ ni->ni_flags &= ~NI_FLAG_RESIDENT;
+
+ /* Resident attribute becomes non resident. */
+ return 0;
+
+out3:
+ attr = Add2Ptr(rec, aoff);
+ memmove(next, attr, used - aoff);
+ memcpy(attr, attr_s, asize);
+ rec->used = cpu_to_le32(used + asize);
+ mi->dirty = true;
+out2:
+ /* Undo: do not trim new allocated clusters. */
+ run_deallocate(sbi, run, false);
+ run_close(run);
+out1:
+ kfree(attr_s);
+out:
+ return err;
+}
+
+/*
+ * attr_set_size_res - Helper for attr_set_size().
+ */
+static int attr_set_size_res(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct ATTR_LIST_ENTRY *le, struct mft_inode *mi,
+ u64 new_size, struct runs_tree *run,
+ struct ATTRIB **ins_attr)
+{
+ struct ntfs_sb_info *sbi = mi->sbi;
+ struct MFT_REC *rec = mi->mrec;
+ u32 used = le32_to_cpu(rec->used);
+ u32 asize = le32_to_cpu(attr->size);
+ u32 aoff = PtrOffset(rec, attr);
+ u32 rsize = le32_to_cpu(attr->res.data_size);
+ u32 tail = used - aoff - asize;
+ char *next = Add2Ptr(attr, asize);
+ s64 dsize = ALIGN(new_size, 8) - ALIGN(rsize, 8);
+
+ if (dsize < 0) {
+ memmove(next + dsize, next, tail);
+ } else if (dsize > 0) {
+ if (used + dsize > sbi->max_bytes_per_attr)
+ return attr_make_nonresident(ni, attr, le, mi, new_size,
+ run, ins_attr, NULL);
+
+ memmove(next + dsize, next, tail);
+ memset(next, 0, dsize);
+ }
+
+ if (new_size > rsize)
+ memset(Add2Ptr(resident_data(attr), rsize), 0,
+ new_size - rsize);
+
+ rec->used = cpu_to_le32(used + dsize);
+ attr->size = cpu_to_le32(asize + dsize);
+ attr->res.data_size = cpu_to_le32(new_size);
+ mi->dirty = true;
+ *ins_attr = attr;
+
+ return 0;
+}
+
+/*
+ * attr_set_size - Change the size of attribute.
+ *
+ * Extend:
+ * - Sparse/compressed: No allocated clusters.
+ * - Normal: Append allocated and preallocated new clusters.
+ * Shrink:
+ * - No deallocate if @keep_prealloc is set.
+ */
+int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, struct runs_tree *run,
+ u64 new_size, const u64 *new_valid, bool keep_prealloc,
+ struct ATTRIB **ret)
+{
+ int err = 0;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ u8 cluster_bits = sbi->cluster_bits;
+ bool is_mft =
+ ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && !name_len;
+ u64 old_valid, old_size, old_alloc, new_alloc, new_alloc_tmp;
+ struct ATTRIB *attr = NULL, *attr_b;
+ struct ATTR_LIST_ENTRY *le, *le_b;
+ struct mft_inode *mi, *mi_b;
+ CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn;
+ CLST next_svcn, pre_alloc = -1, done = 0;
+ bool is_ext, is_bad = false;
+ u32 align;
+ struct MFT_REC *rec;
+
+again:
+ alen = 0;
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -ENOENT;
+ goto bad_inode;
+ }
+
+ if (!attr_b->non_res) {
+ err = attr_set_size_res(ni, attr_b, le_b, mi_b, new_size, run,
+ &attr_b);
+ if (err)
+ return err;
+
+ /* Return if file is still resident. */
+ if (!attr_b->non_res)
+ goto ok1;
+
+ /* Layout of records may be changed, so do a full search. */
+ goto again;
+ }
+
+ is_ext = is_attr_ext(attr_b);
+ align = sbi->cluster_size;
+ if (is_ext)
+ align <<= attr_b->nres.c_unit;
+
+ old_valid = le64_to_cpu(attr_b->nres.valid_size);
+ old_size = le64_to_cpu(attr_b->nres.data_size);
+ old_alloc = le64_to_cpu(attr_b->nres.alloc_size);
+
+again_1:
+ old_alen = old_alloc >> cluster_bits;
+
+ new_alloc = (new_size + align - 1) & ~(u64)(align - 1);
+ new_alen = new_alloc >> cluster_bits;
+
+ if (keep_prealloc && new_size < old_size) {
+ attr_b->nres.data_size = cpu_to_le64(new_size);
+ mi_b->dirty = true;
+ goto ok;
+ }
+
+ vcn = old_alen - 1;
+
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn = le64_to_cpu(attr_b->nres.evcn);
+
+ if (svcn <= vcn && vcn <= evcn) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, &vcn,
+ &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+next_le_1:
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+ }
+ /*
+ * Here we have:
+ * attr,mi,le - last attribute segment (containing 'vcn').
+ * attr_b,mi_b,le_b - base (primary) attribute segment.
+ */
+next_le:
+ rec = mi->mrec;
+ err = attr_load_runs(attr, ni, run, NULL);
+ if (err)
+ goto out;
+
+ if (new_size > old_size) {
+ CLST to_allocate;
+ size_t free;
+
+ if (new_alloc <= old_alloc) {
+ attr_b->nres.data_size = cpu_to_le64(new_size);
+ mi_b->dirty = true;
+ goto ok;
+ }
+
+ /*
+ * Add clusters. In simple case we have to:
+ * - allocate space (vcn, lcn, len)
+ * - update packed run in 'mi'
+ * - update attr->nres.evcn
+ * - update attr_b->nres.data_size/attr_b->nres.alloc_size
+ */
+ to_allocate = new_alen - old_alen;
+add_alloc_in_same_attr_seg:
+ lcn = 0;
+ if (is_mft) {
+ /* MFT allocates clusters from MFT zone. */
+ pre_alloc = 0;
+ } else if (is_ext) {
+ /* No preallocate for sparse/compress. */
+ pre_alloc = 0;
+ } else if (pre_alloc == -1) {
+ pre_alloc = 0;
+ if (type == ATTR_DATA && !name_len &&
+ sbi->options->prealloc) {
+ pre_alloc =
+ bytes_to_cluster(
+ sbi,
+ get_pre_allocated(new_size)) -
+ new_alen;
+ }
+
+ /* Get the last LCN to allocate from. */
+ if (old_alen &&
+ !run_lookup_entry(run, vcn, &lcn, NULL, NULL)) {
+ lcn = SPARSE_LCN;
+ }
+
+ if (lcn == SPARSE_LCN)
+ lcn = 0;
+ else if (lcn)
+ lcn += 1;
+
+ free = wnd_zeroes(&sbi->used.bitmap);
+ if (to_allocate > free) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ if (pre_alloc && to_allocate + pre_alloc > free)
+ pre_alloc = 0;
+ }
+
+ vcn = old_alen;
+
+ if (is_ext) {
+ if (!run_add_entry(run, vcn, SPARSE_LCN, to_allocate,
+ false)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ alen = to_allocate;
+ } else {
+ /* ~3 bytes per fragment. */
+ err = attr_allocate_clusters(
+ sbi, run, vcn, lcn, to_allocate, &pre_alloc,
+ is_mft ? ALLOCATE_MFT : 0, &alen,
+ is_mft ? 0
+ : (sbi->record_size -
+ le32_to_cpu(rec->used) + 8) /
+ 3 +
+ 1,
+ NULL);
+ if (err)
+ goto out;
+ }
+
+ done += alen;
+ vcn += alen;
+ if (to_allocate > alen)
+ to_allocate -= alen;
+ else
+ to_allocate = 0;
+
+pack_runs:
+ err = mi_pack_runs(mi, attr, run, vcn - svcn);
+ if (err)
+ goto undo_1;
+
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+ new_alloc_tmp = (u64)next_svcn << cluster_bits;
+ attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp);
+ mi_b->dirty = true;
+
+ if (next_svcn >= vcn && !to_allocate) {
+ /* Normal way. Update attribute and exit. */
+ attr_b->nres.data_size = cpu_to_le64(new_size);
+ goto ok;
+ }
+
+ /* At least two MFT to avoid recursive loop. */
+ if (is_mft && next_svcn == vcn &&
+ ((u64)done << sbi->cluster_bits) >= 2 * sbi->record_size) {
+ new_size = new_alloc_tmp;
+ attr_b->nres.data_size = attr_b->nres.alloc_size;
+ goto ok;
+ }
+
+ if (le32_to_cpu(rec->used) < sbi->record_size) {
+ old_alen = next_svcn;
+ evcn = old_alen - 1;
+ goto add_alloc_in_same_attr_seg;
+ }
+
+ attr_b->nres.data_size = attr_b->nres.alloc_size;
+ if (new_alloc_tmp < old_valid)
+ attr_b->nres.valid_size = attr_b->nres.data_size;
+
+ if (type == ATTR_LIST) {
+ err = ni_expand_list(ni);
+ if (err)
+ goto undo_2;
+ if (next_svcn < vcn)
+ goto pack_runs;
+
+ /* Layout of records is changed. */
+ goto again;
+ }
+
+ if (!ni->attr_list.size) {
+ err = ni_create_attr_list(ni);
+ /* In case of error layout of records is not changed. */
+ if (err)
+ goto undo_2;
+ /* Layout of records is changed. */
+ }
+
+ if (next_svcn >= vcn) {
+ /* This is MFT data, repeat. */
+ goto again;
+ }
+
+ /* Insert new attribute segment. */
+ err = ni_insert_nonresident(ni, type, name, name_len, run,
+ next_svcn, vcn - next_svcn,
+ attr_b->flags, &attr, &mi, NULL);
+
+ /*
+ * Layout of records maybe changed.
+ * Find base attribute to update.
+ */
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len,
+ NULL, &mi_b);
+ if (!attr_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ if (err) {
+ /* ni_insert_nonresident failed. */
+ attr = NULL;
+ goto undo_2;
+ }
+
+ if (!is_mft)
+ run_truncate_head(run, evcn + 1);
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+
+ /*
+ * Attribute is in consistency state.
+ * Save this point to restore to if next steps fail.
+ */
+ old_valid = old_size = old_alloc = (u64)vcn << cluster_bits;
+ attr_b->nres.valid_size = attr_b->nres.data_size =
+ attr_b->nres.alloc_size = cpu_to_le64(old_size);
+ mi_b->dirty = true;
+ goto again_1;
+ }
+
+ if (new_size != old_size ||
+ (new_alloc != old_alloc && !keep_prealloc)) {
+ /*
+ * Truncate clusters. In simple case we have to:
+ * - update packed run in 'mi'
+ * - update attr->nres.evcn
+ * - update attr_b->nres.data_size/attr_b->nres.alloc_size
+ * - mark and trim clusters as free (vcn, lcn, len)
+ */
+ CLST dlen = 0;
+
+ vcn = max(svcn, new_alen);
+ new_alloc_tmp = (u64)vcn << cluster_bits;
+
+ if (vcn > svcn) {
+ err = mi_pack_runs(mi, attr, run, vcn - svcn);
+ if (err)
+ goto out;
+ } else if (le && le->vcn) {
+ u16 le_sz = le16_to_cpu(le->size);
+
+ /*
+ * NOTE: List entries for one attribute are always
+ * the same size. We deal with last entry (vcn==0)
+ * and it is not first in entries array
+ * (list entry for std attribute always first).
+ * So it is safe to step back.
+ */
+ mi_remove_attr(NULL, mi, attr);
+
+ if (!al_remove_le(ni, le)) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz);
+ } else {
+ attr->nres.evcn = cpu_to_le64((u64)vcn - 1);
+ mi->dirty = true;
+ }
+
+ attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp);
+
+ if (vcn == new_alen) {
+ attr_b->nres.data_size = cpu_to_le64(new_size);
+ if (new_size < old_valid)
+ attr_b->nres.valid_size =
+ attr_b->nres.data_size;
+ } else {
+ if (new_alloc_tmp <=
+ le64_to_cpu(attr_b->nres.data_size))
+ attr_b->nres.data_size =
+ attr_b->nres.alloc_size;
+ if (new_alloc_tmp <
+ le64_to_cpu(attr_b->nres.valid_size))
+ attr_b->nres.valid_size =
+ attr_b->nres.alloc_size;
+ }
+ mi_b->dirty = true;
+
+ err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &dlen,
+ true);
+ if (err)
+ goto out;
+
+ if (is_ext) {
+ /* dlen - really deallocated clusters. */
+ le64_sub_cpu(&attr_b->nres.total_size,
+ ((u64)dlen << cluster_bits));
+ }
+
+ run_truncate(run, vcn);
+
+ if (new_alloc_tmp <= new_alloc)
+ goto ok;
+
+ old_size = new_alloc_tmp;
+ vcn = svcn - 1;
+
+ if (le == le_b) {
+ attr = attr_b;
+ mi = mi_b;
+ evcn = svcn - 1;
+ svcn = 0;
+ goto next_le;
+ }
+
+ if (le->type != type || le->name_len != name_len ||
+ memcmp(le_name(le), name, name_len * sizeof(short))) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ err = ni_load_mi(ni, le, &mi);
+ if (err)
+ goto out;
+
+ attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id);
+ if (!attr) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+ goto next_le_1;
+ }
+
+ok:
+ if (new_valid) {
+ __le64 valid = cpu_to_le64(min(*new_valid, new_size));
+
+ if (attr_b->nres.valid_size != valid) {
+ attr_b->nres.valid_size = valid;
+ mi_b->dirty = true;
+ }
+ }
+
+ok1:
+ if (ret)
+ *ret = attr_b;
+
+ /* Update inode_set_bytes. */
+ if (((type == ATTR_DATA && !name_len) ||
+ (type == ATTR_ALLOC && name == I30_NAME))) {
+ bool dirty = false;
+
+ if (ni->vfs_inode.i_size != new_size) {
+ ni->vfs_inode.i_size = new_size;
+ dirty = true;
+ }
+
+ if (attr_b->non_res) {
+ new_alloc = le64_to_cpu(attr_b->nres.alloc_size);
+ if (inode_get_bytes(&ni->vfs_inode) != new_alloc) {
+ inode_set_bytes(&ni->vfs_inode, new_alloc);
+ dirty = true;
+ }
+ }
+
+ if (dirty) {
+ ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ mark_inode_dirty(&ni->vfs_inode);
+ }
+ }
+
+ return 0;
+
+undo_2:
+ vcn -= alen;
+ attr_b->nres.data_size = cpu_to_le64(old_size);
+ attr_b->nres.valid_size = cpu_to_le64(old_valid);
+ attr_b->nres.alloc_size = cpu_to_le64(old_alloc);
+
+ /* Restore 'attr' and 'mi'. */
+ if (attr)
+ goto restore_run;
+
+ if (le64_to_cpu(attr_b->nres.svcn) <= svcn &&
+ svcn <= le64_to_cpu(attr_b->nres.evcn)) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, type, name, name_len,
+ &svcn, &mi);
+ if (!attr)
+ goto bad_inode;
+ }
+
+restore_run:
+ if (mi_pack_runs(mi, attr, run, evcn - svcn + 1))
+ is_bad = true;
+
+undo_1:
+ run_deallocate_ex(sbi, run, vcn, alen, NULL, false);
+
+ run_truncate(run, vcn);
+out:
+ if (is_bad) {
+bad_inode:
+ _ntfs_bad_inode(&ni->vfs_inode);
+ }
+ return err;
+}
+
+int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
+ CLST *len, bool *new)
+{
+ int err = 0;
+ struct runs_tree *run = &ni->file.run;
+ struct ntfs_sb_info *sbi;
+ u8 cluster_bits;
+ struct ATTRIB *attr = NULL, *attr_b;
+ struct ATTR_LIST_ENTRY *le, *le_b;
+ struct mft_inode *mi, *mi_b;
+ CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end;
+ u64 total_size;
+ u32 clst_per_frame;
+ bool ok;
+
+ if (new)
+ *new = false;
+
+ down_read(&ni->file.run_lock);
+ ok = run_lookup_entry(run, vcn, lcn, len, NULL);
+ up_read(&ni->file.run_lock);
+
+ if (ok && (*lcn != SPARSE_LCN || !new)) {
+ /* Normal way. */
+ return 0;
+ }
+
+ if (!clen)
+ clen = 1;
+
+ if (ok && clen > *len)
+ clen = *len;
+
+ sbi = ni->mi.sbi;
+ cluster_bits = sbi->cluster_bits;
+
+ ni_lock(ni);
+ down_write(&ni->file.run_lock);
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b);
+ if (!attr_b) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!attr_b->non_res) {
+ *lcn = RESIDENT_LCN;
+ *len = 1;
+ goto out;
+ }
+
+ asize = le64_to_cpu(attr_b->nres.alloc_size) >> cluster_bits;
+ if (vcn >= asize) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ clst_per_frame = 1u << attr_b->nres.c_unit;
+ to_alloc = (clen + clst_per_frame - 1) & ~(clst_per_frame - 1);
+
+ if (vcn + to_alloc > asize)
+ to_alloc = asize - vcn;
+
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+
+ if (le_b && (vcn < svcn || evcn1 <= vcn)) {
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
+ &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ err = attr_load_runs(attr, ni, run, NULL);
+ if (err)
+ goto out;
+
+ if (!ok) {
+ ok = run_lookup_entry(run, vcn, lcn, len, NULL);
+ if (ok && (*lcn != SPARSE_LCN || !new)) {
+ /* Normal way. */
+ err = 0;
+ goto ok;
+ }
+
+ if (!ok && !new) {
+ *len = 0;
+ err = 0;
+ goto ok;
+ }
+
+ if (ok && clen > *len) {
+ clen = *len;
+ to_alloc = (clen + clst_per_frame - 1) &
+ ~(clst_per_frame - 1);
+ }
+ }
+
+ if (!is_attr_ext(attr_b)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Get the last LCN to allocate from. */
+ hint = 0;
+
+ if (vcn > evcn1) {
+ if (!run_add_entry(run, evcn1, SPARSE_LCN, vcn - evcn1,
+ false)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ } else if (vcn && !run_lookup_entry(run, vcn - 1, &hint, NULL, NULL)) {
+ hint = -1;
+ }
+
+ err = attr_allocate_clusters(
+ sbi, run, vcn, hint + 1, to_alloc, NULL, 0, len,
+ (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1,
+ lcn);
+ if (err)
+ goto out;
+ *new = true;
+
+ end = vcn + *len;
+
+ total_size = le64_to_cpu(attr_b->nres.total_size) +
+ ((u64)*len << cluster_bits);
+
+repack:
+ err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn);
+ if (err)
+ goto out;
+
+ attr_b->nres.total_size = cpu_to_le64(total_size);
+ inode_set_bytes(&ni->vfs_inode, total_size);
+ ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+
+ mi_b->dirty = true;
+ mark_inode_dirty(&ni->vfs_inode);
+
+ /* Stored [vcn : next_svcn) from [vcn : end). */
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+
+ if (end <= evcn1) {
+ if (next_svcn == evcn1) {
+ /* Normal way. Update attribute and exit. */
+ goto ok;
+ }
+ /* Add new segment [next_svcn : evcn1 - next_svcn). */
+ if (!ni->attr_list.size) {
+ err = ni_create_attr_list(ni);
+ if (err)
+ goto out;
+ /* Layout of records is changed. */
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL,
+ 0, NULL, &mi_b);
+ if (!attr_b) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ goto repack;
+ }
+ }
+
+ svcn = evcn1;
+
+ /* Estimate next attribute. */
+ attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi);
+
+ if (attr) {
+ CLST alloc = bytes_to_cluster(
+ sbi, le64_to_cpu(attr_b->nres.alloc_size));
+ CLST evcn = le64_to_cpu(attr->nres.evcn);
+
+ if (end < next_svcn)
+ end = next_svcn;
+ while (end > evcn) {
+ /* Remove segment [svcn : evcn). */
+ mi_remove_attr(NULL, mi, attr);
+
+ if (!al_remove_le(ni, le)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (evcn + 1 >= alloc) {
+ /* Last attribute segment. */
+ evcn1 = evcn + 1;
+ goto ins_ext;
+ }
+
+ if (ni_load_mi(ni, le, &mi)) {
+ attr = NULL;
+ goto out;
+ }
+
+ attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0,
+ &le->id);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+ }
+
+ if (end < svcn)
+ end = svcn;
+
+ err = attr_load_runs(attr, ni, run, &end);
+ if (err)
+ goto out;
+
+ evcn1 = evcn + 1;
+ attr->nres.svcn = cpu_to_le64(next_svcn);
+ err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn);
+ if (err)
+ goto out;
+
+ le->vcn = cpu_to_le64(next_svcn);
+ ni->attr_list.dirty = true;
+ mi->dirty = true;
+
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+ins_ext:
+ if (evcn1 > next_svcn) {
+ err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
+ next_svcn, evcn1 - next_svcn,
+ attr_b->flags, &attr, &mi, NULL);
+ if (err)
+ goto out;
+ }
+ok:
+ run_truncate_around(run, vcn);
+out:
+ up_write(&ni->file.run_lock);
+ ni_unlock(ni);
+
+ return err;
+}
+
+int attr_data_read_resident(struct ntfs_inode *ni, struct page *page)
+{
+ u64 vbo;
+ struct ATTRIB *attr;
+ u32 data_size;
+
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, NULL);
+ if (!attr)
+ return -EINVAL;
+
+ if (attr->non_res)
+ return E_NTFS_NONRESIDENT;
+
+ vbo = page->index << PAGE_SHIFT;
+ data_size = le32_to_cpu(attr->res.data_size);
+ if (vbo < data_size) {
+ const char *data = resident_data(attr);
+ char *kaddr = kmap_atomic(page);
+ u32 use = data_size - vbo;
+
+ if (use > PAGE_SIZE)
+ use = PAGE_SIZE;
+
+ memcpy(kaddr, data + vbo, use);
+ memset(kaddr + use, 0, PAGE_SIZE - use);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ } else if (!PageUptodate(page)) {
+ zero_user_segment(page, 0, PAGE_SIZE);
+ SetPageUptodate(page);
+ }
+
+ return 0;
+}
+
+int attr_data_write_resident(struct ntfs_inode *ni, struct page *page)
+{
+ u64 vbo;
+ struct mft_inode *mi;
+ struct ATTRIB *attr;
+ u32 data_size;
+
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi);
+ if (!attr)
+ return -EINVAL;
+
+ if (attr->non_res) {
+ /* Return special error code to check this case. */
+ return E_NTFS_NONRESIDENT;
+ }
+
+ vbo = page->index << PAGE_SHIFT;
+ data_size = le32_to_cpu(attr->res.data_size);
+ if (vbo < data_size) {
+ char *data = resident_data(attr);
+ char *kaddr = kmap_atomic(page);
+ u32 use = data_size - vbo;
+
+ if (use > PAGE_SIZE)
+ use = PAGE_SIZE;
+ memcpy(data + vbo, kaddr, use);
+ kunmap_atomic(kaddr);
+ mi->dirty = true;
+ }
+ ni->i_valid = data_size;
+
+ return 0;
+}
+
+/*
+ * attr_load_runs_vcn - Load runs with VCN.
+ */
+int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, struct runs_tree *run,
+ CLST vcn)
+{
+ struct ATTRIB *attr;
+ int err;
+ CLST svcn, evcn;
+ u16 ro;
+
+ if (!ni) {
+ /* Is record corrupted? */
+ return -ENOENT;
+ }
+
+ attr = ni_find_attr(ni, NULL, NULL, type, name, name_len, &vcn, NULL);
+ if (!attr) {
+ /* Is record corrupted? */
+ return -ENOENT;
+ }
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+
+ if (evcn < vcn || vcn < svcn) {
+ /* Is record corrupted? */
+ return -EINVAL;
+ }
+
+ ro = le16_to_cpu(attr->nres.run_off);
+
+ if (ro > le32_to_cpu(attr->size))
+ return -EINVAL;
+
+ err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn, svcn,
+ Add2Ptr(attr, ro), le32_to_cpu(attr->size) - ro);
+ if (err < 0)
+ return err;
+ return 0;
+}
+
+/*
+ * attr_load_runs_range - Load runs for given range [from to).
+ */
+int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, struct runs_tree *run,
+ u64 from, u64 to)
+{
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ u8 cluster_bits = sbi->cluster_bits;
+ CLST vcn;
+ CLST vcn_last = (to - 1) >> cluster_bits;
+ CLST lcn, clen;
+ int err;
+
+ for (vcn = from >> cluster_bits; vcn <= vcn_last; vcn += clen) {
+ if (!run_lookup_entry(run, vcn, &lcn, &clen, NULL)) {
+ err = attr_load_runs_vcn(ni, type, name, name_len, run,
+ vcn);
+ if (err)
+ return err;
+ clen = 0; /* Next run_lookup_entry(vcn) must be success. */
+ }
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+/*
+ * attr_wof_frame_info
+ *
+ * Read header of Xpress/LZX file to get info about frame.
+ */
+int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct runs_tree *run, u64 frame, u64 frames,
+ u8 frame_bits, u32 *ondisk_size, u64 *vbo_data)
+{
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ u64 vbo[2], off[2], wof_size;
+ u32 voff;
+ u8 bytes_per_off;
+ char *addr;
+ struct page *page;
+ int i, err;
+ __le32 *off32;
+ __le64 *off64;
+
+ if (ni->vfs_inode.i_size < 0x100000000ull) {
+ /* File starts with array of 32 bit offsets. */
+ bytes_per_off = sizeof(__le32);
+ vbo[1] = frame << 2;
+ *vbo_data = frames << 2;
+ } else {
+ /* File starts with array of 64 bit offsets. */
+ bytes_per_off = sizeof(__le64);
+ vbo[1] = frame << 3;
+ *vbo_data = frames << 3;
+ }
+
+ /*
+ * Read 4/8 bytes at [vbo - 4(8)] == offset where compressed frame starts.
+ * Read 4/8 bytes at [vbo] == offset where compressed frame ends.
+ */
+ if (!attr->non_res) {
+ if (vbo[1] + bytes_per_off > le32_to_cpu(attr->res.data_size)) {
+ ntfs_inode_err(&ni->vfs_inode, "is corrupted");
+ return -EINVAL;
+ }
+ addr = resident_data(attr);
+
+ if (bytes_per_off == sizeof(__le32)) {
+ off32 = Add2Ptr(addr, vbo[1]);
+ off[0] = vbo[1] ? le32_to_cpu(off32[-1]) : 0;
+ off[1] = le32_to_cpu(off32[0]);
+ } else {
+ off64 = Add2Ptr(addr, vbo[1]);
+ off[0] = vbo[1] ? le64_to_cpu(off64[-1]) : 0;
+ off[1] = le64_to_cpu(off64[0]);
+ }
+
+ *vbo_data += off[0];
+ *ondisk_size = off[1] - off[0];
+ return 0;
+ }
+
+ wof_size = le64_to_cpu(attr->nres.data_size);
+ down_write(&ni->file.run_lock);
+ page = ni->file.offs_page;
+ if (!page) {
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+ page->index = -1;
+ ni->file.offs_page = page;
+ }
+ lock_page(page);
+ addr = page_address(page);
+
+ if (vbo[1]) {
+ voff = vbo[1] & (PAGE_SIZE - 1);
+ vbo[0] = vbo[1] - bytes_per_off;
+ i = 0;
+ } else {
+ voff = 0;
+ vbo[0] = 0;
+ off[0] = 0;
+ i = 1;
+ }
+
+ do {
+ pgoff_t index = vbo[i] >> PAGE_SHIFT;
+
+ if (index != page->index) {
+ u64 from = vbo[i] & ~(u64)(PAGE_SIZE - 1);
+ u64 to = min(from + PAGE_SIZE, wof_size);
+
+ err = attr_load_runs_range(ni, ATTR_DATA, WOF_NAME,
+ ARRAY_SIZE(WOF_NAME), run,
+ from, to);
+ if (err)
+ goto out1;
+
+ err = ntfs_bio_pages(sbi, run, &page, 1, from,
+ to - from, REQ_OP_READ);
+ if (err) {
+ page->index = -1;
+ goto out1;
+ }
+ page->index = index;
+ }
+
+ if (i) {
+ if (bytes_per_off == sizeof(__le32)) {
+ off32 = Add2Ptr(addr, voff);
+ off[1] = le32_to_cpu(*off32);
+ } else {
+ off64 = Add2Ptr(addr, voff);
+ off[1] = le64_to_cpu(*off64);
+ }
+ } else if (!voff) {
+ if (bytes_per_off == sizeof(__le32)) {
+ off32 = Add2Ptr(addr, PAGE_SIZE - sizeof(u32));
+ off[0] = le32_to_cpu(*off32);
+ } else {
+ off64 = Add2Ptr(addr, PAGE_SIZE - sizeof(u64));
+ off[0] = le64_to_cpu(*off64);
+ }
+ } else {
+ /* Two values in one page. */
+ if (bytes_per_off == sizeof(__le32)) {
+ off32 = Add2Ptr(addr, voff);
+ off[0] = le32_to_cpu(off32[-1]);
+ off[1] = le32_to_cpu(off32[0]);
+ } else {
+ off64 = Add2Ptr(addr, voff);
+ off[0] = le64_to_cpu(off64[-1]);
+ off[1] = le64_to_cpu(off64[0]);
+ }
+ break;
+ }
+ } while (++i < 2);
+
+ *vbo_data += off[0];
+ *ondisk_size = off[1] - off[0];
+
+out1:
+ unlock_page(page);
+out:
+ up_write(&ni->file.run_lock);
+ return err;
+}
+#endif
+
+/*
+ * attr_is_frame_compressed - Used to detect compressed frame.
+ */
+int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr,
+ CLST frame, CLST *clst_data)
+{
+ int err;
+ u32 clst_frame;
+ CLST clen, lcn, vcn, alen, slen, vcn_next;
+ size_t idx;
+ struct runs_tree *run;
+
+ *clst_data = 0;
+
+ if (!is_attr_compressed(attr))
+ return 0;
+
+ if (!attr->non_res)
+ return 0;
+
+ clst_frame = 1u << attr->nres.c_unit;
+ vcn = frame * clst_frame;
+ run = &ni->file.run;
+
+ if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) {
+ err = attr_load_runs_vcn(ni, attr->type, attr_name(attr),
+ attr->name_len, run, vcn);
+ if (err)
+ return err;
+
+ if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx))
+ return -EINVAL;
+ }
+
+ if (lcn == SPARSE_LCN) {
+ /* Sparsed frame. */
+ return 0;
+ }
+
+ if (clen >= clst_frame) {
+ /*
+ * The frame is not compressed 'cause
+ * it does not contain any sparse clusters.
+ */
+ *clst_data = clst_frame;
+ return 0;
+ }
+
+ alen = bytes_to_cluster(ni->mi.sbi, le64_to_cpu(attr->nres.alloc_size));
+ slen = 0;
+ *clst_data = clen;
+
+ /*
+ * The frame is compressed if *clst_data + slen >= clst_frame.
+ * Check next fragments.
+ */
+ while ((vcn += clen) < alen) {
+ vcn_next = vcn;
+
+ if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) ||
+ vcn_next != vcn) {
+ err = attr_load_runs_vcn(ni, attr->type,
+ attr_name(attr),
+ attr->name_len, run, vcn_next);
+ if (err)
+ return err;
+ vcn = vcn_next;
+
+ if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx))
+ return -EINVAL;
+ }
+
+ if (lcn == SPARSE_LCN) {
+ slen += clen;
+ } else {
+ if (slen) {
+ /*
+ * Data_clusters + sparse_clusters =
+ * not enough for frame.
+ */
+ return -EINVAL;
+ }
+ *clst_data += clen;
+ }
+
+ if (*clst_data + slen >= clst_frame) {
+ if (!slen) {
+ /*
+ * There is no sparsed clusters in this frame
+ * so it is not compressed.
+ */
+ *clst_data = clst_frame;
+ } else {
+ /* Frame is compressed. */
+ }
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * attr_allocate_frame - Allocate/free clusters for @frame.
+ *
+ * Assumed: down_write(&ni->file.run_lock);
+ */
+int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
+ u64 new_valid)
+{
+ int err = 0;
+ struct runs_tree *run = &ni->file.run;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTRIB *attr = NULL, *attr_b;
+ struct ATTR_LIST_ENTRY *le, *le_b;
+ struct mft_inode *mi, *mi_b;
+ CLST svcn, evcn1, next_svcn, lcn, len;
+ CLST vcn, end, clst_data;
+ u64 total_size, valid_size, data_size;
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b);
+ if (!attr_b)
+ return -ENOENT;
+
+ if (!is_attr_ext(attr_b))
+ return -EINVAL;
+
+ vcn = frame << NTFS_LZNT_CUNIT;
+ total_size = le64_to_cpu(attr_b->nres.total_size);
+
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+ data_size = le64_to_cpu(attr_b->nres.data_size);
+
+ if (svcn <= vcn && vcn < evcn1) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ err = -EINVAL;
+ goto out;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
+ &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ err = attr_load_runs(attr, ni, run, NULL);
+ if (err)
+ goto out;
+
+ err = attr_is_frame_compressed(ni, attr_b, frame, &clst_data);
+ if (err)
+ goto out;
+
+ total_size -= (u64)clst_data << sbi->cluster_bits;
+
+ len = bytes_to_cluster(sbi, compr_size);
+
+ if (len == clst_data)
+ goto out;
+
+ if (len < clst_data) {
+ err = run_deallocate_ex(sbi, run, vcn + len, clst_data - len,
+ NULL, true);
+ if (err)
+ goto out;
+
+ if (!run_add_entry(run, vcn + len, SPARSE_LCN, clst_data - len,
+ false)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ end = vcn + clst_data;
+ /* Run contains updated range [vcn + len : end). */
+ } else {
+ CLST alen, hint = 0;
+ /* Get the last LCN to allocate from. */
+ if (vcn + clst_data &&
+ !run_lookup_entry(run, vcn + clst_data - 1, &hint, NULL,
+ NULL)) {
+ hint = -1;
+ }
+
+ err = attr_allocate_clusters(sbi, run, vcn + clst_data,
+ hint + 1, len - clst_data, NULL, 0,
+ &alen, 0, &lcn);
+ if (err)
+ goto out;
+
+ end = vcn + len;
+ /* Run contains updated range [vcn + clst_data : end). */
+ }
+
+ total_size += (u64)len << sbi->cluster_bits;
+
+repack:
+ err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn);
+ if (err)
+ goto out;
+
+ attr_b->nres.total_size = cpu_to_le64(total_size);
+ inode_set_bytes(&ni->vfs_inode, total_size);
+
+ mi_b->dirty = true;
+ mark_inode_dirty(&ni->vfs_inode);
+
+ /* Stored [vcn : next_svcn) from [vcn : end). */
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+
+ if (end <= evcn1) {
+ if (next_svcn == evcn1) {
+ /* Normal way. Update attribute and exit. */
+ goto ok;
+ }
+ /* Add new segment [next_svcn : evcn1 - next_svcn). */
+ if (!ni->attr_list.size) {
+ err = ni_create_attr_list(ni);
+ if (err)
+ goto out;
+ /* Layout of records is changed. */
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL,
+ 0, NULL, &mi_b);
+ if (!attr_b)
+ return -ENOENT;
+
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ goto repack;
+ }
+ }
+
+ svcn = evcn1;
+
+ /* Estimate next attribute. */
+ attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi);
+
+ if (attr) {
+ CLST alloc = bytes_to_cluster(
+ sbi, le64_to_cpu(attr_b->nres.alloc_size));
+ CLST evcn = le64_to_cpu(attr->nres.evcn);
+
+ if (end < next_svcn)
+ end = next_svcn;
+ while (end > evcn) {
+ /* Remove segment [svcn : evcn). */
+ mi_remove_attr(NULL, mi, attr);
+
+ if (!al_remove_le(ni, le)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (evcn + 1 >= alloc) {
+ /* Last attribute segment. */
+ evcn1 = evcn + 1;
+ goto ins_ext;
+ }
+
+ if (ni_load_mi(ni, le, &mi)) {
+ attr = NULL;
+ goto out;
+ }
+
+ attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0,
+ &le->id);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+ }
+
+ if (end < svcn)
+ end = svcn;
+
+ err = attr_load_runs(attr, ni, run, &end);
+ if (err)
+ goto out;
+
+ evcn1 = evcn + 1;
+ attr->nres.svcn = cpu_to_le64(next_svcn);
+ err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn);
+ if (err)
+ goto out;
+
+ le->vcn = cpu_to_le64(next_svcn);
+ ni->attr_list.dirty = true;
+ mi->dirty = true;
+
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+ins_ext:
+ if (evcn1 > next_svcn) {
+ err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
+ next_svcn, evcn1 - next_svcn,
+ attr_b->flags, &attr, &mi, NULL);
+ if (err)
+ goto out;
+ }
+ok:
+ run_truncate_around(run, vcn);
+out:
+ if (new_valid > data_size)
+ new_valid = data_size;
+
+ valid_size = le64_to_cpu(attr_b->nres.valid_size);
+ if (new_valid != valid_size) {
+ attr_b->nres.valid_size = cpu_to_le64(valid_size);
+ mi_b->dirty = true;
+ }
+
+ return err;
+}
+
+/*
+ * attr_collapse_range - Collapse range in file.
+ */
+int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
+{
+ int err = 0;
+ struct runs_tree *run = &ni->file.run;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTRIB *attr = NULL, *attr_b;
+ struct ATTR_LIST_ENTRY *le, *le_b;
+ struct mft_inode *mi, *mi_b;
+ CLST svcn, evcn1, len, dealloc, alen;
+ CLST vcn, end;
+ u64 valid_size, data_size, alloc_size, total_size;
+ u32 mask;
+ __le16 a_flags;
+
+ if (!bytes)
+ return 0;
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b);
+ if (!attr_b)
+ return -ENOENT;
+
+ if (!attr_b->non_res) {
+ /* Attribute is resident. Nothing to do? */
+ return 0;
+ }
+
+ data_size = le64_to_cpu(attr_b->nres.data_size);
+ alloc_size = le64_to_cpu(attr_b->nres.alloc_size);
+ a_flags = attr_b->flags;
+
+ if (is_attr_ext(attr_b)) {
+ total_size = le64_to_cpu(attr_b->nres.total_size);
+ mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1;
+ } else {
+ total_size = alloc_size;
+ mask = sbi->cluster_mask;
+ }
+
+ if ((vbo & mask) || (bytes & mask)) {
+ /* Allow to collapse only cluster aligned ranges. */
+ return -EINVAL;
+ }
+
+ if (vbo > data_size)
+ return -EINVAL;
+
+ down_write(&ni->file.run_lock);
+
+ if (vbo + bytes >= data_size) {
+ u64 new_valid = min(ni->i_valid, vbo);
+
+ /* Simple truncate file at 'vbo'. */
+ truncate_setsize(&ni->vfs_inode, vbo);
+ err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, vbo,
+ &new_valid, true, NULL);
+
+ if (!err && new_valid < ni->i_valid)
+ ni->i_valid = new_valid;
+
+ goto out;
+ }
+
+ /*
+ * Enumerate all attribute segments and collapse.
+ */
+ alen = alloc_size >> sbi->cluster_bits;
+ vcn = vbo >> sbi->cluster_bits;
+ len = bytes >> sbi->cluster_bits;
+ end = vcn + len;
+ dealloc = 0;
+
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+
+ if (svcn <= vcn && vcn < evcn1) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ err = -EINVAL;
+ goto out;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
+ &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ for (;;) {
+ if (svcn >= end) {
+ /* Shift VCN- */
+ attr->nres.svcn = cpu_to_le64(svcn - len);
+ attr->nres.evcn = cpu_to_le64(evcn1 - 1 - len);
+ if (le) {
+ le->vcn = attr->nres.svcn;
+ ni->attr_list.dirty = true;
+ }
+ mi->dirty = true;
+ } else if (svcn < vcn || end < evcn1) {
+ CLST vcn1, eat, next_svcn;
+
+ /* Collapse a part of this attribute segment. */
+ err = attr_load_runs(attr, ni, run, &svcn);
+ if (err)
+ goto out;
+ vcn1 = max(vcn, svcn);
+ eat = min(end, evcn1) - vcn1;
+
+ err = run_deallocate_ex(sbi, run, vcn1, eat, &dealloc,
+ true);
+ if (err)
+ goto out;
+
+ if (!run_collapse_range(run, vcn1, eat)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (svcn >= vcn) {
+ /* Shift VCN */
+ attr->nres.svcn = cpu_to_le64(vcn);
+ if (le) {
+ le->vcn = attr->nres.svcn;
+ ni->attr_list.dirty = true;
+ }
+ }
+
+ err = mi_pack_runs(mi, attr, run, evcn1 - svcn - eat);
+ if (err)
+ goto out;
+
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+ if (next_svcn + eat < evcn1) {
+ err = ni_insert_nonresident(
+ ni, ATTR_DATA, NULL, 0, run, next_svcn,
+ evcn1 - eat - next_svcn, a_flags, &attr,
+ &mi, &le);
+ if (err)
+ goto out;
+
+ /* Layout of records maybe changed. */
+ attr_b = NULL;
+ }
+
+ /* Free all allocated memory. */
+ run_truncate(run, 0);
+ } else {
+ u16 le_sz;
+ u16 roff = le16_to_cpu(attr->nres.run_off);
+
+ if (roff > le32_to_cpu(attr->size)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn,
+ evcn1 - 1, svcn, Add2Ptr(attr, roff),
+ le32_to_cpu(attr->size) - roff);
+
+ /* Delete this attribute segment. */
+ mi_remove_attr(NULL, mi, attr);
+ if (!le)
+ break;
+
+ le_sz = le16_to_cpu(le->size);
+ if (!al_remove_le(ni, le)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (evcn1 >= alen)
+ break;
+
+ if (!svcn) {
+ /* Load next record that contains this attribute. */
+ if (ni_load_mi(ni, le, &mi)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Look for required attribute. */
+ attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL,
+ 0, &le->id);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+ goto next_attr;
+ }
+ le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz);
+ }
+
+ if (evcn1 >= alen)
+ break;
+
+ attr = ni_enum_attr_ex(ni, attr, &le, &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+next_attr:
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ if (!attr_b) {
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -ENOENT;
+ goto out;
+ }
+ }
+
+ data_size -= bytes;
+ valid_size = ni->i_valid;
+ if (vbo + bytes <= valid_size)
+ valid_size -= bytes;
+ else if (vbo < valid_size)
+ valid_size = vbo;
+
+ attr_b->nres.alloc_size = cpu_to_le64(alloc_size - bytes);
+ attr_b->nres.data_size = cpu_to_le64(data_size);
+ attr_b->nres.valid_size = cpu_to_le64(min(valid_size, data_size));
+ total_size -= (u64)dealloc << sbi->cluster_bits;
+ if (is_attr_ext(attr_b))
+ attr_b->nres.total_size = cpu_to_le64(total_size);
+ mi_b->dirty = true;
+
+ /* Update inode size. */
+ ni->i_valid = valid_size;
+ ni->vfs_inode.i_size = data_size;
+ inode_set_bytes(&ni->vfs_inode, total_size);
+ ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ mark_inode_dirty(&ni->vfs_inode);
+
+out:
+ up_write(&ni->file.run_lock);
+ if (err)
+ _ntfs_bad_inode(&ni->vfs_inode);
+
+ return err;
+}
+
+/*
+ * attr_punch_hole
+ *
+ * Not for normal files.
+ */
+int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size)
+{
+ int err = 0;
+ struct runs_tree *run = &ni->file.run;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTRIB *attr = NULL, *attr_b;
+ struct ATTR_LIST_ENTRY *le, *le_b;
+ struct mft_inode *mi, *mi_b;
+ CLST svcn, evcn1, vcn, len, end, alen, hole, next_svcn;
+ u64 total_size, alloc_size;
+ u32 mask;
+ __le16 a_flags;
+ struct runs_tree run2;
+
+ if (!bytes)
+ return 0;
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b);
+ if (!attr_b)
+ return -ENOENT;
+
+ if (!attr_b->non_res) {
+ u32 data_size = le32_to_cpu(attr_b->res.data_size);
+ u32 from, to;
+
+ if (vbo > data_size)
+ return 0;
+
+ from = vbo;
+ to = min_t(u64, vbo + bytes, data_size);
+ memset(Add2Ptr(resident_data(attr_b), from), 0, to - from);
+ return 0;
+ }
+
+ if (!is_attr_ext(attr_b))
+ return -EOPNOTSUPP;
+
+ alloc_size = le64_to_cpu(attr_b->nres.alloc_size);
+ total_size = le64_to_cpu(attr_b->nres.total_size);
+
+ if (vbo >= alloc_size) {
+ /* NOTE: It is allowed. */
+ return 0;
+ }
+
+ mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1;
+
+ bytes += vbo;
+ if (bytes > alloc_size)
+ bytes = alloc_size;
+ bytes -= vbo;
+
+ if ((vbo & mask) || (bytes & mask)) {
+ /* We have to zero a range(s). */
+ if (frame_size == NULL) {
+ /* Caller insists range is aligned. */
+ return -EINVAL;
+ }
+ *frame_size = mask + 1;
+ return E_NTFS_NOTALIGNED;
+ }
+
+ down_write(&ni->file.run_lock);
+ run_init(&run2);
+ run_truncate(run, 0);
+
+ /*
+ * Enumerate all attribute segments and punch hole where necessary.
+ */
+ alen = alloc_size >> sbi->cluster_bits;
+ vcn = vbo >> sbi->cluster_bits;
+ len = bytes >> sbi->cluster_bits;
+ end = vcn + len;
+ hole = 0;
+
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+ a_flags = attr_b->flags;
+
+ if (svcn <= vcn && vcn < evcn1) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
+ &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ while (svcn < end) {
+ CLST vcn1, zero, hole2 = hole;
+
+ err = attr_load_runs(attr, ni, run, &svcn);
+ if (err)
+ goto done;
+ vcn1 = max(vcn, svcn);
+ zero = min(end, evcn1) - vcn1;
+
+ /*
+ * Check range [vcn1 + zero).
+ * Calculate how many clusters there are.
+ * Don't do any destructive actions.
+ */
+ err = run_deallocate_ex(NULL, run, vcn1, zero, &hole2, false);
+ if (err)
+ goto done;
+
+ /* Check if required range is already hole. */
+ if (hole2 == hole)
+ goto next_attr;
+
+ /* Make a clone of run to undo. */
+ err = run_clone(run, &run2);
+ if (err)
+ goto done;
+
+ /* Make a hole range (sparse) [vcn1 + zero). */
+ if (!run_add_entry(run, vcn1, SPARSE_LCN, zero, false)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ /* Update run in attribute segment. */
+ err = mi_pack_runs(mi, attr, run, evcn1 - svcn);
+ if (err)
+ goto done;
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+ if (next_svcn < evcn1) {
+ /* Insert new attribute segment. */
+ err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
+ next_svcn,
+ evcn1 - next_svcn, a_flags,
+ &attr, &mi, &le);
+ if (err)
+ goto undo_punch;
+
+ /* Layout of records maybe changed. */
+ attr_b = NULL;
+ }
+
+ /* Real deallocate. Should not fail. */
+ run_deallocate_ex(sbi, &run2, vcn1, zero, &hole, true);
+
+next_attr:
+ /* Free all allocated memory. */
+ run_truncate(run, 0);
+
+ if (evcn1 >= alen)
+ break;
+
+ /* Get next attribute segment. */
+ attr = ni_enum_attr_ex(ni, attr, &le, &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+done:
+ if (!hole)
+ goto out;
+
+ if (!attr_b) {
+ attr_b = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+ }
+
+ total_size -= (u64)hole << sbi->cluster_bits;
+ attr_b->nres.total_size = cpu_to_le64(total_size);
+ mi_b->dirty = true;
+
+ /* Update inode size. */
+ inode_set_bytes(&ni->vfs_inode, total_size);
+ ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ mark_inode_dirty(&ni->vfs_inode);
+
+out:
+ run_close(&run2);
+ up_write(&ni->file.run_lock);
+ return err;
+
+bad_inode:
+ _ntfs_bad_inode(&ni->vfs_inode);
+ goto out;
+
+undo_punch:
+ /*
+ * Restore packed runs.
+ * 'mi_pack_runs' should not fail, cause we restore original.
+ */
+ if (mi_pack_runs(mi, attr, &run2, evcn1 - svcn))
+ goto bad_inode;
+
+ goto done;
+}
+
+/*
+ * attr_insert_range - Insert range (hole) in file.
+ * Not for normal files.
+ */
+int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
+{
+ int err = 0;
+ struct runs_tree *run = &ni->file.run;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTRIB *attr = NULL, *attr_b;
+ struct ATTR_LIST_ENTRY *le, *le_b;
+ struct mft_inode *mi, *mi_b;
+ CLST vcn, svcn, evcn1, len, next_svcn;
+ u64 data_size, alloc_size;
+ u32 mask;
+ __le16 a_flags;
+
+ if (!bytes)
+ return 0;
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b);
+ if (!attr_b)
+ return -ENOENT;
+
+ if (!is_attr_ext(attr_b)) {
+ /* It was checked above. See fallocate. */
+ return -EOPNOTSUPP;
+ }
+
+ if (!attr_b->non_res) {
+ data_size = le32_to_cpu(attr_b->res.data_size);
+ alloc_size = data_size;
+ mask = sbi->cluster_mask; /* cluster_size - 1 */
+ } else {
+ data_size = le64_to_cpu(attr_b->nres.data_size);
+ alloc_size = le64_to_cpu(attr_b->nres.alloc_size);
+ mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1;
+ }
+
+ if (vbo > data_size) {
+ /* Insert range after the file size is not allowed. */
+ return -EINVAL;
+ }
+
+ if ((vbo & mask) || (bytes & mask)) {
+ /* Allow to insert only frame aligned ranges. */
+ return -EINVAL;
+ }
+
+ /*
+ * valid_size <= data_size <= alloc_size
+ * Check alloc_size for maximum possible.
+ */
+ if (bytes > sbi->maxbytes_sparse - alloc_size)
+ return -EFBIG;
+
+ vcn = vbo >> sbi->cluster_bits;
+ len = bytes >> sbi->cluster_bits;
+
+ down_write(&ni->file.run_lock);
+
+ if (!attr_b->non_res) {
+ err = attr_set_size(ni, ATTR_DATA, NULL, 0, run,
+ data_size + bytes, NULL, false, NULL);
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ if (err)
+ goto out;
+
+ if (!attr_b->non_res) {
+ /* Still resident. */
+ char *data = Add2Ptr(attr_b, attr_b->res.data_off);
+
+ memmove(data + bytes, data, bytes);
+ memset(data, 0, bytes);
+ goto done;
+ }
+
+ /* Resident files becomes nonresident. */
+ data_size = le64_to_cpu(attr_b->nres.data_size);
+ alloc_size = le64_to_cpu(attr_b->nres.alloc_size);
+ }
+
+ /*
+ * Enumerate all attribute segments and shift start vcn.
+ */
+ a_flags = attr_b->flags;
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+
+ if (svcn <= vcn && vcn < evcn1) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
+ &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ run_truncate(run, 0); /* clear cached values. */
+ err = attr_load_runs(attr, ni, run, NULL);
+ if (err)
+ goto out;
+
+ if (!run_insert_range(run, vcn, len)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Try to pack in current record as much as possible. */
+ err = mi_pack_runs(mi, attr, run, evcn1 + len - svcn);
+ if (err)
+ goto out;
+
+ next_svcn = le64_to_cpu(attr->nres.evcn) + 1;
+
+ while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) &&
+ attr->type == ATTR_DATA && !attr->name_len) {
+ le64_add_cpu(&attr->nres.svcn, len);
+ le64_add_cpu(&attr->nres.evcn, len);
+ if (le) {
+ le->vcn = attr->nres.svcn;
+ ni->attr_list.dirty = true;
+ }
+ mi->dirty = true;
+ }
+
+ if (next_svcn < evcn1 + len) {
+ err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run,
+ next_svcn, evcn1 + len - next_svcn,
+ a_flags, NULL, NULL, NULL);
+
+ le_b = NULL;
+ attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL,
+ &mi_b);
+ if (!attr_b) {
+ err = -EINVAL;
+ goto bad_inode;
+ }
+
+ if (err) {
+ /* ni_insert_nonresident failed. Try to undo. */
+ goto undo_insert_range;
+ }
+ }
+
+ /*
+ * Update primary attribute segment.
+ */
+ if (vbo <= ni->i_valid)
+ ni->i_valid += bytes;
+
+ attr_b->nres.data_size = le64_to_cpu(data_size + bytes);
+ attr_b->nres.alloc_size = le64_to_cpu(alloc_size + bytes);
+
+ /* ni->valid may be not equal valid_size (temporary). */
+ if (ni->i_valid > data_size + bytes)
+ attr_b->nres.valid_size = attr_b->nres.data_size;
+ else
+ attr_b->nres.valid_size = cpu_to_le64(ni->i_valid);
+ mi_b->dirty = true;
+
+done:
+ ni->vfs_inode.i_size += bytes;
+ ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ mark_inode_dirty(&ni->vfs_inode);
+
+out:
+ run_truncate(run, 0); /* clear cached values. */
+
+ up_write(&ni->file.run_lock);
+
+ return err;
+
+bad_inode:
+ _ntfs_bad_inode(&ni->vfs_inode);
+ goto out;
+
+undo_insert_range:
+ svcn = le64_to_cpu(attr_b->nres.svcn);
+ evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1;
+
+ if (svcn <= vcn && vcn < evcn1) {
+ attr = attr_b;
+ le = le_b;
+ mi = mi_b;
+ } else if (!le_b) {
+ goto bad_inode;
+ } else {
+ le = le_b;
+ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn,
+ &mi);
+ if (!attr) {
+ goto bad_inode;
+ }
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn1 = le64_to_cpu(attr->nres.evcn) + 1;
+ }
+
+ if (attr_load_runs(attr, ni, run, NULL))
+ goto bad_inode;
+
+ if (!run_collapse_range(run, vcn, len))
+ goto bad_inode;
+
+ if (mi_pack_runs(mi, attr, run, evcn1 + len - svcn))
+ goto bad_inode;
+
+ while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) &&
+ attr->type == ATTR_DATA && !attr->name_len) {
+ le64_sub_cpu(&attr->nres.svcn, len);
+ le64_sub_cpu(&attr->nres.evcn, len);
+ if (le) {
+ le->vcn = attr->nres.svcn;
+ ni->attr_list.dirty = true;
+ }
+ mi->dirty = true;
+ }
+
+ goto out;
+}
diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c
new file mode 100644
index 000000000..0c6a68e71
--- /dev/null
+++ b/fs/ntfs3/attrlist.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/fs.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+/*
+ * al_is_valid_le
+ *
+ * Return: True if @le is valid.
+ */
+static inline bool al_is_valid_le(const struct ntfs_inode *ni,
+ struct ATTR_LIST_ENTRY *le)
+{
+ if (!le || !ni->attr_list.le || !ni->attr_list.size)
+ return false;
+
+ return PtrOffset(ni->attr_list.le, le) + le16_to_cpu(le->size) <=
+ ni->attr_list.size;
+}
+
+void al_destroy(struct ntfs_inode *ni)
+{
+ run_close(&ni->attr_list.run);
+ kfree(ni->attr_list.le);
+ ni->attr_list.le = NULL;
+ ni->attr_list.size = 0;
+ ni->attr_list.dirty = false;
+}
+
+/*
+ * ntfs_load_attr_list
+ *
+ * This method makes sure that the ATTRIB list, if present,
+ * has been properly set up.
+ */
+int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr)
+{
+ int err;
+ size_t lsize;
+ void *le = NULL;
+
+ if (ni->attr_list.size)
+ return 0;
+
+ if (!attr->non_res) {
+ lsize = le32_to_cpu(attr->res.data_size);
+ /* attr is resident: lsize < record_size (1K or 4K) */
+ le = kvmalloc(al_aligned(lsize), GFP_KERNEL);
+ if (!le) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memcpy(le, resident_data(attr), lsize);
+ } else if (attr->nres.svcn) {
+ err = -EINVAL;
+ goto out;
+ } else {
+ u16 run_off = le16_to_cpu(attr->nres.run_off);
+
+ lsize = le64_to_cpu(attr->nres.data_size);
+
+ run_init(&ni->attr_list.run);
+
+ if (run_off > le32_to_cpu(attr->size)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = run_unpack_ex(&ni->attr_list.run, ni->mi.sbi, ni->mi.rno,
+ 0, le64_to_cpu(attr->nres.evcn), 0,
+ Add2Ptr(attr, run_off),
+ le32_to_cpu(attr->size) - run_off);
+ if (err < 0)
+ goto out;
+
+ /* attr is nonresident.
+ * The worst case:
+ * 1T (2^40) extremely fragmented file.
+ * cluster = 4K (2^12) => 2^28 fragments
+ * 2^9 fragments per one record => 2^19 records
+ * 2^5 bytes of ATTR_LIST_ENTRY per one record => 2^24 bytes.
+ *
+ * the result is 16M bytes per attribute list.
+ * Use kvmalloc to allocate in range [several Kbytes - dozen Mbytes]
+ */
+ le = kvmalloc(al_aligned(lsize), GFP_KERNEL);
+ if (!le) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ntfs_read_run_nb(ni->mi.sbi, &ni->attr_list.run, 0, le,
+ lsize, NULL);
+ if (err)
+ goto out;
+ }
+
+ ni->attr_list.size = lsize;
+ ni->attr_list.le = le;
+
+ return 0;
+
+out:
+ ni->attr_list.le = le;
+ al_destroy(ni);
+
+ return err;
+}
+
+/*
+ * al_enumerate
+ *
+ * Return:
+ * * The next list le.
+ * * If @le is NULL then return the first le.
+ */
+struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
+ struct ATTR_LIST_ENTRY *le)
+{
+ size_t off;
+ u16 sz;
+
+ if (!le) {
+ le = ni->attr_list.le;
+ } else {
+ sz = le16_to_cpu(le->size);
+ if (sz < sizeof(struct ATTR_LIST_ENTRY)) {
+ /* Impossible 'cause we should not return such le. */
+ return NULL;
+ }
+ le = Add2Ptr(le, sz);
+ }
+
+ /* Check boundary. */
+ off = PtrOffset(ni->attr_list.le, le);
+ if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) {
+ /* The regular end of list. */
+ return NULL;
+ }
+
+ sz = le16_to_cpu(le->size);
+
+ /* Check le for errors. */
+ if (sz < sizeof(struct ATTR_LIST_ENTRY) ||
+ off + sz > ni->attr_list.size ||
+ sz < le->name_off + le->name_len * sizeof(short)) {
+ return NULL;
+ }
+
+ return le;
+}
+
+/*
+ * al_find_le
+ *
+ * Find the first le in the list which matches type, name and VCN.
+ *
+ * Return: NULL if not found.
+ */
+struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni,
+ struct ATTR_LIST_ENTRY *le,
+ const struct ATTRIB *attr)
+{
+ CLST svcn = attr_svcn(attr);
+
+ return al_find_ex(ni, le, attr->type, attr_name(attr), attr->name_len,
+ &svcn);
+}
+
+/*
+ * al_find_ex
+ *
+ * Find the first le in the list which matches type, name and VCN.
+ *
+ * Return: NULL if not found.
+ */
+struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni,
+ struct ATTR_LIST_ENTRY *le,
+ enum ATTR_TYPE type, const __le16 *name,
+ u8 name_len, const CLST *vcn)
+{
+ struct ATTR_LIST_ENTRY *ret = NULL;
+ u32 type_in = le32_to_cpu(type);
+
+ while ((le = al_enumerate(ni, le))) {
+ u64 le_vcn;
+ int diff = le32_to_cpu(le->type) - type_in;
+
+ /* List entries are sorted by type, name and VCN. */
+ if (diff < 0)
+ continue;
+
+ if (diff > 0)
+ return ret;
+
+ if (le->name_len != name_len)
+ continue;
+
+ le_vcn = le64_to_cpu(le->vcn);
+ if (!le_vcn) {
+ /*
+ * Compare entry names only for entry with vcn == 0.
+ */
+ diff = ntfs_cmp_names(le_name(le), name_len, name,
+ name_len, ni->mi.sbi->upcase,
+ true);
+ if (diff < 0)
+ continue;
+
+ if (diff > 0)
+ return ret;
+ }
+
+ if (!vcn)
+ return le;
+
+ if (*vcn == le_vcn)
+ return le;
+
+ if (*vcn < le_vcn)
+ return ret;
+
+ ret = le;
+ }
+
+ return ret;
+}
+
+/*
+ * al_find_le_to_insert
+ *
+ * Find the first list entry which matches type, name and VCN.
+ */
+static struct ATTR_LIST_ENTRY *al_find_le_to_insert(struct ntfs_inode *ni,
+ enum ATTR_TYPE type,
+ const __le16 *name,
+ u8 name_len, CLST vcn)
+{
+ struct ATTR_LIST_ENTRY *le = NULL, *prev;
+ u32 type_in = le32_to_cpu(type);
+
+ /* List entries are sorted by type, name and VCN. */
+ while ((le = al_enumerate(ni, prev = le))) {
+ int diff = le32_to_cpu(le->type) - type_in;
+
+ if (diff < 0)
+ continue;
+
+ if (diff > 0)
+ return le;
+
+ if (!le->vcn) {
+ /*
+ * Compare entry names only for entry with vcn == 0.
+ */
+ diff = ntfs_cmp_names(le_name(le), le->name_len, name,
+ name_len, ni->mi.sbi->upcase,
+ true);
+ if (diff < 0)
+ continue;
+
+ if (diff > 0)
+ return le;
+ }
+
+ if (le64_to_cpu(le->vcn) >= vcn)
+ return le;
+ }
+
+ return prev ? Add2Ptr(prev, le16_to_cpu(prev->size)) : ni->attr_list.le;
+}
+
+/*
+ * al_add_le
+ *
+ * Add an "attribute list entry" to the list.
+ */
+int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name,
+ u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref,
+ struct ATTR_LIST_ENTRY **new_le)
+{
+ int err;
+ struct ATTRIB *attr;
+ struct ATTR_LIST_ENTRY *le;
+ size_t off;
+ u16 sz;
+ size_t asize, new_asize, old_size;
+ u64 new_size;
+ typeof(ni->attr_list) *al = &ni->attr_list;
+
+ /*
+ * Compute the size of the new 'le'
+ */
+ sz = le_size(name_len);
+ old_size = al->size;
+ new_size = old_size + sz;
+ asize = al_aligned(old_size);
+ new_asize = al_aligned(new_size);
+
+ /* Scan forward to the point at which the new 'le' should be inserted. */
+ le = al_find_le_to_insert(ni, type, name, name_len, svcn);
+ off = PtrOffset(al->le, le);
+
+ if (new_size > asize) {
+ void *ptr = kmalloc(new_asize, GFP_NOFS);
+
+ if (!ptr)
+ return -ENOMEM;
+
+ memcpy(ptr, al->le, off);
+ memcpy(Add2Ptr(ptr, off + sz), le, old_size - off);
+ le = Add2Ptr(ptr, off);
+ kfree(al->le);
+ al->le = ptr;
+ } else {
+ memmove(Add2Ptr(le, sz), le, old_size - off);
+ }
+ *new_le = le;
+
+ al->size = new_size;
+
+ le->type = type;
+ le->size = cpu_to_le16(sz);
+ le->name_len = name_len;
+ le->name_off = offsetof(struct ATTR_LIST_ENTRY, name);
+ le->vcn = cpu_to_le64(svcn);
+ le->ref = *ref;
+ le->id = id;
+ memcpy(le->name, name, sizeof(short) * name_len);
+
+ err = attr_set_size(ni, ATTR_LIST, NULL, 0, &al->run, new_size,
+ &new_size, true, &attr);
+ if (err) {
+ /* Undo memmove above. */
+ memmove(le, Add2Ptr(le, sz), old_size - off);
+ al->size = old_size;
+ return err;
+ }
+
+ al->dirty = true;
+
+ if (attr && attr->non_res) {
+ err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le,
+ al->size, 0);
+ if (err)
+ return err;
+ al->dirty = false;
+ }
+
+ return 0;
+}
+
+/*
+ * al_remove_le - Remove @le from attribute list.
+ */
+bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le)
+{
+ u16 size;
+ size_t off;
+ typeof(ni->attr_list) *al = &ni->attr_list;
+
+ if (!al_is_valid_le(ni, le))
+ return false;
+
+ /* Save on stack the size of 'le' */
+ size = le16_to_cpu(le->size);
+ off = PtrOffset(al->le, le);
+
+ memmove(le, Add2Ptr(le, size), al->size - (off + size));
+
+ al->size -= size;
+ al->dirty = true;
+
+ return true;
+}
+
+/*
+ * al_delete_le - Delete first le from the list which matches its parameters.
+ */
+bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn,
+ const __le16 *name, size_t name_len,
+ const struct MFT_REF *ref)
+{
+ u16 size;
+ struct ATTR_LIST_ENTRY *le;
+ size_t off;
+ typeof(ni->attr_list) *al = &ni->attr_list;
+
+ /* Scan forward to the first le that matches the input. */
+ le = al_find_ex(ni, NULL, type, name, name_len, &vcn);
+ if (!le)
+ return false;
+
+ off = PtrOffset(al->le, le);
+
+next:
+ if (off >= al->size)
+ return false;
+ if (le->type != type)
+ return false;
+ if (le->name_len != name_len)
+ return false;
+ if (name_len && ntfs_cmp_names(le_name(le), name_len, name, name_len,
+ ni->mi.sbi->upcase, true))
+ return false;
+ if (le64_to_cpu(le->vcn) != vcn)
+ return false;
+
+ /*
+ * The caller specified a segment reference, so we have to
+ * scan through the matching entries until we find that segment
+ * reference or we run of matching entries.
+ */
+ if (ref && memcmp(ref, &le->ref, sizeof(*ref))) {
+ off += le16_to_cpu(le->size);
+ le = Add2Ptr(al->le, off);
+ goto next;
+ }
+
+ /* Save on stack the size of 'le'. */
+ size = le16_to_cpu(le->size);
+ /* Delete the le. */
+ memmove(le, Add2Ptr(le, size), al->size - (off + size));
+
+ al->size -= size;
+ al->dirty = true;
+
+ return true;
+}
+
+int al_update(struct ntfs_inode *ni, int sync)
+{
+ int err;
+ struct ATTRIB *attr;
+ typeof(ni->attr_list) *al = &ni->attr_list;
+
+ if (!al->dirty || !al->size)
+ return 0;
+
+ /*
+ * Attribute list increased on demand in al_add_le.
+ * Attribute list decreased here.
+ */
+ err = attr_set_size(ni, ATTR_LIST, NULL, 0, &al->run, al->size, NULL,
+ false, &attr);
+ if (err)
+ goto out;
+
+ if (!attr->non_res) {
+ memcpy(resident_data(attr), al->le, al->size);
+ } else {
+ err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le,
+ al->size, sync);
+ if (err)
+ goto out;
+
+ attr->nres.valid_size = attr->nres.data_size;
+ }
+
+ ni->mi.dirty = true;
+ al->dirty = false;
+
+out:
+ return err;
+}
diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c
new file mode 100644
index 000000000..50d838093
--- /dev/null
+++ b/fs/ntfs3/bitfunc.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/types.h>
+
+#include "ntfs_fs.h"
+
+#define BITS_IN_SIZE_T (sizeof(size_t) * 8)
+
+/*
+ * fill_mask[i] - first i bits are '1' , i = 0,1,2,3,4,5,6,7,8
+ * fill_mask[i] = 0xFF >> (8-i)
+ */
+static const u8 fill_mask[] = { 0x00, 0x01, 0x03, 0x07, 0x0F,
+ 0x1F, 0x3F, 0x7F, 0xFF };
+
+/*
+ * zero_mask[i] - first i bits are '0' , i = 0,1,2,3,4,5,6,7,8
+ * zero_mask[i] = 0xFF << i
+ */
+static const u8 zero_mask[] = { 0xFF, 0xFE, 0xFC, 0xF8, 0xF0,
+ 0xE0, 0xC0, 0x80, 0x00 };
+
+/*
+ * are_bits_clear
+ *
+ * Return: True if all bits [bit, bit+nbits) are zeros "0".
+ */
+bool are_bits_clear(const ulong *lmap, size_t bit, size_t nbits)
+{
+ size_t pos = bit & 7;
+ const u8 *map = (u8 *)lmap + (bit >> 3);
+
+ if (pos) {
+ if (8 - pos >= nbits)
+ return !nbits || !(*map & fill_mask[pos + nbits] &
+ zero_mask[pos]);
+
+ if (*map++ & zero_mask[pos])
+ return false;
+ nbits -= 8 - pos;
+ }
+
+ pos = ((size_t)map) & (sizeof(size_t) - 1);
+ if (pos) {
+ pos = sizeof(size_t) - pos;
+ if (nbits >= pos * 8) {
+ for (nbits -= pos * 8; pos; pos--, map++) {
+ if (*map)
+ return false;
+ }
+ }
+ }
+
+ for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) {
+ if (*((size_t *)map))
+ return false;
+ }
+
+ for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) {
+ if (*map)
+ return false;
+ }
+
+ pos = nbits & 7;
+ if (pos && (*map & fill_mask[pos]))
+ return false;
+
+ return true;
+}
+
+/*
+ * are_bits_set
+ *
+ * Return: True if all bits [bit, bit+nbits) are ones "1".
+ */
+bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits)
+{
+ u8 mask;
+ size_t pos = bit & 7;
+ const u8 *map = (u8 *)lmap + (bit >> 3);
+
+ if (pos) {
+ if (8 - pos >= nbits) {
+ mask = fill_mask[pos + nbits] & zero_mask[pos];
+ return !nbits || (*map & mask) == mask;
+ }
+
+ mask = zero_mask[pos];
+ if ((*map++ & mask) != mask)
+ return false;
+ nbits -= 8 - pos;
+ }
+
+ pos = ((size_t)map) & (sizeof(size_t) - 1);
+ if (pos) {
+ pos = sizeof(size_t) - pos;
+ if (nbits >= pos * 8) {
+ for (nbits -= pos * 8; pos; pos--, map++) {
+ if (*map != 0xFF)
+ return false;
+ }
+ }
+ }
+
+ for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) {
+ if (*((size_t *)map) != MINUS_ONE_T)
+ return false;
+ }
+
+ for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) {
+ if (*map != 0xFF)
+ return false;
+ }
+
+ pos = nbits & 7;
+ if (pos) {
+ mask = fill_mask[pos];
+ if ((*map & mask) != mask)
+ return false;
+ }
+
+ return true;
+}
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
new file mode 100644
index 000000000..c055bbdfe
--- /dev/null
+++ b/fs/ntfs3/bitmap.c
@@ -0,0 +1,1485 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ * This code builds two trees of free clusters extents.
+ * Trees are sorted by start of extent and by length of extent.
+ * NTFS_MAX_WND_EXTENTS defines the maximum number of elements in trees.
+ * In extreme case code reads on-disk bitmap to find free clusters.
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+/*
+ * Maximum number of extents in tree.
+ */
+#define NTFS_MAX_WND_EXTENTS (32u * 1024u)
+
+struct rb_node_key {
+ struct rb_node node;
+ size_t key;
+};
+
+struct e_node {
+ struct rb_node_key start; /* Tree sorted by start. */
+ struct rb_node_key count; /* Tree sorted by len. */
+};
+
+static int wnd_rescan(struct wnd_bitmap *wnd);
+static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw);
+static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+
+static struct kmem_cache *ntfs_enode_cachep;
+
+int __init ntfs3_init_bitmap(void)
+{
+ ntfs_enode_cachep =
+ kmem_cache_create("ntfs3_enode_cache", sizeof(struct e_node), 0,
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ return ntfs_enode_cachep ? 0 : -ENOMEM;
+}
+
+void ntfs3_exit_bitmap(void)
+{
+ kmem_cache_destroy(ntfs_enode_cachep);
+}
+
+/*
+ * wnd_scan
+ *
+ * b_pos + b_len - biggest fragment.
+ * Scan range [wpos wbits) window @buf.
+ *
+ * Return: -1 if not found.
+ */
+static size_t wnd_scan(const ulong *buf, size_t wbit, u32 wpos, u32 wend,
+ size_t to_alloc, size_t *prev_tail, size_t *b_pos,
+ size_t *b_len)
+{
+ while (wpos < wend) {
+ size_t free_len;
+ u32 free_bits, end;
+ u32 used = find_next_zero_bit(buf, wend, wpos);
+
+ if (used >= wend) {
+ if (*b_len < *prev_tail) {
+ *b_pos = wbit - *prev_tail;
+ *b_len = *prev_tail;
+ }
+
+ *prev_tail = 0;
+ return -1;
+ }
+
+ if (used > wpos) {
+ wpos = used;
+ if (*b_len < *prev_tail) {
+ *b_pos = wbit - *prev_tail;
+ *b_len = *prev_tail;
+ }
+
+ *prev_tail = 0;
+ }
+
+ /*
+ * Now we have a fragment [wpos, wend) staring with 0.
+ */
+ end = wpos + to_alloc - *prev_tail;
+ free_bits = find_next_bit(buf, min(end, wend), wpos);
+
+ free_len = *prev_tail + free_bits - wpos;
+
+ if (*b_len < free_len) {
+ *b_pos = wbit + wpos - *prev_tail;
+ *b_len = free_len;
+ }
+
+ if (free_len >= to_alloc)
+ return wbit + wpos - *prev_tail;
+
+ if (free_bits >= wend) {
+ *prev_tail += free_bits - wpos;
+ return -1;
+ }
+
+ wpos = free_bits + 1;
+
+ *prev_tail = 0;
+ }
+
+ return -1;
+}
+
+/*
+ * wnd_close - Frees all resources.
+ */
+void wnd_close(struct wnd_bitmap *wnd)
+{
+ struct rb_node *node, *next;
+
+ kfree(wnd->free_bits);
+ run_close(&wnd->run);
+
+ node = rb_first(&wnd->start_tree);
+
+ while (node) {
+ next = rb_next(node);
+ rb_erase(node, &wnd->start_tree);
+ kmem_cache_free(ntfs_enode_cachep,
+ rb_entry(node, struct e_node, start.node));
+ node = next;
+ }
+}
+
+static struct rb_node *rb_lookup(struct rb_root *root, size_t v)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *r = NULL;
+
+ while (*p) {
+ struct rb_node_key *k;
+
+ k = rb_entry(*p, struct rb_node_key, node);
+ if (v < k->key) {
+ p = &(*p)->rb_left;
+ } else if (v > k->key) {
+ r = &k->node;
+ p = &(*p)->rb_right;
+ } else {
+ return &k->node;
+ }
+ }
+
+ return r;
+}
+
+/*
+ * rb_insert_count - Helper function to insert special kind of 'count' tree.
+ */
+static inline bool rb_insert_count(struct rb_root *root, struct e_node *e)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ size_t e_ckey = e->count.key;
+ size_t e_skey = e->start.key;
+
+ while (*p) {
+ struct e_node *k =
+ rb_entry(parent = *p, struct e_node, count.node);
+
+ if (e_ckey > k->count.key) {
+ p = &(*p)->rb_left;
+ } else if (e_ckey < k->count.key) {
+ p = &(*p)->rb_right;
+ } else if (e_skey < k->start.key) {
+ p = &(*p)->rb_left;
+ } else if (e_skey > k->start.key) {
+ p = &(*p)->rb_right;
+ } else {
+ WARN_ON(1);
+ return false;
+ }
+ }
+
+ rb_link_node(&e->count.node, parent, p);
+ rb_insert_color(&e->count.node, root);
+ return true;
+}
+
+/*
+ * rb_insert_start - Helper function to insert special kind of 'count' tree.
+ */
+static inline bool rb_insert_start(struct rb_root *root, struct e_node *e)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ size_t e_skey = e->start.key;
+
+ while (*p) {
+ struct e_node *k;
+
+ parent = *p;
+
+ k = rb_entry(parent, struct e_node, start.node);
+ if (e_skey < k->start.key) {
+ p = &(*p)->rb_left;
+ } else if (e_skey > k->start.key) {
+ p = &(*p)->rb_right;
+ } else {
+ WARN_ON(1);
+ return false;
+ }
+ }
+
+ rb_link_node(&e->start.node, parent, p);
+ rb_insert_color(&e->start.node, root);
+ return true;
+}
+
+/*
+ * wnd_add_free_ext - Adds a new extent of free space.
+ * @build: 1 when building tree.
+ */
+static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len,
+ bool build)
+{
+ struct e_node *e, *e0 = NULL;
+ size_t ib, end_in = bit + len;
+ struct rb_node *n;
+
+ if (build) {
+ /* Use extent_min to filter too short extents. */
+ if (wnd->count >= NTFS_MAX_WND_EXTENTS &&
+ len <= wnd->extent_min) {
+ wnd->uptodated = -1;
+ return;
+ }
+ } else {
+ /* Try to find extent before 'bit'. */
+ n = rb_lookup(&wnd->start_tree, bit);
+
+ if (!n) {
+ n = rb_first(&wnd->start_tree);
+ } else {
+ e = rb_entry(n, struct e_node, start.node);
+ n = rb_next(n);
+ if (e->start.key + e->count.key == bit) {
+ /* Remove left. */
+ bit = e->start.key;
+ len += e->count.key;
+ rb_erase(&e->start.node, &wnd->start_tree);
+ rb_erase(&e->count.node, &wnd->count_tree);
+ wnd->count -= 1;
+ e0 = e;
+ }
+ }
+
+ while (n) {
+ size_t next_end;
+
+ e = rb_entry(n, struct e_node, start.node);
+ next_end = e->start.key + e->count.key;
+ if (e->start.key > end_in)
+ break;
+
+ /* Remove right. */
+ n = rb_next(n);
+ len += next_end - end_in;
+ end_in = next_end;
+ rb_erase(&e->start.node, &wnd->start_tree);
+ rb_erase(&e->count.node, &wnd->count_tree);
+ wnd->count -= 1;
+
+ if (!e0)
+ e0 = e;
+ else
+ kmem_cache_free(ntfs_enode_cachep, e);
+ }
+
+ if (wnd->uptodated != 1) {
+ /* Check bits before 'bit'. */
+ ib = wnd->zone_bit == wnd->zone_end ||
+ bit < wnd->zone_end
+ ? 0
+ : wnd->zone_end;
+
+ while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) {
+ bit -= 1;
+ len += 1;
+ }
+
+ /* Check bits after 'end_in'. */
+ ib = wnd->zone_bit == wnd->zone_end ||
+ end_in > wnd->zone_bit
+ ? wnd->nbits
+ : wnd->zone_bit;
+
+ while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) {
+ end_in += 1;
+ len += 1;
+ }
+ }
+ }
+ /* Insert new fragment. */
+ if (wnd->count >= NTFS_MAX_WND_EXTENTS) {
+ if (e0)
+ kmem_cache_free(ntfs_enode_cachep, e0);
+
+ wnd->uptodated = -1;
+
+ /* Compare with smallest fragment. */
+ n = rb_last(&wnd->count_tree);
+ e = rb_entry(n, struct e_node, count.node);
+ if (len <= e->count.key)
+ goto out; /* Do not insert small fragments. */
+
+ if (build) {
+ struct e_node *e2;
+
+ n = rb_prev(n);
+ e2 = rb_entry(n, struct e_node, count.node);
+ /* Smallest fragment will be 'e2->count.key'. */
+ wnd->extent_min = e2->count.key;
+ }
+
+ /* Replace smallest fragment by new one. */
+ rb_erase(&e->start.node, &wnd->start_tree);
+ rb_erase(&e->count.node, &wnd->count_tree);
+ wnd->count -= 1;
+ } else {
+ e = e0 ? e0 : kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC);
+ if (!e) {
+ wnd->uptodated = -1;
+ goto out;
+ }
+
+ if (build && len <= wnd->extent_min)
+ wnd->extent_min = len;
+ }
+ e->start.key = bit;
+ e->count.key = len;
+ if (len > wnd->extent_max)
+ wnd->extent_max = len;
+
+ rb_insert_start(&wnd->start_tree, e);
+ rb_insert_count(&wnd->count_tree, e);
+ wnd->count += 1;
+
+out:;
+}
+
+/*
+ * wnd_remove_free_ext - Remove a run from the cached free space.
+ */
+static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len)
+{
+ struct rb_node *n, *n3;
+ struct e_node *e, *e3;
+ size_t end_in = bit + len;
+ size_t end3, end, new_key, new_len, max_new_len;
+
+ /* Try to find extent before 'bit'. */
+ n = rb_lookup(&wnd->start_tree, bit);
+
+ if (!n)
+ return;
+
+ e = rb_entry(n, struct e_node, start.node);
+ end = e->start.key + e->count.key;
+
+ new_key = new_len = 0;
+ len = e->count.key;
+
+ /* Range [bit,end_in) must be inside 'e' or outside 'e' and 'n'. */
+ if (e->start.key > bit)
+ ;
+ else if (end_in <= end) {
+ /* Range [bit,end_in) inside 'e'. */
+ new_key = end_in;
+ new_len = end - end_in;
+ len = bit - e->start.key;
+ } else if (bit > end) {
+ bool bmax = false;
+
+ n3 = rb_next(n);
+
+ while (n3) {
+ e3 = rb_entry(n3, struct e_node, start.node);
+ if (e3->start.key >= end_in)
+ break;
+
+ if (e3->count.key == wnd->extent_max)
+ bmax = true;
+
+ end3 = e3->start.key + e3->count.key;
+ if (end3 > end_in) {
+ e3->start.key = end_in;
+ rb_erase(&e3->count.node, &wnd->count_tree);
+ e3->count.key = end3 - end_in;
+ rb_insert_count(&wnd->count_tree, e3);
+ break;
+ }
+
+ n3 = rb_next(n3);
+ rb_erase(&e3->start.node, &wnd->start_tree);
+ rb_erase(&e3->count.node, &wnd->count_tree);
+ wnd->count -= 1;
+ kmem_cache_free(ntfs_enode_cachep, e3);
+ }
+ if (!bmax)
+ return;
+ n3 = rb_first(&wnd->count_tree);
+ wnd->extent_max =
+ n3 ? rb_entry(n3, struct e_node, count.node)->count.key
+ : 0;
+ return;
+ }
+
+ if (e->count.key != wnd->extent_max) {
+ ;
+ } else if (rb_prev(&e->count.node)) {
+ ;
+ } else {
+ n3 = rb_next(&e->count.node);
+ max_new_len = max(len, new_len);
+ if (!n3) {
+ wnd->extent_max = max_new_len;
+ } else {
+ e3 = rb_entry(n3, struct e_node, count.node);
+ wnd->extent_max = max(e3->count.key, max_new_len);
+ }
+ }
+
+ if (!len) {
+ if (new_len) {
+ e->start.key = new_key;
+ rb_erase(&e->count.node, &wnd->count_tree);
+ e->count.key = new_len;
+ rb_insert_count(&wnd->count_tree, e);
+ } else {
+ rb_erase(&e->start.node, &wnd->start_tree);
+ rb_erase(&e->count.node, &wnd->count_tree);
+ wnd->count -= 1;
+ kmem_cache_free(ntfs_enode_cachep, e);
+ }
+ goto out;
+ }
+ rb_erase(&e->count.node, &wnd->count_tree);
+ e->count.key = len;
+ rb_insert_count(&wnd->count_tree, e);
+
+ if (!new_len)
+ goto out;
+
+ if (wnd->count >= NTFS_MAX_WND_EXTENTS) {
+ wnd->uptodated = -1;
+
+ /* Get minimal extent. */
+ e = rb_entry(rb_last(&wnd->count_tree), struct e_node,
+ count.node);
+ if (e->count.key > new_len)
+ goto out;
+
+ /* Replace minimum. */
+ rb_erase(&e->start.node, &wnd->start_tree);
+ rb_erase(&e->count.node, &wnd->count_tree);
+ wnd->count -= 1;
+ } else {
+ e = kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC);
+ if (!e)
+ wnd->uptodated = -1;
+ }
+
+ if (e) {
+ e->start.key = new_key;
+ e->count.key = new_len;
+ rb_insert_start(&wnd->start_tree, e);
+ rb_insert_count(&wnd->count_tree, e);
+ wnd->count += 1;
+ }
+
+out:
+ if (!wnd->count && 1 != wnd->uptodated)
+ wnd_rescan(wnd);
+}
+
+/*
+ * wnd_rescan - Scan all bitmap. Used while initialization.
+ */
+static int wnd_rescan(struct wnd_bitmap *wnd)
+{
+ int err = 0;
+ size_t prev_tail = 0;
+ struct super_block *sb = wnd->sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ u64 lbo, len = 0;
+ u32 blocksize = sb->s_blocksize;
+ u8 cluster_bits = sbi->cluster_bits;
+ u32 wbits = 8 * sb->s_blocksize;
+ u32 used, frb;
+ const ulong *buf;
+ size_t wpos, wbit, iw, vbo;
+ struct buffer_head *bh = NULL;
+ CLST lcn, clen;
+
+ wnd->uptodated = 0;
+ wnd->extent_max = 0;
+ wnd->extent_min = MINUS_ONE_T;
+ wnd->total_zeroes = 0;
+
+ vbo = 0;
+
+ for (iw = 0; iw < wnd->nwnd; iw++) {
+ if (iw + 1 == wnd->nwnd)
+ wbits = wnd->bits_last;
+
+ if (wnd->inited) {
+ if (!wnd->free_bits[iw]) {
+ /* All ones. */
+ if (prev_tail) {
+ wnd_add_free_ext(wnd,
+ vbo * 8 - prev_tail,
+ prev_tail, true);
+ prev_tail = 0;
+ }
+ goto next_wnd;
+ }
+ if (wbits == wnd->free_bits[iw]) {
+ /* All zeroes. */
+ prev_tail += wbits;
+ wnd->total_zeroes += wbits;
+ goto next_wnd;
+ }
+ }
+
+ if (!len) {
+ u32 off = vbo & sbi->cluster_mask;
+
+ if (!run_lookup_entry(&wnd->run, vbo >> cluster_bits,
+ &lcn, &clen, NULL)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ lbo = ((u64)lcn << cluster_bits) + off;
+ len = ((u64)clen << cluster_bits) - off;
+ }
+
+ bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits);
+ if (!bh) {
+ err = -EIO;
+ goto out;
+ }
+
+ buf = (ulong *)bh->b_data;
+
+ used = bitmap_weight(buf, wbits);
+ if (used < wbits) {
+ frb = wbits - used;
+ wnd->free_bits[iw] = frb;
+ wnd->total_zeroes += frb;
+ }
+
+ wpos = 0;
+ wbit = vbo * 8;
+
+ if (wbit + wbits > wnd->nbits)
+ wbits = wnd->nbits - wbit;
+
+ do {
+ used = find_next_zero_bit(buf, wbits, wpos);
+
+ if (used > wpos && prev_tail) {
+ wnd_add_free_ext(wnd, wbit + wpos - prev_tail,
+ prev_tail, true);
+ prev_tail = 0;
+ }
+
+ wpos = used;
+
+ if (wpos >= wbits) {
+ /* No free blocks. */
+ prev_tail = 0;
+ break;
+ }
+
+ frb = find_next_bit(buf, wbits, wpos);
+ if (frb >= wbits) {
+ /* Keep last free block. */
+ prev_tail += frb - wpos;
+ break;
+ }
+
+ wnd_add_free_ext(wnd, wbit + wpos - prev_tail,
+ frb + prev_tail - wpos, true);
+
+ /* Skip free block and first '1'. */
+ wpos = frb + 1;
+ /* Reset previous tail. */
+ prev_tail = 0;
+ } while (wpos < wbits);
+
+next_wnd:
+
+ if (bh)
+ put_bh(bh);
+ bh = NULL;
+
+ vbo += blocksize;
+ if (len) {
+ len -= blocksize;
+ lbo += blocksize;
+ }
+ }
+
+ /* Add last block. */
+ if (prev_tail)
+ wnd_add_free_ext(wnd, wnd->nbits - prev_tail, prev_tail, true);
+
+ /*
+ * Before init cycle wnd->uptodated was 0.
+ * If any errors or limits occurs while initialization then
+ * wnd->uptodated will be -1.
+ * If 'uptodated' is still 0 then Tree is really updated.
+ */
+ if (!wnd->uptodated)
+ wnd->uptodated = 1;
+
+ if (wnd->zone_bit != wnd->zone_end) {
+ size_t zlen = wnd->zone_end - wnd->zone_bit;
+
+ wnd->zone_end = wnd->zone_bit;
+ wnd_zone_set(wnd, wnd->zone_bit, zlen);
+ }
+
+out:
+ return err;
+}
+
+int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits)
+{
+ int err;
+ u32 blocksize = sb->s_blocksize;
+ u32 wbits = blocksize * 8;
+
+ init_rwsem(&wnd->rw_lock);
+
+ wnd->sb = sb;
+ wnd->nbits = nbits;
+ wnd->total_zeroes = nbits;
+ wnd->extent_max = MINUS_ONE_T;
+ wnd->zone_bit = wnd->zone_end = 0;
+ wnd->nwnd = bytes_to_block(sb, bitmap_size(nbits));
+ wnd->bits_last = nbits & (wbits - 1);
+ if (!wnd->bits_last)
+ wnd->bits_last = wbits;
+
+ wnd->free_bits =
+ kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO);
+
+ if (!wnd->free_bits)
+ return -ENOMEM;
+
+ err = wnd_rescan(wnd);
+ if (err)
+ return err;
+
+ wnd->inited = true;
+
+ return 0;
+}
+
+/*
+ * wnd_map - Call sb_bread for requested window.
+ */
+static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw)
+{
+ size_t vbo;
+ CLST lcn, clen;
+ struct super_block *sb = wnd->sb;
+ struct ntfs_sb_info *sbi;
+ struct buffer_head *bh;
+ u64 lbo;
+
+ sbi = sb->s_fs_info;
+ vbo = (u64)iw << sb->s_blocksize_bits;
+
+ if (!run_lookup_entry(&wnd->run, vbo >> sbi->cluster_bits, &lcn, &clen,
+ NULL)) {
+ return ERR_PTR(-ENOENT);
+ }
+
+ lbo = ((u64)lcn << sbi->cluster_bits) + (vbo & sbi->cluster_mask);
+
+ bh = ntfs_bread(wnd->sb, lbo >> sb->s_blocksize_bits);
+ if (!bh)
+ return ERR_PTR(-EIO);
+
+ return bh;
+}
+
+/*
+ * wnd_set_free - Mark the bits range from bit to bit + bits as free.
+ */
+int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits)
+{
+ int err = 0;
+ struct super_block *sb = wnd->sb;
+ size_t bits0 = bits;
+ u32 wbits = 8 * sb->s_blocksize;
+ size_t iw = bit >> (sb->s_blocksize_bits + 3);
+ u32 wbit = bit & (wbits - 1);
+ struct buffer_head *bh;
+
+ while (iw < wnd->nwnd && bits) {
+ u32 tail, op;
+ ulong *buf;
+
+ if (iw + 1 == wnd->nwnd)
+ wbits = wnd->bits_last;
+
+ tail = wbits - wbit;
+ op = min_t(u32, tail, bits);
+
+ bh = wnd_map(wnd, iw);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ break;
+ }
+
+ buf = (ulong *)bh->b_data;
+
+ lock_buffer(bh);
+
+ __bitmap_clear(buf, wbit, op);
+
+ wnd->free_bits[iw] += op;
+
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ unlock_buffer(bh);
+ put_bh(bh);
+
+ wnd->total_zeroes += op;
+ bits -= op;
+ wbit = 0;
+ iw += 1;
+ }
+
+ wnd_add_free_ext(wnd, bit, bits0, false);
+
+ return err;
+}
+
+/*
+ * wnd_set_used - Mark the bits range from bit to bit + bits as used.
+ */
+int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
+{
+ int err = 0;
+ struct super_block *sb = wnd->sb;
+ size_t bits0 = bits;
+ size_t iw = bit >> (sb->s_blocksize_bits + 3);
+ u32 wbits = 8 * sb->s_blocksize;
+ u32 wbit = bit & (wbits - 1);
+ struct buffer_head *bh;
+
+ while (iw < wnd->nwnd && bits) {
+ u32 tail, op;
+ ulong *buf;
+
+ if (unlikely(iw + 1 == wnd->nwnd))
+ wbits = wnd->bits_last;
+
+ tail = wbits - wbit;
+ op = min_t(u32, tail, bits);
+
+ bh = wnd_map(wnd, iw);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ break;
+ }
+ buf = (ulong *)bh->b_data;
+
+ lock_buffer(bh);
+
+ __bitmap_set(buf, wbit, op);
+ wnd->free_bits[iw] -= op;
+
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ unlock_buffer(bh);
+ put_bh(bh);
+
+ wnd->total_zeroes -= op;
+ bits -= op;
+ wbit = 0;
+ iw += 1;
+ }
+
+ if (!RB_EMPTY_ROOT(&wnd->start_tree))
+ wnd_remove_free_ext(wnd, bit, bits0);
+
+ return err;
+}
+
+/*
+ * wnd_is_free_hlp
+ *
+ * Return: True if all clusters [bit, bit+bits) are free (bitmap only).
+ */
+static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits)
+{
+ struct super_block *sb = wnd->sb;
+ size_t iw = bit >> (sb->s_blocksize_bits + 3);
+ u32 wbits = 8 * sb->s_blocksize;
+ u32 wbit = bit & (wbits - 1);
+
+ while (iw < wnd->nwnd && bits) {
+ u32 tail, op;
+
+ if (unlikely(iw + 1 == wnd->nwnd))
+ wbits = wnd->bits_last;
+
+ tail = wbits - wbit;
+ op = min_t(u32, tail, bits);
+
+ if (wbits != wnd->free_bits[iw]) {
+ bool ret;
+ struct buffer_head *bh = wnd_map(wnd, iw);
+
+ if (IS_ERR(bh))
+ return false;
+
+ ret = are_bits_clear((ulong *)bh->b_data, wbit, op);
+
+ put_bh(bh);
+ if (!ret)
+ return false;
+ }
+
+ bits -= op;
+ wbit = 0;
+ iw += 1;
+ }
+
+ return true;
+}
+
+/*
+ * wnd_is_free
+ *
+ * Return: True if all clusters [bit, bit+bits) are free.
+ */
+bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits)
+{
+ bool ret;
+ struct rb_node *n;
+ size_t end;
+ struct e_node *e;
+
+ if (RB_EMPTY_ROOT(&wnd->start_tree))
+ goto use_wnd;
+
+ n = rb_lookup(&wnd->start_tree, bit);
+ if (!n)
+ goto use_wnd;
+
+ e = rb_entry(n, struct e_node, start.node);
+
+ end = e->start.key + e->count.key;
+
+ if (bit < end && bit + bits <= end)
+ return true;
+
+use_wnd:
+ ret = wnd_is_free_hlp(wnd, bit, bits);
+
+ return ret;
+}
+
+/*
+ * wnd_is_used
+ *
+ * Return: True if all clusters [bit, bit+bits) are used.
+ */
+bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits)
+{
+ bool ret = false;
+ struct super_block *sb = wnd->sb;
+ size_t iw = bit >> (sb->s_blocksize_bits + 3);
+ u32 wbits = 8 * sb->s_blocksize;
+ u32 wbit = bit & (wbits - 1);
+ size_t end;
+ struct rb_node *n;
+ struct e_node *e;
+
+ if (RB_EMPTY_ROOT(&wnd->start_tree))
+ goto use_wnd;
+
+ end = bit + bits;
+ n = rb_lookup(&wnd->start_tree, end - 1);
+ if (!n)
+ goto use_wnd;
+
+ e = rb_entry(n, struct e_node, start.node);
+ if (e->start.key + e->count.key > bit)
+ return false;
+
+use_wnd:
+ while (iw < wnd->nwnd && bits) {
+ u32 tail, op;
+
+ if (unlikely(iw + 1 == wnd->nwnd))
+ wbits = wnd->bits_last;
+
+ tail = wbits - wbit;
+ op = min_t(u32, tail, bits);
+
+ if (wnd->free_bits[iw]) {
+ bool ret;
+ struct buffer_head *bh = wnd_map(wnd, iw);
+
+ if (IS_ERR(bh))
+ goto out;
+
+ ret = are_bits_set((ulong *)bh->b_data, wbit, op);
+ put_bh(bh);
+ if (!ret)
+ goto out;
+ }
+
+ bits -= op;
+ wbit = 0;
+ iw += 1;
+ }
+ ret = true;
+
+out:
+ return ret;
+}
+
+/*
+ * wnd_find - Look for free space.
+ *
+ * - flags - BITMAP_FIND_XXX flags
+ *
+ * Return: 0 if not found.
+ */
+size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint,
+ size_t flags, size_t *allocated)
+{
+ struct super_block *sb;
+ u32 wbits, wpos, wzbit, wzend;
+ size_t fnd, max_alloc, b_len, b_pos;
+ size_t iw, prev_tail, nwnd, wbit, ebit, zbit, zend;
+ size_t to_alloc0 = to_alloc;
+ const ulong *buf;
+ const struct e_node *e;
+ const struct rb_node *pr, *cr;
+ u8 log2_bits;
+ bool fbits_valid;
+ struct buffer_head *bh;
+
+ /* Fast checking for available free space. */
+ if (flags & BITMAP_FIND_FULL) {
+ size_t zeroes = wnd_zeroes(wnd);
+
+ zeroes -= wnd->zone_end - wnd->zone_bit;
+ if (zeroes < to_alloc0)
+ goto no_space;
+
+ if (to_alloc0 > wnd->extent_max)
+ goto no_space;
+ } else {
+ if (to_alloc > wnd->extent_max)
+ to_alloc = wnd->extent_max;
+ }
+
+ if (wnd->zone_bit <= hint && hint < wnd->zone_end)
+ hint = wnd->zone_end;
+
+ max_alloc = wnd->nbits;
+ b_len = b_pos = 0;
+
+ if (hint >= max_alloc)
+ hint = 0;
+
+ if (RB_EMPTY_ROOT(&wnd->start_tree)) {
+ if (wnd->uptodated == 1) {
+ /* Extents tree is updated -> No free space. */
+ goto no_space;
+ }
+ goto scan_bitmap;
+ }
+
+ e = NULL;
+ if (!hint)
+ goto allocate_biggest;
+
+ /* Use hint: Enumerate extents by start >= hint. */
+ pr = NULL;
+ cr = wnd->start_tree.rb_node;
+
+ for (;;) {
+ e = rb_entry(cr, struct e_node, start.node);
+
+ if (e->start.key == hint)
+ break;
+
+ if (e->start.key < hint) {
+ pr = cr;
+ cr = cr->rb_right;
+ if (!cr)
+ break;
+ continue;
+ }
+
+ cr = cr->rb_left;
+ if (!cr) {
+ e = pr ? rb_entry(pr, struct e_node, start.node) : NULL;
+ break;
+ }
+ }
+
+ if (!e)
+ goto allocate_biggest;
+
+ if (e->start.key + e->count.key > hint) {
+ /* We have found extension with 'hint' inside. */
+ size_t len = e->start.key + e->count.key - hint;
+
+ if (len >= to_alloc && hint + to_alloc <= max_alloc) {
+ fnd = hint;
+ goto found;
+ }
+
+ if (!(flags & BITMAP_FIND_FULL)) {
+ if (len > to_alloc)
+ len = to_alloc;
+
+ if (hint + len <= max_alloc) {
+ fnd = hint;
+ to_alloc = len;
+ goto found;
+ }
+ }
+ }
+
+allocate_biggest:
+ /* Allocate from biggest free extent. */
+ e = rb_entry(rb_first(&wnd->count_tree), struct e_node, count.node);
+ if (e->count.key != wnd->extent_max)
+ wnd->extent_max = e->count.key;
+
+ if (e->count.key < max_alloc) {
+ if (e->count.key >= to_alloc) {
+ ;
+ } else if (flags & BITMAP_FIND_FULL) {
+ if (e->count.key < to_alloc0) {
+ /* Biggest free block is less then requested. */
+ goto no_space;
+ }
+ to_alloc = e->count.key;
+ } else if (-1 != wnd->uptodated) {
+ to_alloc = e->count.key;
+ } else {
+ /* Check if we can use more bits. */
+ size_t op, max_check;
+ struct rb_root start_tree;
+
+ memcpy(&start_tree, &wnd->start_tree,
+ sizeof(struct rb_root));
+ memset(&wnd->start_tree, 0, sizeof(struct rb_root));
+
+ max_check = e->start.key + to_alloc;
+ if (max_check > max_alloc)
+ max_check = max_alloc;
+ for (op = e->start.key + e->count.key; op < max_check;
+ op++) {
+ if (!wnd_is_free(wnd, op, 1))
+ break;
+ }
+ memcpy(&wnd->start_tree, &start_tree,
+ sizeof(struct rb_root));
+ to_alloc = op - e->start.key;
+ }
+
+ /* Prepare to return. */
+ fnd = e->start.key;
+ if (e->start.key + to_alloc > max_alloc)
+ to_alloc = max_alloc - e->start.key;
+ goto found;
+ }
+
+ if (wnd->uptodated == 1) {
+ /* Extents tree is updated -> no free space. */
+ goto no_space;
+ }
+
+ b_len = e->count.key;
+ b_pos = e->start.key;
+
+scan_bitmap:
+ sb = wnd->sb;
+ log2_bits = sb->s_blocksize_bits + 3;
+
+ /* At most two ranges [hint, max_alloc) + [0, hint). */
+Again:
+
+ /* TODO: Optimize request for case nbits > wbits. */
+ iw = hint >> log2_bits;
+ wbits = sb->s_blocksize * 8;
+ wpos = hint & (wbits - 1);
+ prev_tail = 0;
+ fbits_valid = true;
+
+ if (max_alloc == wnd->nbits) {
+ nwnd = wnd->nwnd;
+ } else {
+ size_t t = max_alloc + wbits - 1;
+
+ nwnd = likely(t > max_alloc) ? (t >> log2_bits) : wnd->nwnd;
+ }
+
+ /* Enumerate all windows. */
+ for (; iw < nwnd; iw++) {
+ wbit = iw << log2_bits;
+
+ if (!wnd->free_bits[iw]) {
+ if (prev_tail > b_len) {
+ b_pos = wbit - prev_tail;
+ b_len = prev_tail;
+ }
+
+ /* Skip full used window. */
+ prev_tail = 0;
+ wpos = 0;
+ continue;
+ }
+
+ if (unlikely(iw + 1 == nwnd)) {
+ if (max_alloc == wnd->nbits) {
+ wbits = wnd->bits_last;
+ } else {
+ size_t t = max_alloc & (wbits - 1);
+
+ if (t) {
+ wbits = t;
+ fbits_valid = false;
+ }
+ }
+ }
+
+ if (wnd->zone_end > wnd->zone_bit) {
+ ebit = wbit + wbits;
+ zbit = max(wnd->zone_bit, wbit);
+ zend = min(wnd->zone_end, ebit);
+
+ /* Here we have a window [wbit, ebit) and zone [zbit, zend). */
+ if (zend <= zbit) {
+ /* Zone does not overlap window. */
+ } else {
+ wzbit = zbit - wbit;
+ wzend = zend - wbit;
+
+ /* Zone overlaps window. */
+ if (wnd->free_bits[iw] == wzend - wzbit) {
+ prev_tail = 0;
+ wpos = 0;
+ continue;
+ }
+
+ /* Scan two ranges window: [wbit, zbit) and [zend, ebit). */
+ bh = wnd_map(wnd, iw);
+
+ if (IS_ERR(bh)) {
+ /* TODO: Error */
+ prev_tail = 0;
+ wpos = 0;
+ continue;
+ }
+
+ buf = (ulong *)bh->b_data;
+
+ /* Scan range [wbit, zbit). */
+ if (wpos < wzbit) {
+ /* Scan range [wpos, zbit). */
+ fnd = wnd_scan(buf, wbit, wpos, wzbit,
+ to_alloc, &prev_tail,
+ &b_pos, &b_len);
+ if (fnd != MINUS_ONE_T) {
+ put_bh(bh);
+ goto found;
+ }
+ }
+
+ prev_tail = 0;
+
+ /* Scan range [zend, ebit). */
+ if (wzend < wbits) {
+ fnd = wnd_scan(buf, wbit,
+ max(wzend, wpos), wbits,
+ to_alloc, &prev_tail,
+ &b_pos, &b_len);
+ if (fnd != MINUS_ONE_T) {
+ put_bh(bh);
+ goto found;
+ }
+ }
+
+ wpos = 0;
+ put_bh(bh);
+ continue;
+ }
+ }
+
+ /* Current window does not overlap zone. */
+ if (!wpos && fbits_valid && wnd->free_bits[iw] == wbits) {
+ /* Window is empty. */
+ if (prev_tail + wbits >= to_alloc) {
+ fnd = wbit + wpos - prev_tail;
+ goto found;
+ }
+
+ /* Increase 'prev_tail' and process next window. */
+ prev_tail += wbits;
+ wpos = 0;
+ continue;
+ }
+
+ /* Read window. */
+ bh = wnd_map(wnd, iw);
+ if (IS_ERR(bh)) {
+ // TODO: Error.
+ prev_tail = 0;
+ wpos = 0;
+ continue;
+ }
+
+ buf = (ulong *)bh->b_data;
+
+ /* Scan range [wpos, eBits). */
+ fnd = wnd_scan(buf, wbit, wpos, wbits, to_alloc, &prev_tail,
+ &b_pos, &b_len);
+ put_bh(bh);
+ if (fnd != MINUS_ONE_T)
+ goto found;
+ }
+
+ if (b_len < prev_tail) {
+ /* The last fragment. */
+ b_len = prev_tail;
+ b_pos = max_alloc - prev_tail;
+ }
+
+ if (hint) {
+ /*
+ * We have scanned range [hint max_alloc).
+ * Prepare to scan range [0 hint + to_alloc).
+ */
+ size_t nextmax = hint + to_alloc;
+
+ if (likely(nextmax >= hint) && nextmax < max_alloc)
+ max_alloc = nextmax;
+ hint = 0;
+ goto Again;
+ }
+
+ if (!b_len)
+ goto no_space;
+
+ wnd->extent_max = b_len;
+
+ if (flags & BITMAP_FIND_FULL)
+ goto no_space;
+
+ fnd = b_pos;
+ to_alloc = b_len;
+
+found:
+ if (flags & BITMAP_FIND_MARK_AS_USED) {
+ /* TODO: Optimize remove extent (pass 'e'?). */
+ if (wnd_set_used(wnd, fnd, to_alloc))
+ goto no_space;
+ } else if (wnd->extent_max != MINUS_ONE_T &&
+ to_alloc > wnd->extent_max) {
+ wnd->extent_max = to_alloc;
+ }
+
+ *allocated = fnd;
+ return to_alloc;
+
+no_space:
+ return 0;
+}
+
+/*
+ * wnd_extend - Extend bitmap ($MFT bitmap).
+ */
+int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
+{
+ int err;
+ struct super_block *sb = wnd->sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ u32 blocksize = sb->s_blocksize;
+ u32 wbits = blocksize * 8;
+ u32 b0, new_last;
+ size_t bits, iw, new_wnd;
+ size_t old_bits = wnd->nbits;
+ u16 *new_free;
+
+ if (new_bits <= old_bits)
+ return -EINVAL;
+
+ /* Align to 8 byte boundary. */
+ new_wnd = bytes_to_block(sb, bitmap_size(new_bits));
+ new_last = new_bits & (wbits - 1);
+ if (!new_last)
+ new_last = wbits;
+
+ if (new_wnd != wnd->nwnd) {
+ new_free = kmalloc(new_wnd * sizeof(u16), GFP_NOFS);
+ if (!new_free)
+ return -ENOMEM;
+
+ memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short));
+ memset(new_free + wnd->nwnd, 0,
+ (new_wnd - wnd->nwnd) * sizeof(short));
+ kfree(wnd->free_bits);
+ wnd->free_bits = new_free;
+ }
+
+ /* Zero bits [old_bits,new_bits). */
+ bits = new_bits - old_bits;
+ b0 = old_bits & (wbits - 1);
+
+ for (iw = old_bits >> (sb->s_blocksize_bits + 3); bits; iw += 1) {
+ u32 op;
+ size_t frb;
+ u64 vbo, lbo, bytes;
+ struct buffer_head *bh;
+ ulong *buf;
+
+ if (iw + 1 == new_wnd)
+ wbits = new_last;
+
+ op = b0 + bits > wbits ? wbits - b0 : bits;
+ vbo = (u64)iw * blocksize;
+
+ err = ntfs_vbo_to_lbo(sbi, &wnd->run, vbo, &lbo, &bytes);
+ if (err)
+ break;
+
+ bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits);
+ if (!bh)
+ return -EIO;
+
+ lock_buffer(bh);
+ buf = (ulong *)bh->b_data;
+
+ __bitmap_clear(buf, b0, blocksize * 8 - b0);
+ frb = wbits - bitmap_weight(buf, wbits);
+ wnd->total_zeroes += frb - wnd->free_bits[iw];
+ wnd->free_bits[iw] = frb;
+
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ unlock_buffer(bh);
+ /* err = sync_dirty_buffer(bh); */
+
+ b0 = 0;
+ bits -= op;
+ }
+
+ wnd->nbits = new_bits;
+ wnd->nwnd = new_wnd;
+ wnd->bits_last = new_last;
+
+ wnd_add_free_ext(wnd, old_bits, new_bits - old_bits, false);
+
+ return 0;
+}
+
+void wnd_zone_set(struct wnd_bitmap *wnd, size_t lcn, size_t len)
+{
+ size_t zlen = wnd->zone_end - wnd->zone_bit;
+
+ if (zlen)
+ wnd_add_free_ext(wnd, wnd->zone_bit, zlen, false);
+
+ if (!RB_EMPTY_ROOT(&wnd->start_tree) && len)
+ wnd_remove_free_ext(wnd, lcn, len);
+
+ wnd->zone_bit = lcn;
+ wnd->zone_end = lcn + len;
+}
+
+int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range)
+{
+ int err = 0;
+ struct super_block *sb = sbi->sb;
+ struct wnd_bitmap *wnd = &sbi->used.bitmap;
+ u32 wbits = 8 * sb->s_blocksize;
+ CLST len = 0, lcn = 0, done = 0;
+ CLST minlen = bytes_to_cluster(sbi, range->minlen);
+ CLST lcn_from = bytes_to_cluster(sbi, range->start);
+ size_t iw = lcn_from >> (sb->s_blocksize_bits + 3);
+ u32 wbit = lcn_from & (wbits - 1);
+ const ulong *buf;
+ CLST lcn_to;
+
+ if (!minlen)
+ minlen = 1;
+
+ if (range->len == (u64)-1)
+ lcn_to = wnd->nbits;
+ else
+ lcn_to = bytes_to_cluster(sbi, range->start + range->len);
+
+ down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
+
+ for (; iw < wnd->nwnd; iw++, wbit = 0) {
+ CLST lcn_wnd = iw * wbits;
+ struct buffer_head *bh;
+
+ if (lcn_wnd > lcn_to)
+ break;
+
+ if (!wnd->free_bits[iw])
+ continue;
+
+ if (iw + 1 == wnd->nwnd)
+ wbits = wnd->bits_last;
+
+ if (lcn_wnd + wbits > lcn_to)
+ wbits = lcn_to - lcn_wnd;
+
+ bh = wnd_map(wnd, iw);
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
+ break;
+ }
+
+ buf = (ulong *)bh->b_data;
+
+ for (; wbit < wbits; wbit++) {
+ if (!test_bit(wbit, buf)) {
+ if (!len)
+ lcn = lcn_wnd + wbit;
+ len += 1;
+ continue;
+ }
+ if (len >= minlen) {
+ err = ntfs_discard(sbi, lcn, len);
+ if (err)
+ goto out;
+ done += len;
+ }
+ len = 0;
+ }
+ put_bh(bh);
+ }
+
+ /* Process the last fragment. */
+ if (len >= minlen) {
+ err = ntfs_discard(sbi, lcn, len);
+ if (err)
+ goto out;
+ done += len;
+ }
+
+out:
+ range->len = (u64)done << sbi->cluster_bits;
+
+ up_read(&wnd->rw_lock);
+
+ return err;
+}
diff --git a/fs/ntfs3/debug.h b/fs/ntfs3/debug.h
new file mode 100644
index 000000000..53ef7489c
--- /dev/null
+++ b/fs/ntfs3/debug.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ * Useful functions for debugging.
+ *
+ */
+
+// clang-format off
+#ifndef _LINUX_NTFS3_DEBUG_H
+#define _LINUX_NTFS3_DEBUG_H
+
+struct super_block;
+struct inode;
+
+#ifndef Add2Ptr
+#define Add2Ptr(P, I) ((void *)((u8 *)(P) + (I)))
+#define PtrOffset(B, O) ((size_t)((size_t)(O) - (size_t)(B)))
+#endif
+
+#ifdef CONFIG_PRINTK
+__printf(2, 3)
+void ntfs_printk(const struct super_block *sb, const char *fmt, ...);
+__printf(2, 3)
+void ntfs_inode_printk(struct inode *inode, const char *fmt, ...);
+#else
+static inline __printf(2, 3)
+void ntfs_printk(const struct super_block *sb, const char *fmt, ...)
+{
+}
+
+static inline __printf(2, 3)
+void ntfs_inode_printk(struct inode *inode, const char *fmt, ...)
+{
+}
+#endif
+
+/*
+ * Logging macros. Thanks Joe Perches <joe@perches.com> for implementation.
+ */
+
+#define ntfs_err(sb, fmt, ...) ntfs_printk(sb, KERN_ERR fmt, ##__VA_ARGS__)
+#define ntfs_warn(sb, fmt, ...) ntfs_printk(sb, KERN_WARNING fmt, ##__VA_ARGS__)
+#define ntfs_info(sb, fmt, ...) ntfs_printk(sb, KERN_INFO fmt, ##__VA_ARGS__)
+#define ntfs_notice(sb, fmt, ...) \
+ ntfs_printk(sb, KERN_NOTICE fmt, ##__VA_ARGS__)
+
+#define ntfs_inode_err(inode, fmt, ...) \
+ ntfs_inode_printk(inode, KERN_ERR fmt, ##__VA_ARGS__)
+#define ntfs_inode_warn(inode, fmt, ...) \
+ ntfs_inode_printk(inode, KERN_WARNING fmt, ##__VA_ARGS__)
+
+#endif /* _LINUX_NTFS3_DEBUG_H */
+// clang-format on
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
new file mode 100644
index 000000000..d4d9f4ffb
--- /dev/null
+++ b/fs/ntfs3/dir.c
@@ -0,0 +1,597 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ * Directory handling functions for NTFS-based filesystems.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/nls.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+/* Convert little endian UTF-16 to NLS string. */
+int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
+ u8 *buf, int buf_len)
+{
+ int ret, warn;
+ u8 *op;
+ struct nls_table *nls = sbi->options->nls;
+
+ static_assert(sizeof(wchar_t) == sizeof(__le16));
+
+ if (!nls) {
+ /* UTF-16 -> UTF-8 */
+ ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf,
+ buf_len);
+ buf[ret] = '\0';
+ return ret;
+ }
+
+ op = buf;
+ warn = 0;
+
+ while (len--) {
+ u16 ec;
+ int charlen;
+ char dump[5];
+
+ if (buf_len < NLS_MAX_CHARSET_SIZE) {
+ ntfs_warn(sbi->sb,
+ "filename was truncated while converting.");
+ break;
+ }
+
+ ec = le16_to_cpu(*name++);
+ charlen = nls->uni2char(ec, op, buf_len);
+
+ if (charlen > 0) {
+ op += charlen;
+ buf_len -= charlen;
+ continue;
+ }
+
+ *op++ = '_';
+ buf_len -= 1;
+ if (warn)
+ continue;
+
+ warn = 1;
+ hex_byte_pack(&dump[0], ec >> 8);
+ hex_byte_pack(&dump[2], ec);
+ dump[4] = 0;
+
+ ntfs_err(sbi->sb, "failed to convert \"%s\" to %s", dump,
+ nls->charset);
+ }
+
+ *op = '\0';
+ return op - buf;
+}
+
+// clang-format off
+#define PLANE_SIZE 0x00010000
+
+#define SURROGATE_PAIR 0x0000d800
+#define SURROGATE_LOW 0x00000400
+#define SURROGATE_BITS 0x000003ff
+// clang-format on
+
+/*
+ * put_utf16 - Modified version of put_utf16 from fs/nls/nls_base.c
+ *
+ * Function is sparse warnings free.
+ */
+static inline void put_utf16(wchar_t *s, unsigned int c,
+ enum utf16_endian endian)
+{
+ static_assert(sizeof(wchar_t) == sizeof(__le16));
+ static_assert(sizeof(wchar_t) == sizeof(__be16));
+
+ switch (endian) {
+ default:
+ *s = (wchar_t)c;
+ break;
+ case UTF16_LITTLE_ENDIAN:
+ *(__le16 *)s = __cpu_to_le16(c);
+ break;
+ case UTF16_BIG_ENDIAN:
+ *(__be16 *)s = __cpu_to_be16(c);
+ break;
+ }
+}
+
+/*
+ * _utf8s_to_utf16s
+ *
+ * Modified version of 'utf8s_to_utf16s' allows to
+ * detect -ENAMETOOLONG without writing out of expected maximum.
+ */
+static int _utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian,
+ wchar_t *pwcs, int maxout)
+{
+ u16 *op;
+ int size;
+ unicode_t u;
+
+ op = pwcs;
+ while (inlen > 0 && *s) {
+ if (*s & 0x80) {
+ size = utf8_to_utf32(s, inlen, &u);
+ if (size < 0)
+ return -EINVAL;
+ s += size;
+ inlen -= size;
+
+ if (u >= PLANE_SIZE) {
+ if (maxout < 2)
+ return -ENAMETOOLONG;
+
+ u -= PLANE_SIZE;
+ put_utf16(op++,
+ SURROGATE_PAIR |
+ ((u >> 10) & SURROGATE_BITS),
+ endian);
+ put_utf16(op++,
+ SURROGATE_PAIR | SURROGATE_LOW |
+ (u & SURROGATE_BITS),
+ endian);
+ maxout -= 2;
+ } else {
+ if (maxout < 1)
+ return -ENAMETOOLONG;
+
+ put_utf16(op++, u, endian);
+ maxout--;
+ }
+ } else {
+ if (maxout < 1)
+ return -ENAMETOOLONG;
+
+ put_utf16(op++, *s++, endian);
+ inlen--;
+ maxout--;
+ }
+ }
+ return op - pwcs;
+}
+
+/*
+ * ntfs_nls_to_utf16 - Convert input string to UTF-16.
+ * @name: Input name.
+ * @name_len: Input name length.
+ * @uni: Destination memory.
+ * @max_ulen: Destination memory.
+ * @endian: Endian of target UTF-16 string.
+ *
+ * This function is called:
+ * - to create NTFS name
+ * - to create symlink
+ *
+ * Return: UTF-16 string length or error (if negative).
+ */
+int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len,
+ struct cpu_str *uni, u32 max_ulen,
+ enum utf16_endian endian)
+{
+ int ret, slen;
+ const u8 *end;
+ struct nls_table *nls = sbi->options->nls;
+ u16 *uname = uni->name;
+
+ static_assert(sizeof(wchar_t) == sizeof(u16));
+
+ if (!nls) {
+ /* utf8 -> utf16 */
+ ret = _utf8s_to_utf16s(name, name_len, endian, uname, max_ulen);
+ uni->len = ret;
+ return ret;
+ }
+
+ for (ret = 0, end = name + name_len; name < end; ret++, name += slen) {
+ if (ret >= max_ulen)
+ return -ENAMETOOLONG;
+
+ slen = nls->char2uni(name, end - name, uname + ret);
+ if (!slen)
+ return -EINVAL;
+ if (slen < 0)
+ return slen;
+ }
+
+#ifdef __BIG_ENDIAN
+ if (endian == UTF16_LITTLE_ENDIAN) {
+ int i = ret;
+
+ while (i--) {
+ __cpu_to_le16s(uname);
+ uname++;
+ }
+ }
+#else
+ if (endian == UTF16_BIG_ENDIAN) {
+ int i = ret;
+
+ while (i--) {
+ __cpu_to_be16s(uname);
+ uname++;
+ }
+ }
+#endif
+
+ uni->len = ret;
+ return ret;
+}
+
+/*
+ * dir_search_u - Helper function.
+ */
+struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni,
+ struct ntfs_fnd *fnd)
+{
+ int err = 0;
+ struct super_block *sb = dir->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct ntfs_inode *ni = ntfs_i(dir);
+ struct NTFS_DE *e;
+ int diff;
+ struct inode *inode = NULL;
+ struct ntfs_fnd *fnd_a = NULL;
+
+ if (!fnd) {
+ fnd_a = fnd_get();
+ if (!fnd_a) {
+ err = -ENOMEM;
+ goto out;
+ }
+ fnd = fnd_a;
+ }
+
+ err = indx_find(&ni->dir, ni, NULL, uni, 0, sbi, &diff, &e, fnd);
+
+ if (err)
+ goto out;
+
+ if (diff) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ inode = ntfs_iget5(sb, &e->ref, uni);
+ if (!IS_ERR(inode) && is_bad_inode(inode)) {
+ iput(inode);
+ err = -EINVAL;
+ }
+out:
+ fnd_put(fnd_a);
+
+ return err == -ENOENT ? NULL : err ? ERR_PTR(err) : inode;
+}
+
+static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
+ const struct NTFS_DE *e, u8 *name,
+ struct dir_context *ctx)
+{
+ const struct ATTR_FILE_NAME *fname;
+ unsigned long ino;
+ int name_len;
+ u32 dt_type;
+
+ fname = Add2Ptr(e, sizeof(struct NTFS_DE));
+
+ if (fname->type == FILE_NAME_DOS)
+ return 0;
+
+ if (!mi_is_ref(&ni->mi, &fname->home))
+ return 0;
+
+ ino = ino_get(&e->ref);
+
+ if (ino == MFT_REC_ROOT)
+ return 0;
+
+ /* Skip meta files. Unless option to show metafiles is set. */
+ if (!sbi->options->showmeta && ntfs_is_meta_file(sbi, ino))
+ return 0;
+
+ if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN))
+ return 0;
+
+ name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name,
+ PATH_MAX);
+ if (name_len <= 0) {
+ ntfs_warn(sbi->sb, "failed to convert name for inode %lx.",
+ ino);
+ return 0;
+ }
+
+ /* NTFS: symlinks are "dir + reparse" or "file + reparse" */
+ if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT)
+ dt_type = DT_LNK;
+ else
+ dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+
+ return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type);
+}
+
+/*
+ * ntfs_read_hdr - Helper function for ntfs_readdir().
+ */
+static int ntfs_read_hdr(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
+ const struct INDEX_HDR *hdr, u64 vbo, u64 pos,
+ u8 *name, struct dir_context *ctx)
+{
+ int err;
+ const struct NTFS_DE *e;
+ u32 e_size;
+ u32 end = le32_to_cpu(hdr->used);
+ u32 off = le32_to_cpu(hdr->de_off);
+
+ for (;; off += e_size) {
+ if (off + sizeof(struct NTFS_DE) > end)
+ return -1;
+
+ e = Add2Ptr(hdr, off);
+ e_size = le16_to_cpu(e->size);
+ if (e_size < sizeof(struct NTFS_DE) || off + e_size > end)
+ return -1;
+
+ if (de_is_last(e))
+ return 0;
+
+ /* Skip already enumerated. */
+ if (vbo + off < pos)
+ continue;
+
+ if (le16_to_cpu(e->key_size) < SIZEOF_ATTRIBUTE_FILENAME)
+ return -1;
+
+ ctx->pos = vbo + off;
+
+ /* Submit the name to the filldir callback. */
+ err = ntfs_filldir(sbi, ni, e, name, ctx);
+ if (err)
+ return err;
+ }
+}
+
+/*
+ * ntfs_readdir - file_operations::iterate_shared
+ *
+ * Use non sorted enumeration.
+ * We have an example of broken volume where sorted enumeration
+ * counts each name twice.
+ */
+static int ntfs_readdir(struct file *file, struct dir_context *ctx)
+{
+ const struct INDEX_ROOT *root;
+ u64 vbo;
+ size_t bit;
+ loff_t eod;
+ int err = 0;
+ struct inode *dir = file_inode(file);
+ struct ntfs_inode *ni = ntfs_i(dir);
+ struct super_block *sb = dir->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ loff_t i_size = i_size_read(dir);
+ u32 pos = ctx->pos;
+ u8 *name = NULL;
+ struct indx_node *node = NULL;
+ u8 index_bits = ni->dir.index_bits;
+
+ /* Name is a buffer of PATH_MAX length. */
+ static_assert(NTFS_NAME_LEN * 4 < PATH_MAX);
+
+ eod = i_size + sbi->record_size;
+
+ if (pos >= eod)
+ return 0;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
+
+ /* Allocate PATH_MAX bytes. */
+ name = __getname();
+ if (!name)
+ return -ENOMEM;
+
+ if (!ni->mi_loaded && ni->attr_list.size) {
+ /*
+ * Directory inode is locked for read.
+ * Load all subrecords to avoid 'write' access to 'ni' during
+ * directory reading.
+ */
+ ni_lock(ni);
+ if (!ni->mi_loaded && ni->attr_list.size) {
+ err = ni_load_all_mi(ni);
+ if (!err)
+ ni->mi_loaded = true;
+ }
+ ni_unlock(ni);
+ if (err)
+ goto out;
+ }
+
+ root = indx_get_root(&ni->dir, ni, NULL, NULL);
+ if (!root) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (pos >= sbi->record_size) {
+ bit = (pos - sbi->record_size) >> index_bits;
+ } else {
+ err = ntfs_read_hdr(sbi, ni, &root->ihdr, 0, pos, name, ctx);
+ if (err)
+ goto out;
+ bit = 0;
+ }
+
+ if (!i_size) {
+ ctx->pos = eod;
+ goto out;
+ }
+
+ for (;;) {
+ vbo = (u64)bit << index_bits;
+ if (vbo >= i_size) {
+ ctx->pos = eod;
+ goto out;
+ }
+
+ err = indx_used_bit(&ni->dir, ni, &bit);
+ if (err)
+ goto out;
+
+ if (bit == MINUS_ONE_T) {
+ ctx->pos = eod;
+ goto out;
+ }
+
+ vbo = (u64)bit << index_bits;
+ if (vbo >= i_size) {
+ ntfs_inode_err(dir, "Looks like your dir is corrupt");
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits,
+ &node);
+ if (err)
+ goto out;
+
+ err = ntfs_read_hdr(sbi, ni, &node->index->ihdr,
+ vbo + sbi->record_size, pos, name, ctx);
+ if (err)
+ goto out;
+
+ bit += 1;
+ }
+
+out:
+
+ __putname(name);
+ put_indx_node(node);
+
+ if (err == -ENOENT) {
+ err = 0;
+ ctx->pos = pos;
+ }
+
+ return err;
+}
+
+static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
+ size_t *files)
+{
+ int err = 0;
+ struct ntfs_inode *ni = ntfs_i(dir);
+ struct NTFS_DE *e = NULL;
+ struct INDEX_ROOT *root;
+ struct INDEX_HDR *hdr;
+ const struct ATTR_FILE_NAME *fname;
+ u32 e_size, off, end;
+ u64 vbo = 0;
+ size_t drs = 0, fles = 0, bit = 0;
+ loff_t i_size = ni->vfs_inode.i_size;
+ struct indx_node *node = NULL;
+ u8 index_bits = ni->dir.index_bits;
+
+ if (is_empty)
+ *is_empty = true;
+
+ root = indx_get_root(&ni->dir, ni, NULL, NULL);
+ if (!root)
+ return -EINVAL;
+
+ hdr = &root->ihdr;
+
+ for (;;) {
+ end = le32_to_cpu(hdr->used);
+ off = le32_to_cpu(hdr->de_off);
+
+ for (; off + sizeof(struct NTFS_DE) <= end; off += e_size) {
+ e = Add2Ptr(hdr, off);
+ e_size = le16_to_cpu(e->size);
+ if (e_size < sizeof(struct NTFS_DE) ||
+ off + e_size > end)
+ break;
+
+ if (de_is_last(e))
+ break;
+
+ fname = de_get_fname(e);
+ if (!fname)
+ continue;
+
+ if (fname->type == FILE_NAME_DOS)
+ continue;
+
+ if (is_empty) {
+ *is_empty = false;
+ if (!dirs && !files)
+ goto out;
+ }
+
+ if (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY)
+ drs += 1;
+ else
+ fles += 1;
+ }
+
+ if (vbo >= i_size)
+ goto out;
+
+ err = indx_used_bit(&ni->dir, ni, &bit);
+ if (err)
+ goto out;
+
+ if (bit == MINUS_ONE_T)
+ goto out;
+
+ vbo = (u64)bit << index_bits;
+ if (vbo >= i_size)
+ goto out;
+
+ err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits,
+ &node);
+ if (err)
+ goto out;
+
+ hdr = &node->index->ihdr;
+ bit += 1;
+ vbo = (u64)bit << ni->dir.idx2vbn_bits;
+ }
+
+out:
+ put_indx_node(node);
+ if (dirs)
+ *dirs = drs;
+ if (files)
+ *files = fles;
+
+ return err;
+}
+
+bool dir_is_empty(struct inode *dir)
+{
+ bool is_empty = false;
+
+ ntfs_dir_count(dir, &is_empty, NULL, NULL);
+
+ return is_empty;
+}
+
+// clang-format off
+const struct file_operations ntfs_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate_shared = ntfs_readdir,
+ .fsync = generic_file_fsync,
+ .open = ntfs_file_open,
+};
+// clang-format on
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
new file mode 100644
index 000000000..f31c0389a
--- /dev/null
+++ b/fs/ntfs3/file.c
@@ -0,0 +1,1279 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ * Regular file handling primitives for NTFS-based filesystems.
+ *
+ */
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/compat.h>
+#include <linux/falloc.h>
+#include <linux/fiemap.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
+{
+ struct fstrim_range __user *user_range;
+ struct fstrim_range range;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!bdev_max_discard_sectors(sbi->sb->s_bdev))
+ return -EOPNOTSUPP;
+
+ user_range = (struct fstrim_range __user *)arg;
+ if (copy_from_user(&range, user_range, sizeof(range)))
+ return -EFAULT;
+
+ range.minlen = max_t(u32, range.minlen,
+ bdev_discard_granularity(sbi->sb->s_bdev));
+
+ err = ntfs_trim_fs(sbi, &range);
+ if (err < 0)
+ return err;
+
+ if (copy_to_user(user_range, &range, sizeof(range)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+
+ switch (cmd) {
+ case FITRIM:
+ return ntfs_ioctl_fitrim(sbi, arg);
+ }
+ return -ENOTTY; /* Inappropriate ioctl for device. */
+}
+
+#ifdef CONFIG_COMPAT
+static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg)
+
+{
+ return ntfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+/*
+ * ntfs_getattr - inode_operations::getattr
+ */
+int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, u32 flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ if (is_compressed(ni))
+ stat->attributes |= STATX_ATTR_COMPRESSED;
+
+ if (is_encrypted(ni))
+ stat->attributes |= STATX_ATTR_ENCRYPTED;
+
+ stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED;
+
+ generic_fillattr(mnt_userns, inode, stat);
+
+ stat->result_mask |= STATX_BTIME;
+ stat->btime = ni->i_crtime;
+ stat->blksize = ni->mi.sbi->cluster_size; /* 512, 1K, ..., 2M */
+
+ return 0;
+}
+
+static int ntfs_extend_initialized_size(struct file *file,
+ struct ntfs_inode *ni,
+ const loff_t valid,
+ const loff_t new_valid)
+{
+ struct inode *inode = &ni->vfs_inode;
+ struct address_space *mapping = inode->i_mapping;
+ struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+ loff_t pos = valid;
+ int err;
+
+ if (is_resident(ni)) {
+ ni->i_valid = new_valid;
+ return 0;
+ }
+
+ WARN_ON(is_compressed(ni));
+ WARN_ON(valid >= new_valid);
+
+ for (;;) {
+ u32 zerofrom, len;
+ struct page *page;
+ u8 bits;
+ CLST vcn, lcn, clen;
+
+ if (is_sparsed(ni)) {
+ bits = sbi->cluster_bits;
+ vcn = pos >> bits;
+
+ err = attr_data_get_block(ni, vcn, 0, &lcn, &clen,
+ NULL);
+ if (err)
+ goto out;
+
+ if (lcn == SPARSE_LCN) {
+ loff_t vbo = (loff_t)vcn << bits;
+ loff_t to = vbo + ((loff_t)clen << bits);
+
+ if (to <= new_valid) {
+ ni->i_valid = to;
+ pos = to;
+ goto next;
+ }
+
+ if (vbo < pos) {
+ pos = vbo;
+ } else {
+ to = (new_valid >> bits) << bits;
+ if (pos < to) {
+ ni->i_valid = to;
+ pos = to;
+ goto next;
+ }
+ }
+ }
+ }
+
+ zerofrom = pos & (PAGE_SIZE - 1);
+ len = PAGE_SIZE - zerofrom;
+
+ if (pos + len > new_valid)
+ len = new_valid - pos;
+
+ err = ntfs_write_begin(file, mapping, pos, len, &page, NULL);
+ if (err)
+ goto out;
+
+ zero_user_segment(page, zerofrom, PAGE_SIZE);
+
+ /* This function in any case puts page. */
+ err = ntfs_write_end(file, mapping, pos, len, len, page, NULL);
+ if (err < 0)
+ goto out;
+ pos += len;
+
+next:
+ if (pos >= new_valid)
+ break;
+
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ }
+
+ return 0;
+
+out:
+ ni->i_valid = valid;
+ ntfs_inode_warn(inode, "failed to extend initialized size to %llx.",
+ new_valid);
+ return err;
+}
+
+/*
+ * ntfs_zero_range - Helper function for punch_hole.
+ *
+ * It zeroes a range [vbo, vbo_to).
+ */
+static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
+{
+ int err = 0;
+ struct address_space *mapping = inode->i_mapping;
+ u32 blocksize = 1 << inode->i_blkbits;
+ pgoff_t idx = vbo >> PAGE_SHIFT;
+ u32 z_start = vbo & (PAGE_SIZE - 1);
+ pgoff_t idx_end = (vbo_to + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ loff_t page_off;
+ struct buffer_head *head, *bh;
+ u32 bh_next, bh_off, z_end;
+ sector_t iblock;
+ struct page *page;
+
+ for (; idx < idx_end; idx += 1, z_start = 0) {
+ page_off = (loff_t)idx << PAGE_SHIFT;
+ z_end = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off)
+ : PAGE_SIZE;
+ iblock = page_off >> inode->i_blkbits;
+
+ page = find_or_create_page(mapping, idx,
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
+ if (!page)
+ return -ENOMEM;
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, blocksize, 0);
+
+ bh = head = page_buffers(page);
+ bh_off = 0;
+ do {
+ bh_next = bh_off + blocksize;
+
+ if (bh_next <= z_start || bh_off >= z_end)
+ continue;
+
+ if (!buffer_mapped(bh)) {
+ ntfs_get_block(inode, iblock, bh, 0);
+ /* Unmapped? It's a hole - nothing to do. */
+ if (!buffer_mapped(bh))
+ continue;
+ }
+
+ /* Ok, it's mapped. Make sure it's up-to-date. */
+ if (PageUptodate(page))
+ set_buffer_uptodate(bh);
+
+ if (!buffer_uptodate(bh)) {
+ lock_buffer(bh);
+ bh->b_end_io = end_buffer_read_sync;
+ get_bh(bh);
+ submit_bh(REQ_OP_READ, bh);
+
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ unlock_page(page);
+ put_page(page);
+ err = -EIO;
+ goto out;
+ }
+ }
+
+ mark_buffer_dirty(bh);
+
+ } while (bh_off = bh_next, iblock += 1,
+ head != (bh = bh->b_this_page));
+
+ zero_user_segment(page, z_start, z_end);
+
+ unlock_page(page);
+ put_page(page);
+ cond_resched();
+ }
+out:
+ mark_inode_dirty(inode);
+ return err;
+}
+
+/*
+ * ntfs_sparse_cluster - Helper function to zero a new allocated clusters.
+ *
+ * NOTE: 512 <= cluster size <= 2M
+ */
+void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn,
+ CLST len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+ u64 vbo = (u64)vcn << sbi->cluster_bits;
+ u64 bytes = (u64)len << sbi->cluster_bits;
+ u32 blocksize = 1 << inode->i_blkbits;
+ pgoff_t idx0 = page0 ? page0->index : -1;
+ loff_t vbo_clst = vbo & sbi->cluster_mask_inv;
+ loff_t end = ntfs_up_cluster(sbi, vbo + bytes);
+ pgoff_t idx = vbo_clst >> PAGE_SHIFT;
+ u32 from = vbo_clst & (PAGE_SIZE - 1);
+ pgoff_t idx_end = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ loff_t page_off;
+ u32 to;
+ bool partial;
+ struct page *page;
+
+ for (; idx < idx_end; idx += 1, from = 0) {
+ page = idx == idx0 ? page0 : grab_cache_page(mapping, idx);
+
+ if (!page)
+ continue;
+
+ page_off = (loff_t)idx << PAGE_SHIFT;
+ to = (page_off + PAGE_SIZE) > end ? (end - page_off)
+ : PAGE_SIZE;
+ partial = false;
+
+ if ((from || PAGE_SIZE != to) &&
+ likely(!page_has_buffers(page))) {
+ create_empty_buffers(page, blocksize, 0);
+ }
+
+ if (page_has_buffers(page)) {
+ struct buffer_head *head, *bh;
+ u32 bh_off = 0;
+
+ bh = head = page_buffers(page);
+ do {
+ u32 bh_next = bh_off + blocksize;
+
+ if (from <= bh_off && bh_next <= to) {
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ } else if (!buffer_uptodate(bh)) {
+ partial = true;
+ }
+ bh_off = bh_next;
+ } while (head != (bh = bh->b_this_page));
+ }
+
+ zero_user_segment(page, from, to);
+
+ if (!partial) {
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+ set_page_dirty(page);
+ }
+
+ if (idx != idx0) {
+ unlock_page(page);
+ put_page(page);
+ }
+ cond_resched();
+ }
+ mark_inode_dirty(inode);
+}
+
+/*
+ * ntfs_file_mmap - file_operations::mmap
+ */
+static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ u64 from = ((u64)vma->vm_pgoff << PAGE_SHIFT);
+ bool rw = vma->vm_flags & VM_WRITE;
+ int err;
+
+ if (is_encrypted(ni)) {
+ ntfs_inode_warn(inode, "mmap encrypted not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (is_dedup(ni)) {
+ ntfs_inode_warn(inode, "mmap deduplicated not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (is_compressed(ni) && rw) {
+ ntfs_inode_warn(inode, "mmap(write) compressed not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (rw) {
+ u64 to = min_t(loff_t, i_size_read(inode),
+ from + vma->vm_end - vma->vm_start);
+
+ if (is_sparsed(ni)) {
+ /* Allocate clusters for rw map. */
+ struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+ CLST lcn, len;
+ CLST vcn = from >> sbi->cluster_bits;
+ CLST end = bytes_to_cluster(sbi, to);
+ bool new;
+
+ for (; vcn < end; vcn += len) {
+ err = attr_data_get_block(ni, vcn, 1, &lcn,
+ &len, &new);
+ if (err)
+ goto out;
+
+ if (!new)
+ continue;
+ ntfs_sparse_cluster(inode, NULL, vcn, 1);
+ }
+ }
+
+ if (ni->i_valid < to) {
+ if (!inode_trylock(inode)) {
+ err = -EAGAIN;
+ goto out;
+ }
+ err = ntfs_extend_initialized_size(file, ni,
+ ni->i_valid, to);
+ inode_unlock(inode);
+ if (err)
+ goto out;
+ }
+ }
+
+ err = generic_file_mmap(file, vma);
+out:
+ return err;
+}
+
+static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
+ struct file *file)
+{
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct address_space *mapping = inode->i_mapping;
+ loff_t end = pos + count;
+ bool extend_init = file && pos > ni->i_valid;
+ int err;
+
+ if (end <= inode->i_size && !extend_init)
+ return 0;
+
+ /* Mark rw ntfs as dirty. It will be cleared at umount. */
+ ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_DIRTY);
+
+ if (end > inode->i_size) {
+ err = ntfs_set_size(inode, end);
+ if (err)
+ goto out;
+ inode->i_size = end;
+ }
+
+ if (extend_init && !is_compressed(ni)) {
+ err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos);
+ if (err)
+ goto out;
+ } else {
+ err = 0;
+ }
+
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ mark_inode_dirty(inode);
+
+ if (IS_SYNC(inode)) {
+ int err2;
+
+ err = filemap_fdatawrite_range(mapping, pos, end - 1);
+ err2 = sync_mapping_buffers(mapping);
+ if (!err)
+ err = err2;
+ err2 = write_inode_now(inode, 1);
+ if (!err)
+ err = err2;
+ if (!err)
+ err = filemap_fdatawait_range(mapping, pos, end - 1);
+ }
+
+out:
+ return err;
+}
+
+static int ntfs_truncate(struct inode *inode, loff_t new_size)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ int err, dirty = 0;
+ u64 new_valid;
+
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+
+ if (is_compressed(ni)) {
+ if (ni->i_valid > new_size)
+ ni->i_valid = new_size;
+ } else {
+ err = block_truncate_page(inode->i_mapping, new_size,
+ ntfs_get_block);
+ if (err)
+ return err;
+ }
+
+ new_valid = ntfs_up_block(sb, min_t(u64, ni->i_valid, new_size));
+
+ truncate_setsize(inode, new_size);
+
+ ni_lock(ni);
+
+ down_write(&ni->file.run_lock);
+ err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size,
+ &new_valid, ni->mi.sbi->options->prealloc, NULL);
+ up_write(&ni->file.run_lock);
+
+ if (new_valid < ni->i_valid)
+ ni->i_valid = new_valid;
+
+ ni_unlock(ni);
+
+ ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ if (!IS_DIRSYNC(inode)) {
+ dirty = 1;
+ } else {
+ err = ntfs_sync_inode(inode);
+ if (err)
+ return err;
+ }
+
+ if (dirty)
+ mark_inode_dirty(inode);
+
+ /*ntfs_flush_inodes(inode->i_sb, inode, NULL);*/
+
+ return 0;
+}
+
+/*
+ * ntfs_fallocate
+ *
+ * Preallocate space for a file. This implements ntfs's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests 'len' bytes at 'vbo'. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct address_space *mapping = inode->i_mapping;
+ struct super_block *sb = inode->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ loff_t end = vbo + len;
+ loff_t vbo_down = round_down(vbo, PAGE_SIZE);
+ bool is_supported_holes = is_sparsed(ni) || is_compressed(ni);
+ loff_t i_size, new_size;
+ bool map_locked;
+ int err;
+
+ /* No support for dir. */
+ if (!S_ISREG(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ /*
+ * vfs_fallocate checks all possible combinations of mode.
+ * Do additional checks here before ntfs_set_state(dirty).
+ */
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (!is_supported_holes)
+ return -EOPNOTSUPP;
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ } else if (mode & FALLOC_FL_INSERT_RANGE) {
+ if (!is_supported_holes)
+ return -EOPNOTSUPP;
+ } else if (mode &
+ ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)) {
+ ntfs_inode_warn(inode, "fallocate(0x%x) is not supported",
+ mode);
+ return -EOPNOTSUPP;
+ }
+
+ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+
+ inode_lock(inode);
+ i_size = inode->i_size;
+ new_size = max(end, i_size);
+ map_locked = false;
+
+ if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
+ /* Should never be here, see ntfs_file_open. */
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_INSERT_RANGE)) {
+ inode_dio_wait(inode);
+ filemap_invalidate_lock(mapping);
+ map_locked = true;
+ }
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ u32 frame_size;
+ loff_t mask, vbo_a, end_a, tmp;
+
+ err = filemap_write_and_wait_range(mapping, vbo, end - 1);
+ if (err)
+ goto out;
+
+ err = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
+ if (err)
+ goto out;
+
+ truncate_pagecache(inode, vbo_down);
+
+ ni_lock(ni);
+ err = attr_punch_hole(ni, vbo, len, &frame_size);
+ ni_unlock(ni);
+ if (err != E_NTFS_NOTALIGNED)
+ goto out;
+
+ /* Process not aligned punch. */
+ mask = frame_size - 1;
+ vbo_a = (vbo + mask) & ~mask;
+ end_a = end & ~mask;
+
+ tmp = min(vbo_a, end);
+ if (tmp > vbo) {
+ err = ntfs_zero_range(inode, vbo, tmp);
+ if (err)
+ goto out;
+ }
+
+ if (vbo < end_a && end_a < end) {
+ err = ntfs_zero_range(inode, end_a, end);
+ if (err)
+ goto out;
+ }
+
+ /* Aligned punch_hole */
+ if (end_a > vbo_a) {
+ ni_lock(ni);
+ err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL);
+ ni_unlock(ni);
+ }
+ } else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+ /*
+ * Write tail of the last page before removed range since
+ * it will get removed from the page cache below.
+ */
+ err = filemap_write_and_wait_range(mapping, vbo_down, vbo);
+ if (err)
+ goto out;
+
+ /*
+ * Write data that will be shifted to preserve them
+ * when discarding page cache below.
+ */
+ err = filemap_write_and_wait_range(mapping, end, LLONG_MAX);
+ if (err)
+ goto out;
+
+ truncate_pagecache(inode, vbo_down);
+
+ ni_lock(ni);
+ err = attr_collapse_range(ni, vbo, len);
+ ni_unlock(ni);
+ } else if (mode & FALLOC_FL_INSERT_RANGE) {
+ /* Check new size. */
+ err = inode_newsize_ok(inode, new_size);
+ if (err)
+ goto out;
+
+ /* Write out all dirty pages. */
+ err = filemap_write_and_wait_range(mapping, vbo_down,
+ LLONG_MAX);
+ if (err)
+ goto out;
+ truncate_pagecache(inode, vbo_down);
+
+ ni_lock(ni);
+ err = attr_insert_range(ni, vbo, len);
+ ni_unlock(ni);
+ } else {
+ /* Check new size. */
+
+ /* generic/213: expected -ENOSPC instead of -EFBIG. */
+ if (!is_supported_holes) {
+ loff_t to_alloc = new_size - inode_get_bytes(inode);
+
+ if (to_alloc > 0 &&
+ (to_alloc >> sbi->cluster_bits) >
+ wnd_zeroes(&sbi->used.bitmap)) {
+ err = -ENOSPC;
+ goto out;
+ }
+ }
+
+ err = inode_newsize_ok(inode, new_size);
+ if (err)
+ goto out;
+
+ /*
+ * Allocate clusters, do not change 'valid' size.
+ */
+ err = ntfs_set_size(inode, new_size);
+ if (err)
+ goto out;
+
+ if (is_supported_holes) {
+ CLST vcn_v = ni->i_valid >> sbi->cluster_bits;
+ CLST vcn = vbo >> sbi->cluster_bits;
+ CLST cend = bytes_to_cluster(sbi, end);
+ CLST lcn, clen;
+ bool new;
+
+ /*
+ * Allocate but do not zero new clusters. (see below comments)
+ * This breaks security: One can read unused on-disk areas.
+ * Zeroing these clusters may be too long.
+ * Maybe we should check here for root rights?
+ */
+ for (; vcn < cend; vcn += clen) {
+ err = attr_data_get_block(ni, vcn, cend - vcn,
+ &lcn, &clen, &new);
+ if (err)
+ goto out;
+ if (!new || vcn >= vcn_v)
+ continue;
+
+ /*
+ * Unwritten area.
+ * NTFS is not able to store several unwritten areas.
+ * Activate 'ntfs_sparse_cluster' to zero new allocated clusters.
+ *
+ * Dangerous in case:
+ * 1G of sparsed clusters + 1 cluster of data =>
+ * valid_size == 1G + 1 cluster
+ * fallocate(1G) will zero 1G and this can be very long
+ * xfstest 016/086 will fail without 'ntfs_sparse_cluster'.
+ */
+ ntfs_sparse_cluster(inode, NULL, vcn,
+ min(vcn_v - vcn, clen));
+ }
+ }
+
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ ni_lock(ni);
+ /* True - Keep preallocated. */
+ err = attr_set_size(ni, ATTR_DATA, NULL, 0,
+ &ni->file.run, i_size, &ni->i_valid,
+ true, NULL);
+ ni_unlock(ni);
+ }
+ }
+
+out:
+ if (map_locked)
+ filemap_invalidate_unlock(mapping);
+
+ if (!err) {
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ mark_inode_dirty(inode);
+ }
+
+ inode_unlock(inode);
+ return err;
+}
+
+/*
+ * ntfs3_setattr - inode_operations::setattr
+ */
+int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct inode *inode = d_inode(dentry);
+ struct ntfs_inode *ni = ntfs_i(inode);
+ u32 ia_valid = attr->ia_valid;
+ umode_t mode = inode->i_mode;
+ int err;
+
+ if (sbi->options->noacsrules) {
+ /* "No access rules" - Force any changes of time etc. */
+ attr->ia_valid |= ATTR_FORCE;
+ /* and disable for editing some attributes. */
+ attr->ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE);
+ ia_valid = attr->ia_valid;
+ }
+
+ err = setattr_prepare(mnt_userns, dentry, attr);
+ if (err)
+ goto out;
+
+ if (ia_valid & ATTR_SIZE) {
+ loff_t oldsize = inode->i_size;
+
+ if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
+ /* Should never be here, see ntfs_file_open(). */
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ inode_dio_wait(inode);
+
+ if (attr->ia_size <= oldsize)
+ err = ntfs_truncate(inode, attr->ia_size);
+ else if (attr->ia_size > oldsize)
+ err = ntfs_extend(inode, attr->ia_size, 0, NULL);
+
+ if (err)
+ goto out;
+
+ ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ }
+
+ setattr_copy(mnt_userns, inode, attr);
+
+ if (mode != inode->i_mode) {
+ err = ntfs_acl_chmod(mnt_userns, inode);
+ if (err)
+ goto out;
+
+ /* Linux 'w' -> Windows 'ro'. */
+ if (0222 & inode->i_mode)
+ ni->std_fa &= ~FILE_ATTRIBUTE_READONLY;
+ else
+ ni->std_fa |= FILE_ATTRIBUTE_READONLY;
+ }
+
+ if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
+ ntfs_save_wsl_perm(inode);
+ mark_inode_dirty(inode);
+out:
+ return err;
+}
+
+static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ if (is_encrypted(ni)) {
+ ntfs_inode_warn(inode, "encrypted i/o not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) {
+ ntfs_inode_warn(inode, "direct i/o + compressed not supported");
+ return -EOPNOTSUPP;
+ }
+
+#ifndef CONFIG_NTFS3_LZX_XPRESS
+ if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) {
+ ntfs_inode_warn(
+ inode,
+ "activate CONFIG_NTFS3_LZX_XPRESS to read external compressed files");
+ return -EOPNOTSUPP;
+ }
+#endif
+
+ if (is_dedup(ni)) {
+ ntfs_inode_warn(inode, "read deduplicated not supported");
+ return -EOPNOTSUPP;
+ }
+
+ return generic_file_read_iter(iocb, iter);
+}
+
+/*
+ * ntfs_get_frame_pages
+ *
+ * Return: Array of locked pages.
+ */
+static int ntfs_get_frame_pages(struct address_space *mapping, pgoff_t index,
+ struct page **pages, u32 pages_per_frame,
+ bool *frame_uptodate)
+{
+ gfp_t gfp_mask = mapping_gfp_mask(mapping);
+ u32 npages;
+
+ *frame_uptodate = true;
+
+ for (npages = 0; npages < pages_per_frame; npages++, index++) {
+ struct page *page;
+
+ page = find_or_create_page(mapping, index, gfp_mask);
+ if (!page) {
+ while (npages--) {
+ page = pages[npages];
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return -ENOMEM;
+ }
+
+ if (!PageUptodate(page))
+ *frame_uptodate = false;
+
+ pages[npages] = page;
+ }
+
+ return 0;
+}
+
+/*
+ * ntfs_compress_write - Helper for ntfs_file_write_iter() (compressed files).
+ */
+static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ int err;
+ struct file *file = iocb->ki_filp;
+ size_t count = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
+ struct inode *inode = file_inode(file);
+ loff_t i_size = inode->i_size;
+ struct address_space *mapping = inode->i_mapping;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ u64 valid = ni->i_valid;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct page *page, **pages = NULL;
+ size_t written = 0;
+ u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
+ u32 frame_size = 1u << frame_bits;
+ u32 pages_per_frame = frame_size >> PAGE_SHIFT;
+ u32 ip, off;
+ CLST frame;
+ u64 frame_vbo;
+ pgoff_t index;
+ bool frame_uptodate;
+
+ if (frame_size < PAGE_SIZE) {
+ /*
+ * frame_size == 8K if cluster 512
+ * frame_size == 64K if cluster 4096
+ */
+ ntfs_inode_warn(inode, "page size is bigger than frame size");
+ return -EOPNOTSUPP;
+ }
+
+ pages = kmalloc_array(pages_per_frame, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+
+ current->backing_dev_info = inode_to_bdi(inode);
+ err = file_remove_privs(file);
+ if (err)
+ goto out;
+
+ err = file_update_time(file);
+ if (err)
+ goto out;
+
+ /* Zero range [valid : pos). */
+ while (valid < pos) {
+ CLST lcn, clen;
+
+ frame = valid >> frame_bits;
+ frame_vbo = valid & ~(frame_size - 1);
+ off = valid & (frame_size - 1);
+
+ err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 0, &lcn,
+ &clen, NULL);
+ if (err)
+ goto out;
+
+ if (lcn == SPARSE_LCN) {
+ ni->i_valid = valid =
+ frame_vbo + ((u64)clen << sbi->cluster_bits);
+ continue;
+ }
+
+ /* Load full frame. */
+ err = ntfs_get_frame_pages(mapping, frame_vbo >> PAGE_SHIFT,
+ pages, pages_per_frame,
+ &frame_uptodate);
+ if (err)
+ goto out;
+
+ if (!frame_uptodate && off) {
+ err = ni_read_frame(ni, frame_vbo, pages,
+ pages_per_frame);
+ if (err) {
+ for (ip = 0; ip < pages_per_frame; ip++) {
+ page = pages[ip];
+ unlock_page(page);
+ put_page(page);
+ }
+ goto out;
+ }
+ }
+
+ ip = off >> PAGE_SHIFT;
+ off = offset_in_page(valid);
+ for (; ip < pages_per_frame; ip++, off = 0) {
+ page = pages[ip];
+ zero_user_segment(page, off, PAGE_SIZE);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ }
+
+ ni_lock(ni);
+ err = ni_write_frame(ni, pages, pages_per_frame);
+ ni_unlock(ni);
+
+ for (ip = 0; ip < pages_per_frame; ip++) {
+ page = pages[ip];
+ SetPageUptodate(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ if (err)
+ goto out;
+
+ ni->i_valid = valid = frame_vbo + frame_size;
+ }
+
+ /* Copy user data [pos : pos + count). */
+ while (count) {
+ size_t copied, bytes;
+
+ off = pos & (frame_size - 1);
+ bytes = frame_size - off;
+ if (bytes > count)
+ bytes = count;
+
+ frame_vbo = pos & ~(frame_size - 1);
+ index = frame_vbo >> PAGE_SHIFT;
+
+ if (unlikely(fault_in_iov_iter_readable(from, bytes))) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ /* Load full frame. */
+ err = ntfs_get_frame_pages(mapping, index, pages,
+ pages_per_frame, &frame_uptodate);
+ if (err)
+ goto out;
+
+ if (!frame_uptodate) {
+ loff_t to = pos + bytes;
+
+ if (off || (to < i_size && (to & (frame_size - 1)))) {
+ err = ni_read_frame(ni, frame_vbo, pages,
+ pages_per_frame);
+ if (err) {
+ for (ip = 0; ip < pages_per_frame;
+ ip++) {
+ page = pages[ip];
+ unlock_page(page);
+ put_page(page);
+ }
+ goto out;
+ }
+ }
+ }
+
+ WARN_ON(!bytes);
+ copied = 0;
+ ip = off >> PAGE_SHIFT;
+ off = offset_in_page(pos);
+
+ /* Copy user data to pages. */
+ for (;;) {
+ size_t cp, tail = PAGE_SIZE - off;
+
+ page = pages[ip];
+ cp = copy_page_from_iter_atomic(page, off,
+ min(tail, bytes), from);
+ flush_dcache_page(page);
+
+ copied += cp;
+ bytes -= cp;
+ if (!bytes || !cp)
+ break;
+
+ if (cp < tail) {
+ off += cp;
+ } else {
+ ip++;
+ off = 0;
+ }
+ }
+
+ ni_lock(ni);
+ err = ni_write_frame(ni, pages, pages_per_frame);
+ ni_unlock(ni);
+
+ for (ip = 0; ip < pages_per_frame; ip++) {
+ page = pages[ip];
+ ClearPageDirty(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ put_page(page);
+ }
+
+ if (err)
+ goto out;
+
+ /*
+ * We can loop for a long time in here. Be nice and allow
+ * us to schedule out to avoid softlocking if preempt
+ * is disabled.
+ */
+ cond_resched();
+
+ pos += copied;
+ written += copied;
+
+ count = iov_iter_count(from);
+ }
+
+out:
+ kfree(pages);
+
+ current->backing_dev_info = NULL;
+
+ if (err < 0)
+ return err;
+
+ iocb->ki_pos += written;
+ if (iocb->ki_pos > ni->i_valid)
+ ni->i_valid = iocb->ki_pos;
+
+ return written;
+}
+
+/*
+ * ntfs_file_write_iter - file_operations::write_iter
+ */
+static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t ret;
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ if (is_encrypted(ni)) {
+ ntfs_inode_warn(inode, "encrypted i/o not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) {
+ ntfs_inode_warn(inode, "direct i/o + compressed not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (is_dedup(ni)) {
+ ntfs_inode_warn(inode, "write into deduplicated not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (!inode_trylock(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock(inode);
+ }
+
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
+ if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
+ /* Should never be here, see ntfs_file_open(). */
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = ntfs_extend(inode, iocb->ki_pos, ret, file);
+ if (ret)
+ goto out;
+
+ ret = is_compressed(ni) ? ntfs_compress_write(iocb, from)
+ : __generic_file_write_iter(iocb, from);
+
+out:
+ inode_unlock(inode);
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+
+ return ret;
+}
+
+/*
+ * ntfs_file_open - file_operations::open
+ */
+int ntfs_file_open(struct inode *inode, struct file *file)
+{
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ if (unlikely((is_compressed(ni) || is_encrypted(ni)) &&
+ (file->f_flags & O_DIRECT))) {
+ return -EOPNOTSUPP;
+ }
+
+ /* Decompress "external compressed" file if opened for rw. */
+ if ((ni->ni_flags & NI_FLAG_COMPRESSED_MASK) &&
+ (file->f_flags & (O_WRONLY | O_RDWR | O_TRUNC))) {
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+ int err = ni_decompress_file(ni);
+
+ if (err)
+ return err;
+#else
+ ntfs_inode_warn(
+ inode,
+ "activate CONFIG_NTFS3_LZX_XPRESS to write external compressed files");
+ return -EOPNOTSUPP;
+#endif
+ }
+
+ return generic_file_open(inode, file);
+}
+
+/*
+ * ntfs_file_release - file_operations::release
+ */
+static int ntfs_file_release(struct inode *inode, struct file *file)
+{
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ int err = 0;
+
+ /* If we are last writer on the inode, drop the block reservation. */
+ if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) &&
+ atomic_read(&inode->i_writecount) == 1)) {
+ ni_lock(ni);
+ down_write(&ni->file.run_lock);
+
+ err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run,
+ inode->i_size, &ni->i_valid, false, NULL);
+
+ up_write(&ni->file.run_lock);
+ ni_unlock(ni);
+ }
+ return err;
+}
+
+/*
+ * ntfs_fiemap - file_operations::fiemap
+ */
+int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len)
+{
+ int err;
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ err = fiemap_prep(inode, fieinfo, start, &len, ~FIEMAP_FLAG_XATTR);
+ if (err)
+ return err;
+
+ ni_lock(ni);
+
+ err = ni_fiemap(ni, fieinfo, start, len);
+
+ ni_unlock(ni);
+
+ return err;
+}
+
+// clang-format off
+const struct inode_operations ntfs_file_inode_operations = {
+ .getattr = ntfs_getattr,
+ .setattr = ntfs3_setattr,
+ .listxattr = ntfs_listxattr,
+ .permission = ntfs_permission,
+ .get_acl = ntfs_get_acl,
+ .set_acl = ntfs_set_acl,
+ .fiemap = ntfs_fiemap,
+};
+
+const struct file_operations ntfs_file_operations = {
+ .llseek = generic_file_llseek,
+ .read_iter = ntfs_file_read_iter,
+ .write_iter = ntfs_file_write_iter,
+ .unlocked_ioctl = ntfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ntfs_compat_ioctl,
+#endif
+ .splice_read = generic_file_splice_read,
+ .mmap = ntfs_file_mmap,
+ .open = ntfs_file_open,
+ .fsync = generic_file_fsync,
+ .splice_write = iter_file_splice_write,
+ .fallocate = ntfs_fallocate,
+ .release = ntfs_file_release,
+};
+// clang-format on
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
new file mode 100644
index 000000000..bb7e33c24
--- /dev/null
+++ b/fs/ntfs3/frecord.c
@@ -0,0 +1,3368 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/fiemap.h>
+#include <linux/fs.h>
+#include <linux/minmax.h>
+#include <linux/vmalloc.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+#include "lib/lib.h"
+#endif
+
+static struct mft_inode *ni_ins_mi(struct ntfs_inode *ni, struct rb_root *tree,
+ CLST ino, struct rb_node *ins)
+{
+ struct rb_node **p = &tree->rb_node;
+ struct rb_node *pr = NULL;
+
+ while (*p) {
+ struct mft_inode *mi;
+
+ pr = *p;
+ mi = rb_entry(pr, struct mft_inode, node);
+ if (mi->rno > ino)
+ p = &pr->rb_left;
+ else if (mi->rno < ino)
+ p = &pr->rb_right;
+ else
+ return mi;
+ }
+
+ if (!ins)
+ return NULL;
+
+ rb_link_node(ins, pr, p);
+ rb_insert_color(ins, tree);
+ return rb_entry(ins, struct mft_inode, node);
+}
+
+/*
+ * ni_find_mi - Find mft_inode by record number.
+ */
+static struct mft_inode *ni_find_mi(struct ntfs_inode *ni, CLST rno)
+{
+ return ni_ins_mi(ni, &ni->mi_tree, rno, NULL);
+}
+
+/*
+ * ni_add_mi - Add new mft_inode into ntfs_inode.
+ */
+static void ni_add_mi(struct ntfs_inode *ni, struct mft_inode *mi)
+{
+ ni_ins_mi(ni, &ni->mi_tree, mi->rno, &mi->node);
+}
+
+/*
+ * ni_remove_mi - Remove mft_inode from ntfs_inode.
+ */
+void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi)
+{
+ rb_erase(&mi->node, &ni->mi_tree);
+}
+
+/*
+ * ni_std - Return: Pointer into std_info from primary record.
+ */
+struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni)
+{
+ const struct ATTRIB *attr;
+
+ attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL);
+ return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO))
+ : NULL;
+}
+
+/*
+ * ni_std5
+ *
+ * Return: Pointer into std_info from primary record.
+ */
+struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni)
+{
+ const struct ATTRIB *attr;
+
+ attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL);
+
+ return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5))
+ : NULL;
+}
+
+/*
+ * ni_clear - Clear resources allocated by ntfs_inode.
+ */
+void ni_clear(struct ntfs_inode *ni)
+{
+ struct rb_node *node;
+
+ if (!ni->vfs_inode.i_nlink && is_rec_inuse(ni->mi.mrec))
+ ni_delete_all(ni);
+
+ al_destroy(ni);
+
+ for (node = rb_first(&ni->mi_tree); node;) {
+ struct rb_node *next = rb_next(node);
+ struct mft_inode *mi = rb_entry(node, struct mft_inode, node);
+
+ rb_erase(node, &ni->mi_tree);
+ mi_put(mi);
+ node = next;
+ }
+
+ /* Bad inode always has mode == S_IFREG. */
+ if (ni->ni_flags & NI_FLAG_DIR)
+ indx_clear(&ni->dir);
+ else {
+ run_close(&ni->file.run);
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+ if (ni->file.offs_page) {
+ /* On-demand allocated page for offsets. */
+ put_page(ni->file.offs_page);
+ ni->file.offs_page = NULL;
+ }
+#endif
+ }
+
+ mi_clear(&ni->mi);
+}
+
+/*
+ * ni_load_mi_ex - Find mft_inode by record number.
+ */
+int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi)
+{
+ int err;
+ struct mft_inode *r;
+
+ r = ni_find_mi(ni, rno);
+ if (r)
+ goto out;
+
+ err = mi_get(ni->mi.sbi, rno, &r);
+ if (err)
+ return err;
+
+ ni_add_mi(ni, r);
+
+out:
+ if (mi)
+ *mi = r;
+ return 0;
+}
+
+/*
+ * ni_load_mi - Load mft_inode corresponded list_entry.
+ */
+int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le,
+ struct mft_inode **mi)
+{
+ CLST rno;
+
+ if (!le) {
+ *mi = &ni->mi;
+ return 0;
+ }
+
+ rno = ino_get(&le->ref);
+ if (rno == ni->mi.rno) {
+ *mi = &ni->mi;
+ return 0;
+ }
+ return ni_load_mi_ex(ni, rno, mi);
+}
+
+/*
+ * ni_find_attr
+ *
+ * Return: Attribute and record this attribute belongs to.
+ */
+struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct ATTR_LIST_ENTRY **le_o, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, const CLST *vcn,
+ struct mft_inode **mi)
+{
+ struct ATTR_LIST_ENTRY *le;
+ struct mft_inode *m;
+
+ if (!ni->attr_list.size ||
+ (!name_len && (type == ATTR_LIST || type == ATTR_STD))) {
+ if (le_o)
+ *le_o = NULL;
+ if (mi)
+ *mi = &ni->mi;
+
+ /* Look for required attribute in primary record. */
+ return mi_find_attr(&ni->mi, attr, type, name, name_len, NULL);
+ }
+
+ /* First look for list entry of required type. */
+ le = al_find_ex(ni, le_o ? *le_o : NULL, type, name, name_len, vcn);
+ if (!le)
+ return NULL;
+
+ if (le_o)
+ *le_o = le;
+
+ /* Load record that contains this attribute. */
+ if (ni_load_mi(ni, le, &m))
+ return NULL;
+
+ /* Look for required attribute. */
+ attr = mi_find_attr(m, NULL, type, name, name_len, &le->id);
+
+ if (!attr)
+ goto out;
+
+ if (!attr->non_res) {
+ if (vcn && *vcn)
+ goto out;
+ } else if (!vcn) {
+ if (attr->nres.svcn)
+ goto out;
+ } else if (le64_to_cpu(attr->nres.svcn) > *vcn ||
+ *vcn > le64_to_cpu(attr->nres.evcn)) {
+ goto out;
+ }
+
+ if (mi)
+ *mi = m;
+ return attr;
+
+out:
+ ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
+ return NULL;
+}
+
+/*
+ * ni_enum_attr_ex - Enumerates attributes in ntfs_inode.
+ */
+struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct ATTR_LIST_ENTRY **le,
+ struct mft_inode **mi)
+{
+ struct mft_inode *mi2;
+ struct ATTR_LIST_ENTRY *le2;
+
+ /* Do we have an attribute list? */
+ if (!ni->attr_list.size) {
+ *le = NULL;
+ if (mi)
+ *mi = &ni->mi;
+ /* Enum attributes in primary record. */
+ return mi_enum_attr(&ni->mi, attr);
+ }
+
+ /* Get next list entry. */
+ le2 = *le = al_enumerate(ni, attr ? *le : NULL);
+ if (!le2)
+ return NULL;
+
+ /* Load record that contains the required attribute. */
+ if (ni_load_mi(ni, le2, &mi2))
+ return NULL;
+
+ if (mi)
+ *mi = mi2;
+
+ /* Find attribute in loaded record. */
+ return rec_find_attr_le(mi2, le2);
+}
+
+/*
+ * ni_load_attr - Load attribute that contains given VCN.
+ */
+struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, CLST vcn,
+ struct mft_inode **pmi)
+{
+ struct ATTR_LIST_ENTRY *le;
+ struct ATTRIB *attr;
+ struct mft_inode *mi;
+ struct ATTR_LIST_ENTRY *next;
+
+ if (!ni->attr_list.size) {
+ if (pmi)
+ *pmi = &ni->mi;
+ return mi_find_attr(&ni->mi, NULL, type, name, name_len, NULL);
+ }
+
+ le = al_find_ex(ni, NULL, type, name, name_len, NULL);
+ if (!le)
+ return NULL;
+
+ /*
+ * Unfortunately ATTR_LIST_ENTRY contains only start VCN.
+ * So to find the ATTRIB segment that contains 'vcn' we should
+ * enumerate some entries.
+ */
+ if (vcn) {
+ for (;; le = next) {
+ next = al_find_ex(ni, le, type, name, name_len, NULL);
+ if (!next || le64_to_cpu(next->vcn) > vcn)
+ break;
+ }
+ }
+
+ if (ni_load_mi(ni, le, &mi))
+ return NULL;
+
+ if (pmi)
+ *pmi = mi;
+
+ attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id);
+ if (!attr)
+ return NULL;
+
+ if (!attr->non_res)
+ return attr;
+
+ if (le64_to_cpu(attr->nres.svcn) <= vcn &&
+ vcn <= le64_to_cpu(attr->nres.evcn))
+ return attr;
+
+ return NULL;
+}
+
+/*
+ * ni_load_all_mi - Load all subrecords.
+ */
+int ni_load_all_mi(struct ntfs_inode *ni)
+{
+ int err;
+ struct ATTR_LIST_ENTRY *le;
+
+ if (!ni->attr_list.size)
+ return 0;
+
+ le = NULL;
+
+ while ((le = al_enumerate(ni, le))) {
+ CLST rno = ino_get(&le->ref);
+
+ if (rno == ni->mi.rno)
+ continue;
+
+ err = ni_load_mi_ex(ni, rno, NULL);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/*
+ * ni_add_subrecord - Allocate + format + attach a new subrecord.
+ */
+bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi)
+{
+ struct mft_inode *m;
+
+ m = kzalloc(sizeof(struct mft_inode), GFP_NOFS);
+ if (!m)
+ return false;
+
+ if (mi_format_new(m, ni->mi.sbi, rno, 0, ni->mi.rno == MFT_REC_MFT)) {
+ mi_put(m);
+ return false;
+ }
+
+ mi_get_ref(&ni->mi, &m->mrec->parent_ref);
+
+ ni_add_mi(ni, m);
+ *mi = m;
+ return true;
+}
+
+/*
+ * ni_remove_attr - Remove all attributes for the given type/name/id.
+ */
+int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, size_t name_len, bool base_only,
+ const __le16 *id)
+{
+ int err;
+ struct ATTRIB *attr;
+ struct ATTR_LIST_ENTRY *le;
+ struct mft_inode *mi;
+ u32 type_in;
+ int diff;
+
+ if (base_only || type == ATTR_LIST || !ni->attr_list.size) {
+ attr = mi_find_attr(&ni->mi, NULL, type, name, name_len, id);
+ if (!attr)
+ return -ENOENT;
+
+ mi_remove_attr(ni, &ni->mi, attr);
+ return 0;
+ }
+
+ type_in = le32_to_cpu(type);
+ le = NULL;
+
+ for (;;) {
+ le = al_enumerate(ni, le);
+ if (!le)
+ return 0;
+
+next_le2:
+ diff = le32_to_cpu(le->type) - type_in;
+ if (diff < 0)
+ continue;
+
+ if (diff > 0)
+ return 0;
+
+ if (le->name_len != name_len)
+ continue;
+
+ if (name_len &&
+ memcmp(le_name(le), name, name_len * sizeof(short)))
+ continue;
+
+ if (id && le->id != *id)
+ continue;
+ err = ni_load_mi(ni, le, &mi);
+ if (err)
+ return err;
+
+ al_remove_le(ni, le);
+
+ attr = mi_find_attr(mi, NULL, type, name, name_len, id);
+ if (!attr)
+ return -ENOENT;
+
+ mi_remove_attr(ni, mi, attr);
+
+ if (PtrOffset(ni->attr_list.le, le) >= ni->attr_list.size)
+ return 0;
+ goto next_le2;
+ }
+}
+
+/*
+ * ni_ins_new_attr - Insert the attribute into record.
+ *
+ * Return: Not full constructed attribute or NULL if not possible to create.
+ */
+static struct ATTRIB *
+ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi,
+ struct ATTR_LIST_ENTRY *le, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, u32 asize, u16 name_off,
+ CLST svcn, struct ATTR_LIST_ENTRY **ins_le)
+{
+ int err;
+ struct ATTRIB *attr;
+ bool le_added = false;
+ struct MFT_REF ref;
+
+ mi_get_ref(mi, &ref);
+
+ if (type != ATTR_LIST && !le && ni->attr_list.size) {
+ err = al_add_le(ni, type, name, name_len, svcn, cpu_to_le16(-1),
+ &ref, &le);
+ if (err) {
+ /* No memory or no space. */
+ return ERR_PTR(err);
+ }
+ le_added = true;
+
+ /*
+ * al_add_le -> attr_set_size (list) -> ni_expand_list
+ * which moves some attributes out of primary record
+ * this means that name may point into moved memory
+ * reinit 'name' from le.
+ */
+ name = le->name;
+ }
+
+ attr = mi_insert_attr(mi, type, name, name_len, asize, name_off);
+ if (!attr) {
+ if (le_added)
+ al_remove_le(ni, le);
+ return NULL;
+ }
+
+ if (type == ATTR_LIST) {
+ /* Attr list is not in list entry array. */
+ goto out;
+ }
+
+ if (!le)
+ goto out;
+
+ /* Update ATTRIB Id and record reference. */
+ le->id = attr->id;
+ ni->attr_list.dirty = true;
+ le->ref = ref;
+
+out:
+ if (ins_le)
+ *ins_le = le;
+ return attr;
+}
+
+/*
+ * ni_repack
+ *
+ * Random write access to sparsed or compressed file may result to
+ * not optimized packed runs.
+ * Here is the place to optimize it.
+ */
+static int ni_repack(struct ntfs_inode *ni)
+{
+ int err = 0;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct mft_inode *mi, *mi_p = NULL;
+ struct ATTRIB *attr = NULL, *attr_p;
+ struct ATTR_LIST_ENTRY *le = NULL, *le_p;
+ CLST alloc = 0;
+ u8 cluster_bits = sbi->cluster_bits;
+ CLST svcn, evcn = 0, svcn_p, evcn_p, next_svcn;
+ u32 roff, rs = sbi->record_size;
+ struct runs_tree run;
+
+ run_init(&run);
+
+ while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi))) {
+ if (!attr->non_res)
+ continue;
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ if (svcn != le64_to_cpu(le->vcn)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (!svcn) {
+ alloc = le64_to_cpu(attr->nres.alloc_size) >>
+ cluster_bits;
+ mi_p = NULL;
+ } else if (svcn != evcn + 1) {
+ err = -EINVAL;
+ break;
+ }
+
+ evcn = le64_to_cpu(attr->nres.evcn);
+
+ if (svcn > evcn + 1) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (!mi_p) {
+ /* Do not try if not enogh free space. */
+ if (le32_to_cpu(mi->mrec->used) + 8 >= rs)
+ continue;
+
+ /* Do not try if last attribute segment. */
+ if (evcn + 1 == alloc)
+ continue;
+ run_close(&run);
+ }
+
+ roff = le16_to_cpu(attr->nres.run_off);
+
+ if (roff > le32_to_cpu(attr->size)) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_unpack(&run, sbi, ni->mi.rno, svcn, evcn, svcn,
+ Add2Ptr(attr, roff),
+ le32_to_cpu(attr->size) - roff);
+ if (err < 0)
+ break;
+
+ if (!mi_p) {
+ mi_p = mi;
+ attr_p = attr;
+ svcn_p = svcn;
+ evcn_p = evcn;
+ le_p = le;
+ err = 0;
+ continue;
+ }
+
+ /*
+ * Run contains data from two records: mi_p and mi
+ * Try to pack in one.
+ */
+ err = mi_pack_runs(mi_p, attr_p, &run, evcn + 1 - svcn_p);
+ if (err)
+ break;
+
+ next_svcn = le64_to_cpu(attr_p->nres.evcn) + 1;
+
+ if (next_svcn >= evcn + 1) {
+ /* We can remove this attribute segment. */
+ al_remove_le(ni, le);
+ mi_remove_attr(NULL, mi, attr);
+ le = le_p;
+ continue;
+ }
+
+ attr->nres.svcn = le->vcn = cpu_to_le64(next_svcn);
+ mi->dirty = true;
+ ni->attr_list.dirty = true;
+
+ if (evcn + 1 == alloc) {
+ err = mi_pack_runs(mi, attr, &run,
+ evcn + 1 - next_svcn);
+ if (err)
+ break;
+ mi_p = NULL;
+ } else {
+ mi_p = mi;
+ attr_p = attr;
+ svcn_p = next_svcn;
+ evcn_p = evcn;
+ le_p = le;
+ run_truncate_head(&run, next_svcn);
+ }
+ }
+
+ if (err) {
+ ntfs_inode_warn(&ni->vfs_inode, "repack problem");
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+
+ /* Pack loaded but not packed runs. */
+ if (mi_p)
+ mi_pack_runs(mi_p, attr_p, &run, evcn_p + 1 - svcn_p);
+ }
+
+ run_close(&run);
+ return err;
+}
+
+/*
+ * ni_try_remove_attr_list
+ *
+ * Can we remove attribute list?
+ * Check the case when primary record contains enough space for all attributes.
+ */
+static int ni_try_remove_attr_list(struct ntfs_inode *ni)
+{
+ int err = 0;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTRIB *attr, *attr_list, *attr_ins;
+ struct ATTR_LIST_ENTRY *le;
+ struct mft_inode *mi;
+ u32 asize, free;
+ struct MFT_REF ref;
+ struct MFT_REC *mrec;
+ __le16 id;
+
+ if (!ni->attr_list.dirty)
+ return 0;
+
+ err = ni_repack(ni);
+ if (err)
+ return err;
+
+ attr_list = mi_find_attr(&ni->mi, NULL, ATTR_LIST, NULL, 0, NULL);
+ if (!attr_list)
+ return 0;
+
+ asize = le32_to_cpu(attr_list->size);
+
+ /* Free space in primary record without attribute list. */
+ free = sbi->record_size - le32_to_cpu(ni->mi.mrec->used) + asize;
+ mi_get_ref(&ni->mi, &ref);
+
+ le = NULL;
+ while ((le = al_enumerate(ni, le))) {
+ if (!memcmp(&le->ref, &ref, sizeof(ref)))
+ continue;
+
+ if (le->vcn)
+ return 0;
+
+ mi = ni_find_mi(ni, ino_get(&le->ref));
+ if (!mi)
+ return 0;
+
+ attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+ le->name_len, &le->id);
+ if (!attr)
+ return 0;
+
+ asize = le32_to_cpu(attr->size);
+ if (asize > free)
+ return 0;
+
+ free -= asize;
+ }
+
+ /* Make a copy of primary record to restore if error. */
+ mrec = kmemdup(ni->mi.mrec, sbi->record_size, GFP_NOFS);
+ if (!mrec)
+ return 0; /* Not critical. */
+
+ /* It seems that attribute list can be removed from primary record. */
+ mi_remove_attr(NULL, &ni->mi, attr_list);
+
+ /*
+ * Repeat the cycle above and copy all attributes to primary record.
+ * Do not remove original attributes from subrecords!
+ * It should be success!
+ */
+ le = NULL;
+ while ((le = al_enumerate(ni, le))) {
+ if (!memcmp(&le->ref, &ref, sizeof(ref)))
+ continue;
+
+ mi = ni_find_mi(ni, ino_get(&le->ref));
+ if (!mi) {
+ /* Should never happened, 'cause already checked. */
+ goto out;
+ }
+
+ attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+ le->name_len, &le->id);
+ if (!attr) {
+ /* Should never happened, 'cause already checked. */
+ goto out;
+ }
+ asize = le32_to_cpu(attr->size);
+
+ /* Insert into primary record. */
+ attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le),
+ le->name_len, asize,
+ le16_to_cpu(attr->name_off));
+ if (!attr_ins) {
+ /*
+ * No space in primary record (already checked).
+ */
+ goto out;
+ }
+
+ /* Copy all except id. */
+ id = attr_ins->id;
+ memcpy(attr_ins, attr, asize);
+ attr_ins->id = id;
+ }
+
+ /*
+ * Repeat the cycle above and remove all attributes from subrecords.
+ */
+ le = NULL;
+ while ((le = al_enumerate(ni, le))) {
+ if (!memcmp(&le->ref, &ref, sizeof(ref)))
+ continue;
+
+ mi = ni_find_mi(ni, ino_get(&le->ref));
+ if (!mi)
+ continue;
+
+ attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+ le->name_len, &le->id);
+ if (!attr)
+ continue;
+
+ /* Remove from original record. */
+ mi_remove_attr(NULL, mi, attr);
+ }
+
+ run_deallocate(sbi, &ni->attr_list.run, true);
+ run_close(&ni->attr_list.run);
+ ni->attr_list.size = 0;
+ kfree(ni->attr_list.le);
+ ni->attr_list.le = NULL;
+ ni->attr_list.dirty = false;
+
+ kfree(mrec);
+ return 0;
+out:
+ /* Restore primary record. */
+ swap(mrec, ni->mi.mrec);
+ kfree(mrec);
+ return 0;
+}
+
+/*
+ * ni_create_attr_list - Generates an attribute list for this primary record.
+ */
+int ni_create_attr_list(struct ntfs_inode *ni)
+{
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ int err;
+ u32 lsize;
+ struct ATTRIB *attr;
+ struct ATTRIB *arr_move[7];
+ struct ATTR_LIST_ENTRY *le, *le_b[7];
+ struct MFT_REC *rec;
+ bool is_mft;
+ CLST rno = 0;
+ struct mft_inode *mi;
+ u32 free_b, nb, to_free, rs;
+ u16 sz;
+
+ is_mft = ni->mi.rno == MFT_REC_MFT;
+ rec = ni->mi.mrec;
+ rs = sbi->record_size;
+
+ /*
+ * Skip estimating exact memory requirement.
+ * Looks like one record_size is always enough.
+ */
+ le = kmalloc(al_aligned(rs), GFP_NOFS);
+ if (!le) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ mi_get_ref(&ni->mi, &le->ref);
+ ni->attr_list.le = le;
+
+ attr = NULL;
+ nb = 0;
+ free_b = 0;
+ attr = NULL;
+
+ for (; (attr = mi_enum_attr(&ni->mi, attr)); le = Add2Ptr(le, sz)) {
+ sz = le_size(attr->name_len);
+ le->type = attr->type;
+ le->size = cpu_to_le16(sz);
+ le->name_len = attr->name_len;
+ le->name_off = offsetof(struct ATTR_LIST_ENTRY, name);
+ le->vcn = 0;
+ if (le != ni->attr_list.le)
+ le->ref = ni->attr_list.le->ref;
+ le->id = attr->id;
+
+ if (attr->name_len)
+ memcpy(le->name, attr_name(attr),
+ sizeof(short) * attr->name_len);
+ else if (attr->type == ATTR_STD)
+ continue;
+ else if (attr->type == ATTR_LIST)
+ continue;
+ else if (is_mft && attr->type == ATTR_DATA)
+ continue;
+
+ if (!nb || nb < ARRAY_SIZE(arr_move)) {
+ le_b[nb] = le;
+ arr_move[nb++] = attr;
+ free_b += le32_to_cpu(attr->size);
+ }
+ }
+
+ lsize = PtrOffset(ni->attr_list.le, le);
+ ni->attr_list.size = lsize;
+
+ to_free = le32_to_cpu(rec->used) + lsize + SIZEOF_RESIDENT;
+ if (to_free <= rs) {
+ to_free = 0;
+ } else {
+ to_free -= rs;
+
+ if (to_free > free_b) {
+ err = -EINVAL;
+ goto out1;
+ }
+ }
+
+ /* Allocate child MFT. */
+ err = ntfs_look_free_mft(sbi, &rno, is_mft, ni, &mi);
+ if (err)
+ goto out1;
+
+ err = -EINVAL;
+ /* Call mi_remove_attr() in reverse order to keep pointers 'arr_move' valid. */
+ while (to_free > 0) {
+ struct ATTRIB *b = arr_move[--nb];
+ u32 asize = le32_to_cpu(b->size);
+ u16 name_off = le16_to_cpu(b->name_off);
+
+ attr = mi_insert_attr(mi, b->type, Add2Ptr(b, name_off),
+ b->name_len, asize, name_off);
+ if (!attr)
+ goto out1;
+
+ mi_get_ref(mi, &le_b[nb]->ref);
+ le_b[nb]->id = attr->id;
+
+ /* Copy all except id. */
+ memcpy(attr, b, asize);
+ attr->id = le_b[nb]->id;
+
+ /* Remove from primary record. */
+ if (!mi_remove_attr(NULL, &ni->mi, b))
+ goto out1;
+
+ if (to_free <= asize)
+ break;
+ to_free -= asize;
+ if (!nb)
+ goto out1;
+ }
+
+ attr = mi_insert_attr(&ni->mi, ATTR_LIST, NULL, 0,
+ lsize + SIZEOF_RESIDENT, SIZEOF_RESIDENT);
+ if (!attr)
+ goto out1;
+
+ attr->non_res = 0;
+ attr->flags = 0;
+ attr->res.data_size = cpu_to_le32(lsize);
+ attr->res.data_off = SIZEOF_RESIDENT_LE;
+ attr->res.flags = 0;
+ attr->res.res = 0;
+
+ memcpy(resident_data_ex(attr, lsize), ni->attr_list.le, lsize);
+
+ ni->attr_list.dirty = false;
+
+ mark_inode_dirty(&ni->vfs_inode);
+ goto out;
+
+out1:
+ kfree(ni->attr_list.le);
+ ni->attr_list.le = NULL;
+ ni->attr_list.size = 0;
+ return err;
+
+out:
+ return 0;
+}
+
+/*
+ * ni_ins_attr_ext - Add an external attribute to the ntfs_inode.
+ */
+static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le,
+ enum ATTR_TYPE type, const __le16 *name, u8 name_len,
+ u32 asize, CLST svcn, u16 name_off, bool force_ext,
+ struct ATTRIB **ins_attr, struct mft_inode **ins_mi,
+ struct ATTR_LIST_ENTRY **ins_le)
+{
+ struct ATTRIB *attr;
+ struct mft_inode *mi;
+ CLST rno;
+ u64 vbo;
+ struct rb_node *node;
+ int err;
+ bool is_mft, is_mft_data;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+
+ is_mft = ni->mi.rno == MFT_REC_MFT;
+ is_mft_data = is_mft && type == ATTR_DATA && !name_len;
+
+ if (asize > sbi->max_bytes_per_attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Standard information and attr_list cannot be made external.
+ * The Log File cannot have any external attributes.
+ */
+ if (type == ATTR_STD || type == ATTR_LIST ||
+ ni->mi.rno == MFT_REC_LOG) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Create attribute list if it is not already existed. */
+ if (!ni->attr_list.size) {
+ err = ni_create_attr_list(ni);
+ if (err)
+ goto out;
+ }
+
+ vbo = is_mft_data ? ((u64)svcn << sbi->cluster_bits) : 0;
+
+ if (force_ext)
+ goto insert_ext;
+
+ /* Load all subrecords into memory. */
+ err = ni_load_all_mi(ni);
+ if (err)
+ goto out;
+
+ /* Check each of loaded subrecord. */
+ for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) {
+ mi = rb_entry(node, struct mft_inode, node);
+
+ if (is_mft_data &&
+ (mi_enum_attr(mi, NULL) ||
+ vbo <= ((u64)mi->rno << sbi->record_bits))) {
+ /* We can't accept this record 'cause MFT's bootstrapping. */
+ continue;
+ }
+ if (is_mft &&
+ mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, NULL)) {
+ /*
+ * This child record already has a ATTR_DATA.
+ * So it can't accept any other records.
+ */
+ continue;
+ }
+
+ if ((type != ATTR_NAME || name_len) &&
+ mi_find_attr(mi, NULL, type, name, name_len, NULL)) {
+ /* Only indexed attributes can share same record. */
+ continue;
+ }
+
+ /*
+ * Do not try to insert this attribute
+ * if there is no room in record.
+ */
+ if (le32_to_cpu(mi->mrec->used) + asize > sbi->record_size)
+ continue;
+
+ /* Try to insert attribute into this subrecord. */
+ attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize,
+ name_off, svcn, ins_le);
+ if (!attr)
+ continue;
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ if (ins_attr)
+ *ins_attr = attr;
+ if (ins_mi)
+ *ins_mi = mi;
+ return 0;
+ }
+
+insert_ext:
+ /* We have to allocate a new child subrecord. */
+ err = ntfs_look_free_mft(sbi, &rno, is_mft_data, ni, &mi);
+ if (err)
+ goto out;
+
+ if (is_mft_data && vbo <= ((u64)rno << sbi->record_bits)) {
+ err = -EINVAL;
+ goto out1;
+ }
+
+ attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize,
+ name_off, svcn, ins_le);
+ if (!attr) {
+ err = -EINVAL;
+ goto out2;
+ }
+
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out2;
+ }
+
+ if (ins_attr)
+ *ins_attr = attr;
+ if (ins_mi)
+ *ins_mi = mi;
+
+ return 0;
+
+out2:
+ ni_remove_mi(ni, mi);
+ mi_put(mi);
+
+out1:
+ ntfs_mark_rec_free(sbi, rno, is_mft);
+
+out:
+ return err;
+}
+
+/*
+ * ni_insert_attr - Insert an attribute into the file.
+ *
+ * If the primary record has room, it will just insert the attribute.
+ * If not, it may make the attribute external.
+ * For $MFT::Data it may make room for the attribute by
+ * making other attributes external.
+ *
+ * NOTE:
+ * The ATTR_LIST and ATTR_STD cannot be made external.
+ * This function does not fill new attribute full.
+ * It only fills 'size'/'type'/'id'/'name_len' fields.
+ */
+static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, u32 asize,
+ u16 name_off, CLST svcn, struct ATTRIB **ins_attr,
+ struct mft_inode **ins_mi,
+ struct ATTR_LIST_ENTRY **ins_le)
+{
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ int err;
+ struct ATTRIB *attr, *eattr;
+ struct MFT_REC *rec;
+ bool is_mft;
+ struct ATTR_LIST_ENTRY *le;
+ u32 list_reserve, max_free, free, used, t32;
+ __le16 id;
+ u16 t16;
+
+ is_mft = ni->mi.rno == MFT_REC_MFT;
+ rec = ni->mi.mrec;
+
+ list_reserve = SIZEOF_NONRESIDENT + 3 * (1 + 2 * sizeof(u32));
+ used = le32_to_cpu(rec->used);
+ free = sbi->record_size - used;
+
+ if (is_mft && type != ATTR_LIST) {
+ /* Reserve space for the ATTRIB list. */
+ if (free < list_reserve)
+ free = 0;
+ else
+ free -= list_reserve;
+ }
+
+ if (asize <= free) {
+ attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len,
+ asize, name_off, svcn, ins_le);
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out;
+ }
+
+ if (attr) {
+ if (ins_attr)
+ *ins_attr = attr;
+ if (ins_mi)
+ *ins_mi = &ni->mi;
+ err = 0;
+ goto out;
+ }
+ }
+
+ if (!is_mft || type != ATTR_DATA || svcn) {
+ /* This ATTRIB will be external. */
+ err = ni_ins_attr_ext(ni, NULL, type, name, name_len, asize,
+ svcn, name_off, false, ins_attr, ins_mi,
+ ins_le);
+ goto out;
+ }
+
+ /*
+ * Here we have: "is_mft && type == ATTR_DATA && !svcn"
+ *
+ * The first chunk of the $MFT::Data ATTRIB must be the base record.
+ * Evict as many other attributes as possible.
+ */
+ max_free = free;
+
+ /* Estimate the result of moving all possible attributes away. */
+ attr = NULL;
+
+ while ((attr = mi_enum_attr(&ni->mi, attr))) {
+ if (attr->type == ATTR_STD)
+ continue;
+ if (attr->type == ATTR_LIST)
+ continue;
+ max_free += le32_to_cpu(attr->size);
+ }
+
+ if (max_free < asize + list_reserve) {
+ /* Impossible to insert this attribute into primary record. */
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Start real attribute moving. */
+ attr = NULL;
+
+ for (;;) {
+ attr = mi_enum_attr(&ni->mi, attr);
+ if (!attr) {
+ /* We should never be here 'cause we have already check this case. */
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Skip attributes that MUST be primary record. */
+ if (attr->type == ATTR_STD || attr->type == ATTR_LIST)
+ continue;
+
+ le = NULL;
+ if (ni->attr_list.size) {
+ le = al_find_le(ni, NULL, attr);
+ if (!le) {
+ /* Really this is a serious bug. */
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ t32 = le32_to_cpu(attr->size);
+ t16 = le16_to_cpu(attr->name_off);
+ err = ni_ins_attr_ext(ni, le, attr->type, Add2Ptr(attr, t16),
+ attr->name_len, t32, attr_svcn(attr), t16,
+ false, &eattr, NULL, NULL);
+ if (err)
+ return err;
+
+ id = eattr->id;
+ memcpy(eattr, attr, t32);
+ eattr->id = id;
+
+ /* Remove from primary record. */
+ mi_remove_attr(NULL, &ni->mi, attr);
+
+ /* attr now points to next attribute. */
+ if (attr->type == ATTR_END)
+ goto out;
+ }
+ while (asize + list_reserve > sbi->record_size - le32_to_cpu(rec->used))
+ ;
+
+ attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len, asize,
+ name_off, svcn, ins_le);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out;
+ }
+
+ if (ins_attr)
+ *ins_attr = attr;
+ if (ins_mi)
+ *ins_mi = &ni->mi;
+
+out:
+ return err;
+}
+
+/* ni_expand_mft_list - Split ATTR_DATA of $MFT. */
+static int ni_expand_mft_list(struct ntfs_inode *ni)
+{
+ int err = 0;
+ struct runs_tree *run = &ni->file.run;
+ u32 asize, run_size, done = 0;
+ struct ATTRIB *attr;
+ struct rb_node *node;
+ CLST mft_min, mft_new, svcn, evcn, plen;
+ struct mft_inode *mi, *mi_min, *mi_new;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+
+ /* Find the nearest MFT. */
+ mft_min = 0;
+ mft_new = 0;
+ mi_min = NULL;
+
+ for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) {
+ mi = rb_entry(node, struct mft_inode, node);
+
+ attr = mi_enum_attr(mi, NULL);
+
+ if (!attr) {
+ mft_min = mi->rno;
+ mi_min = mi;
+ break;
+ }
+ }
+
+ if (ntfs_look_free_mft(sbi, &mft_new, true, ni, &mi_new)) {
+ mft_new = 0;
+ /* Really this is not critical. */
+ } else if (mft_min > mft_new) {
+ mft_min = mft_new;
+ mi_min = mi_new;
+ } else {
+ ntfs_mark_rec_free(sbi, mft_new, true);
+ mft_new = 0;
+ ni_remove_mi(ni, mi_new);
+ }
+
+ attr = mi_find_attr(&ni->mi, NULL, ATTR_DATA, NULL, 0, NULL);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ asize = le32_to_cpu(attr->size);
+
+ evcn = le64_to_cpu(attr->nres.evcn);
+ svcn = bytes_to_cluster(sbi, (u64)(mft_min + 1) << sbi->record_bits);
+ if (evcn + 1 >= svcn) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Split primary attribute [0 evcn] in two parts [0 svcn) + [svcn evcn].
+ *
+ * Update first part of ATTR_DATA in 'primary MFT.
+ */
+ err = run_pack(run, 0, svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT),
+ asize - SIZEOF_NONRESIDENT, &plen);
+ if (err < 0)
+ goto out;
+
+ run_size = ALIGN(err, 8);
+ err = 0;
+
+ if (plen < svcn) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ attr->nres.evcn = cpu_to_le64(svcn - 1);
+ attr->size = cpu_to_le32(run_size + SIZEOF_NONRESIDENT);
+ /* 'done' - How many bytes of primary MFT becomes free. */
+ done = asize - run_size - SIZEOF_NONRESIDENT;
+ le32_sub_cpu(&ni->mi.mrec->used, done);
+
+ /* Estimate packed size (run_buf=NULL). */
+ err = run_pack(run, svcn, evcn + 1 - svcn, NULL, sbi->record_size,
+ &plen);
+ if (err < 0)
+ goto out;
+
+ run_size = ALIGN(err, 8);
+ err = 0;
+
+ if (plen < evcn + 1 - svcn) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * This function may implicitly call expand attr_list.
+ * Insert second part of ATTR_DATA in 'mi_min'.
+ */
+ attr = ni_ins_new_attr(ni, mi_min, NULL, ATTR_DATA, NULL, 0,
+ SIZEOF_NONRESIDENT + run_size,
+ SIZEOF_NONRESIDENT, svcn, NULL);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (IS_ERR(attr)) {
+ err = PTR_ERR(attr);
+ goto out;
+ }
+
+ attr->non_res = 1;
+ attr->name_off = SIZEOF_NONRESIDENT_LE;
+ attr->flags = 0;
+
+ /* This function can't fail - cause already checked above. */
+ run_pack(run, svcn, evcn + 1 - svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT),
+ run_size, &plen);
+
+ attr->nres.svcn = cpu_to_le64(svcn);
+ attr->nres.evcn = cpu_to_le64(evcn);
+ attr->nres.run_off = cpu_to_le16(SIZEOF_NONRESIDENT);
+
+out:
+ if (mft_new) {
+ ntfs_mark_rec_free(sbi, mft_new, true);
+ ni_remove_mi(ni, mi_new);
+ }
+
+ return !err && !done ? -EOPNOTSUPP : err;
+}
+
+/*
+ * ni_expand_list - Move all possible attributes out of primary record.
+ */
+int ni_expand_list(struct ntfs_inode *ni)
+{
+ int err = 0;
+ u32 asize, done = 0;
+ struct ATTRIB *attr, *ins_attr;
+ struct ATTR_LIST_ENTRY *le;
+ bool is_mft = ni->mi.rno == MFT_REC_MFT;
+ struct MFT_REF ref;
+
+ mi_get_ref(&ni->mi, &ref);
+ le = NULL;
+
+ while ((le = al_enumerate(ni, le))) {
+ if (le->type == ATTR_STD)
+ continue;
+
+ if (memcmp(&ref, &le->ref, sizeof(struct MFT_REF)))
+ continue;
+
+ if (is_mft && le->type == ATTR_DATA)
+ continue;
+
+ /* Find attribute in primary record. */
+ attr = rec_find_attr_le(&ni->mi, le);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ asize = le32_to_cpu(attr->size);
+
+ /* Always insert into new record to avoid collisions (deep recursive). */
+ err = ni_ins_attr_ext(ni, le, attr->type, attr_name(attr),
+ attr->name_len, asize, attr_svcn(attr),
+ le16_to_cpu(attr->name_off), true,
+ &ins_attr, NULL, NULL);
+
+ if (err)
+ goto out;
+
+ memcpy(ins_attr, attr, asize);
+ ins_attr->id = le->id;
+ /* Remove from primary record. */
+ mi_remove_attr(NULL, &ni->mi, attr);
+
+ done += asize;
+ goto out;
+ }
+
+ if (!is_mft) {
+ err = -EFBIG; /* Attr list is too big(?) */
+ goto out;
+ }
+
+ /* Split MFT data as much as possible. */
+ err = ni_expand_mft_list(ni);
+
+out:
+ return !err && !done ? -EOPNOTSUPP : err;
+}
+
+/*
+ * ni_insert_nonresident - Insert new nonresident attribute.
+ */
+int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len,
+ const struct runs_tree *run, CLST svcn, CLST len,
+ __le16 flags, struct ATTRIB **new_attr,
+ struct mft_inode **mi, struct ATTR_LIST_ENTRY **le)
+{
+ int err;
+ CLST plen;
+ struct ATTRIB *attr;
+ bool is_ext =
+ (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) && !svcn;
+ u32 name_size = ALIGN(name_len * sizeof(short), 8);
+ u32 name_off = is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT;
+ u32 run_off = name_off + name_size;
+ u32 run_size, asize;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+
+ /* Estimate packed size (run_buf=NULL). */
+ err = run_pack(run, svcn, len, NULL, sbi->max_bytes_per_attr - run_off,
+ &plen);
+ if (err < 0)
+ goto out;
+
+ run_size = ALIGN(err, 8);
+
+ if (plen < len) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ asize = run_off + run_size;
+
+ if (asize > sbi->max_bytes_per_attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = ni_insert_attr(ni, type, name, name_len, asize, name_off, svcn,
+ &attr, mi, le);
+
+ if (err)
+ goto out;
+
+ attr->non_res = 1;
+ attr->name_off = cpu_to_le16(name_off);
+ attr->flags = flags;
+
+ /* This function can't fail - cause already checked above. */
+ run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size, &plen);
+
+ attr->nres.svcn = cpu_to_le64(svcn);
+ attr->nres.evcn = cpu_to_le64((u64)svcn + len - 1);
+
+ if (new_attr)
+ *new_attr = attr;
+
+ *(__le64 *)&attr->nres.run_off = cpu_to_le64(run_off);
+
+ attr->nres.alloc_size =
+ svcn ? 0 : cpu_to_le64((u64)len << ni->mi.sbi->cluster_bits);
+ attr->nres.data_size = attr->nres.alloc_size;
+ attr->nres.valid_size = attr->nres.alloc_size;
+
+ if (is_ext) {
+ if (flags & ATTR_FLAG_COMPRESSED)
+ attr->nres.c_unit = COMPRESSION_UNIT;
+ attr->nres.total_size = attr->nres.alloc_size;
+ }
+
+out:
+ return err;
+}
+
+/*
+ * ni_insert_resident - Inserts new resident attribute.
+ */
+int ni_insert_resident(struct ntfs_inode *ni, u32 data_size,
+ enum ATTR_TYPE type, const __le16 *name, u8 name_len,
+ struct ATTRIB **new_attr, struct mft_inode **mi,
+ struct ATTR_LIST_ENTRY **le)
+{
+ int err;
+ u32 name_size = ALIGN(name_len * sizeof(short), 8);
+ u32 asize = SIZEOF_RESIDENT + name_size + ALIGN(data_size, 8);
+ struct ATTRIB *attr;
+
+ err = ni_insert_attr(ni, type, name, name_len, asize, SIZEOF_RESIDENT,
+ 0, &attr, mi, le);
+ if (err)
+ return err;
+
+ attr->non_res = 0;
+ attr->flags = 0;
+
+ attr->res.data_size = cpu_to_le32(data_size);
+ attr->res.data_off = cpu_to_le16(SIZEOF_RESIDENT + name_size);
+ if (type == ATTR_NAME) {
+ attr->res.flags = RESIDENT_FLAG_INDEXED;
+
+ /* is_attr_indexed(attr)) == true */
+ le16_add_cpu(&ni->mi.mrec->hard_links, 1);
+ ni->mi.dirty = true;
+ }
+ attr->res.res = 0;
+
+ if (new_attr)
+ *new_attr = attr;
+
+ return 0;
+}
+
+/*
+ * ni_remove_attr_le - Remove attribute from record.
+ */
+void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct mft_inode *mi, struct ATTR_LIST_ENTRY *le)
+{
+ mi_remove_attr(ni, mi, attr);
+
+ if (le)
+ al_remove_le(ni, le);
+}
+
+/*
+ * ni_delete_all - Remove all attributes and frees allocates space.
+ *
+ * ntfs_evict_inode->ntfs_clear_inode->ni_delete_all (if no links).
+ */
+int ni_delete_all(struct ntfs_inode *ni)
+{
+ int err;
+ struct ATTR_LIST_ENTRY *le = NULL;
+ struct ATTRIB *attr = NULL;
+ struct rb_node *node;
+ u16 roff;
+ u32 asize;
+ CLST svcn, evcn;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ bool nt3 = is_ntfs3(sbi);
+ struct MFT_REF ref;
+
+ while ((attr = ni_enum_attr_ex(ni, attr, &le, NULL))) {
+ if (!nt3 || attr->name_len) {
+ ;
+ } else if (attr->type == ATTR_REPARSE) {
+ mi_get_ref(&ni->mi, &ref);
+ ntfs_remove_reparse(sbi, 0, &ref);
+ } else if (attr->type == ATTR_ID && !attr->non_res &&
+ le32_to_cpu(attr->res.data_size) >=
+ sizeof(struct GUID)) {
+ ntfs_objid_remove(sbi, resident_data(attr));
+ }
+
+ if (!attr->non_res)
+ continue;
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+
+ if (evcn + 1 <= svcn)
+ continue;
+
+ asize = le32_to_cpu(attr->size);
+ roff = le16_to_cpu(attr->nres.run_off);
+
+ if (roff > asize)
+ return -EINVAL;
+
+ /* run==1 means unpack and deallocate. */
+ run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn,
+ Add2Ptr(attr, roff), asize - roff);
+ }
+
+ if (ni->attr_list.size) {
+ run_deallocate(ni->mi.sbi, &ni->attr_list.run, true);
+ al_destroy(ni);
+ }
+
+ /* Free all subrecords. */
+ for (node = rb_first(&ni->mi_tree); node;) {
+ struct rb_node *next = rb_next(node);
+ struct mft_inode *mi = rb_entry(node, struct mft_inode, node);
+
+ clear_rec_inuse(mi->mrec);
+ mi->dirty = true;
+ mi_write(mi, 0);
+
+ ntfs_mark_rec_free(sbi, mi->rno, false);
+ ni_remove_mi(ni, mi);
+ mi_put(mi);
+ node = next;
+ }
+
+ /* Free base record. */
+ clear_rec_inuse(ni->mi.mrec);
+ ni->mi.dirty = true;
+ err = mi_write(&ni->mi, 0);
+
+ ntfs_mark_rec_free(sbi, ni->mi.rno, false);
+
+ return err;
+}
+
+/* ni_fname_name
+ *
+ * Return: File name attribute by its value.
+ */
+struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni,
+ const struct cpu_str *uni,
+ const struct MFT_REF *home_dir,
+ struct mft_inode **mi,
+ struct ATTR_LIST_ENTRY **le)
+{
+ struct ATTRIB *attr = NULL;
+ struct ATTR_FILE_NAME *fname;
+
+ if (le)
+ *le = NULL;
+
+ /* Enumerate all names. */
+next:
+ attr = ni_find_attr(ni, attr, le, ATTR_NAME, NULL, 0, NULL, mi);
+ if (!attr)
+ return NULL;
+
+ fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
+ if (!fname)
+ goto next;
+
+ if (home_dir && memcmp(home_dir, &fname->home, sizeof(*home_dir)))
+ goto next;
+
+ if (!uni)
+ return fname;
+
+ if (uni->len != fname->name_len)
+ goto next;
+
+ if (ntfs_cmp_names_cpu(uni, (struct le_str *)&fname->name_len, NULL,
+ false))
+ goto next;
+
+ return fname;
+}
+
+/*
+ * ni_fname_type
+ *
+ * Return: File name attribute with given type.
+ */
+struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type,
+ struct mft_inode **mi,
+ struct ATTR_LIST_ENTRY **le)
+{
+ struct ATTRIB *attr = NULL;
+ struct ATTR_FILE_NAME *fname;
+
+ *le = NULL;
+
+ if (name_type == FILE_NAME_POSIX)
+ return NULL;
+
+ /* Enumerate all names. */
+ for (;;) {
+ attr = ni_find_attr(ni, attr, le, ATTR_NAME, NULL, 0, NULL, mi);
+ if (!attr)
+ return NULL;
+
+ fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
+ if (fname && name_type == fname->type)
+ return fname;
+ }
+}
+
+/*
+ * ni_new_attr_flags
+ *
+ * Process compressed/sparsed in special way.
+ * NOTE: You need to set ni->std_fa = new_fa
+ * after this function to keep internal structures in consistency.
+ */
+int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa)
+{
+ struct ATTRIB *attr;
+ struct mft_inode *mi;
+ __le16 new_aflags;
+ u32 new_asize;
+
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi);
+ if (!attr)
+ return -EINVAL;
+
+ new_aflags = attr->flags;
+
+ if (new_fa & FILE_ATTRIBUTE_SPARSE_FILE)
+ new_aflags |= ATTR_FLAG_SPARSED;
+ else
+ new_aflags &= ~ATTR_FLAG_SPARSED;
+
+ if (new_fa & FILE_ATTRIBUTE_COMPRESSED)
+ new_aflags |= ATTR_FLAG_COMPRESSED;
+ else
+ new_aflags &= ~ATTR_FLAG_COMPRESSED;
+
+ if (new_aflags == attr->flags)
+ return 0;
+
+ if ((new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ==
+ (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) {
+ ntfs_inode_warn(&ni->vfs_inode,
+ "file can't be sparsed and compressed");
+ return -EOPNOTSUPP;
+ }
+
+ if (!attr->non_res)
+ goto out;
+
+ if (attr->nres.data_size) {
+ ntfs_inode_warn(
+ &ni->vfs_inode,
+ "one can change sparsed/compressed only for empty files");
+ return -EOPNOTSUPP;
+ }
+
+ /* Resize nonresident empty attribute in-place only. */
+ new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED))
+ ? (SIZEOF_NONRESIDENT_EX + 8)
+ : (SIZEOF_NONRESIDENT + 8);
+
+ if (!mi_resize_attr(mi, attr, new_asize - le32_to_cpu(attr->size)))
+ return -EOPNOTSUPP;
+
+ if (new_aflags & ATTR_FLAG_SPARSED) {
+ attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
+ /* Windows uses 16 clusters per frame but supports one cluster per frame too. */
+ attr->nres.c_unit = 0;
+ ni->vfs_inode.i_mapping->a_ops = &ntfs_aops;
+ } else if (new_aflags & ATTR_FLAG_COMPRESSED) {
+ attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
+ /* The only allowed: 16 clusters per frame. */
+ attr->nres.c_unit = NTFS_LZNT_CUNIT;
+ ni->vfs_inode.i_mapping->a_ops = &ntfs_aops_cmpr;
+ } else {
+ attr->name_off = SIZEOF_NONRESIDENT_LE;
+ /* Normal files. */
+ attr->nres.c_unit = 0;
+ ni->vfs_inode.i_mapping->a_ops = &ntfs_aops;
+ }
+ attr->nres.run_off = attr->name_off;
+out:
+ attr->flags = new_aflags;
+ mi->dirty = true;
+
+ return 0;
+}
+
+/*
+ * ni_parse_reparse
+ *
+ * buffer - memory for reparse buffer header
+ */
+enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct REPARSE_DATA_BUFFER *buffer)
+{
+ const struct REPARSE_DATA_BUFFER *rp = NULL;
+ u8 bits;
+ u16 len;
+ typeof(rp->CompressReparseBuffer) *cmpr;
+
+ /* Try to estimate reparse point. */
+ if (!attr->non_res) {
+ rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER));
+ } else if (le64_to_cpu(attr->nres.data_size) >=
+ sizeof(struct REPARSE_DATA_BUFFER)) {
+ struct runs_tree run;
+
+ run_init(&run);
+
+ if (!attr_load_runs_vcn(ni, ATTR_REPARSE, NULL, 0, &run, 0) &&
+ !ntfs_read_run_nb(ni->mi.sbi, &run, 0, buffer,
+ sizeof(struct REPARSE_DATA_BUFFER),
+ NULL)) {
+ rp = buffer;
+ }
+
+ run_close(&run);
+ }
+
+ if (!rp)
+ return REPARSE_NONE;
+
+ len = le16_to_cpu(rp->ReparseDataLength);
+ switch (rp->ReparseTag) {
+ case (IO_REPARSE_TAG_MICROSOFT | IO_REPARSE_TAG_SYMBOLIC_LINK):
+ break; /* Symbolic link. */
+ case IO_REPARSE_TAG_MOUNT_POINT:
+ break; /* Mount points and junctions. */
+ case IO_REPARSE_TAG_SYMLINK:
+ break;
+ case IO_REPARSE_TAG_COMPRESS:
+ /*
+ * WOF - Windows Overlay Filter - Used to compress files with
+ * LZX/Xpress.
+ *
+ * Unlike native NTFS file compression, the Windows
+ * Overlay Filter supports only read operations. This means
+ * that it doesn't need to sector-align each compressed chunk,
+ * so the compressed data can be packed more tightly together.
+ * If you open the file for writing, the WOF just decompresses
+ * the entire file, turning it back into a plain file.
+ *
+ * Ntfs3 driver decompresses the entire file only on write or
+ * change size requests.
+ */
+
+ cmpr = &rp->CompressReparseBuffer;
+ if (len < sizeof(*cmpr) ||
+ cmpr->WofVersion != WOF_CURRENT_VERSION ||
+ cmpr->WofProvider != WOF_PROVIDER_SYSTEM ||
+ cmpr->ProviderVer != WOF_PROVIDER_CURRENT_VERSION) {
+ return REPARSE_NONE;
+ }
+
+ switch (cmpr->CompressionFormat) {
+ case WOF_COMPRESSION_XPRESS4K:
+ bits = 0xc; // 4k
+ break;
+ case WOF_COMPRESSION_XPRESS8K:
+ bits = 0xd; // 8k
+ break;
+ case WOF_COMPRESSION_XPRESS16K:
+ bits = 0xe; // 16k
+ break;
+ case WOF_COMPRESSION_LZX32K:
+ bits = 0xf; // 32k
+ break;
+ default:
+ bits = 0x10; // 64k
+ break;
+ }
+ ni_set_ext_compress_bits(ni, bits);
+ return REPARSE_COMPRESSED;
+
+ case IO_REPARSE_TAG_DEDUP:
+ ni->ni_flags |= NI_FLAG_DEDUPLICATED;
+ return REPARSE_DEDUPLICATED;
+
+ default:
+ if (rp->ReparseTag & IO_REPARSE_TAG_NAME_SURROGATE)
+ break;
+
+ return REPARSE_NONE;
+ }
+
+ if (buffer != rp)
+ memcpy(buffer, rp, sizeof(struct REPARSE_DATA_BUFFER));
+
+ /* Looks like normal symlink. */
+ return REPARSE_LINK;
+}
+
+/*
+ * ni_fiemap - Helper for file_fiemap().
+ *
+ * Assumed ni_lock.
+ * TODO: Less aggressive locks.
+ */
+int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
+ __u64 vbo, __u64 len)
+{
+ int err = 0;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ u8 cluster_bits = sbi->cluster_bits;
+ struct runs_tree *run;
+ struct rw_semaphore *run_lock;
+ struct ATTRIB *attr;
+ CLST vcn = vbo >> cluster_bits;
+ CLST lcn, clen;
+ u64 valid = ni->i_valid;
+ u64 lbo, bytes;
+ u64 end, alloc_size;
+ size_t idx = -1;
+ u32 flags;
+ bool ok;
+
+ if (S_ISDIR(ni->vfs_inode.i_mode)) {
+ run = &ni->dir.alloc_run;
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, I30_NAME,
+ ARRAY_SIZE(I30_NAME), NULL, NULL);
+ run_lock = &ni->dir.run_lock;
+ } else {
+ run = &ni->file.run;
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL,
+ NULL);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+ if (is_attr_compressed(attr)) {
+ /* Unfortunately cp -r incorrectly treats compressed clusters. */
+ err = -EOPNOTSUPP;
+ ntfs_inode_warn(
+ &ni->vfs_inode,
+ "fiemap is not supported for compressed file (cp -r)");
+ goto out;
+ }
+ run_lock = &ni->file.run_lock;
+ }
+
+ if (!attr || !attr->non_res) {
+ err = fiemap_fill_next_extent(
+ fieinfo, 0, 0,
+ attr ? le32_to_cpu(attr->res.data_size) : 0,
+ FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST |
+ FIEMAP_EXTENT_MERGED);
+ goto out;
+ }
+
+ end = vbo + len;
+ alloc_size = le64_to_cpu(attr->nres.alloc_size);
+ if (end > alloc_size)
+ end = alloc_size;
+
+ down_read(run_lock);
+
+ while (vbo < end) {
+ if (idx == -1) {
+ ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx);
+ } else {
+ CLST vcn_next = vcn;
+
+ ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) &&
+ vcn == vcn_next;
+ if (!ok)
+ vcn = vcn_next;
+ }
+
+ if (!ok) {
+ up_read(run_lock);
+ down_write(run_lock);
+
+ err = attr_load_runs_vcn(ni, attr->type,
+ attr_name(attr),
+ attr->name_len, run, vcn);
+
+ up_write(run_lock);
+ down_read(run_lock);
+
+ if (err)
+ break;
+
+ ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx);
+
+ if (!ok) {
+ err = -EINVAL;
+ break;
+ }
+ }
+
+ if (!clen) {
+ err = -EINVAL; // ?
+ break;
+ }
+
+ if (lcn == SPARSE_LCN) {
+ vcn += clen;
+ vbo = (u64)vcn << cluster_bits;
+ continue;
+ }
+
+ flags = FIEMAP_EXTENT_MERGED;
+ if (S_ISDIR(ni->vfs_inode.i_mode)) {
+ ;
+ } else if (is_attr_compressed(attr)) {
+ CLST clst_data;
+
+ err = attr_is_frame_compressed(
+ ni, attr, vcn >> attr->nres.c_unit, &clst_data);
+ if (err)
+ break;
+ if (clst_data < NTFS_LZNT_CLUSTERS)
+ flags |= FIEMAP_EXTENT_ENCODED;
+ } else if (is_attr_encrypted(attr)) {
+ flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
+ }
+
+ vbo = (u64)vcn << cluster_bits;
+ bytes = (u64)clen << cluster_bits;
+ lbo = (u64)lcn << cluster_bits;
+
+ vcn += clen;
+
+ if (vbo + bytes >= end)
+ bytes = end - vbo;
+
+ if (vbo + bytes <= valid) {
+ ;
+ } else if (vbo >= valid) {
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+ } else {
+ /* vbo < valid && valid < vbo + bytes */
+ u64 dlen = valid - vbo;
+
+ if (vbo + dlen >= end)
+ flags |= FIEMAP_EXTENT_LAST;
+
+ err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen,
+ flags);
+ if (err < 0)
+ break;
+ if (err == 1) {
+ err = 0;
+ break;
+ }
+
+ vbo = valid;
+ bytes -= dlen;
+ if (!bytes)
+ continue;
+
+ lbo += dlen;
+ flags |= FIEMAP_EXTENT_UNWRITTEN;
+ }
+
+ if (vbo + bytes >= end)
+ flags |= FIEMAP_EXTENT_LAST;
+
+ err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags);
+ if (err < 0)
+ break;
+ if (err == 1) {
+ err = 0;
+ break;
+ }
+
+ vbo += bytes;
+ }
+
+ up_read(run_lock);
+
+out:
+ return err;
+}
+
+/*
+ * ni_readpage_cmpr
+ *
+ * When decompressing, we typically obtain more than one page per reference.
+ * We inject the additional pages into the page cache.
+ */
+int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page)
+{
+ int err;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct address_space *mapping = page->mapping;
+ pgoff_t index = page->index;
+ u64 frame_vbo, vbo = (u64)index << PAGE_SHIFT;
+ struct page **pages = NULL; /* Array of at most 16 pages. stack? */
+ u8 frame_bits;
+ CLST frame;
+ u32 i, idx, frame_size, pages_per_frame;
+ gfp_t gfp_mask;
+ struct page *pg;
+
+ if (vbo >= ni->vfs_inode.i_size) {
+ SetPageUptodate(page);
+ err = 0;
+ goto out;
+ }
+
+ if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) {
+ /* Xpress or LZX. */
+ frame_bits = ni_ext_compress_bits(ni);
+ } else {
+ /* LZNT compression. */
+ frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
+ }
+ frame_size = 1u << frame_bits;
+ frame = vbo >> frame_bits;
+ frame_vbo = (u64)frame << frame_bits;
+ idx = (vbo - frame_vbo) >> PAGE_SHIFT;
+
+ pages_per_frame = frame_size >> PAGE_SHIFT;
+ pages = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS);
+ if (!pages) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ pages[idx] = page;
+ index = frame_vbo >> PAGE_SHIFT;
+ gfp_mask = mapping_gfp_mask(mapping);
+
+ for (i = 0; i < pages_per_frame; i++, index++) {
+ if (i == idx)
+ continue;
+
+ pg = find_or_create_page(mapping, index, gfp_mask);
+ if (!pg) {
+ err = -ENOMEM;
+ goto out1;
+ }
+ pages[i] = pg;
+ }
+
+ err = ni_read_frame(ni, frame_vbo, pages, pages_per_frame);
+
+out1:
+ if (err)
+ SetPageError(page);
+
+ for (i = 0; i < pages_per_frame; i++) {
+ pg = pages[i];
+ if (i == idx || !pg)
+ continue;
+ unlock_page(pg);
+ put_page(pg);
+ }
+
+out:
+ /* At this point, err contains 0 or -EIO depending on the "critical" page. */
+ kfree(pages);
+ unlock_page(page);
+
+ return err;
+}
+
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+/*
+ * ni_decompress_file - Decompress LZX/Xpress compressed file.
+ *
+ * Remove ATTR_DATA::WofCompressedData.
+ * Remove ATTR_REPARSE.
+ */
+int ni_decompress_file(struct ntfs_inode *ni)
+{
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct inode *inode = &ni->vfs_inode;
+ loff_t i_size = inode->i_size;
+ struct address_space *mapping = inode->i_mapping;
+ gfp_t gfp_mask = mapping_gfp_mask(mapping);
+ struct page **pages = NULL;
+ struct ATTR_LIST_ENTRY *le;
+ struct ATTRIB *attr;
+ CLST vcn, cend, lcn, clen, end;
+ pgoff_t index;
+ u64 vbo;
+ u8 frame_bits;
+ u32 i, frame_size, pages_per_frame, bytes;
+ struct mft_inode *mi;
+ int err;
+
+ /* Clusters for decompressed data. */
+ cend = bytes_to_cluster(sbi, i_size);
+
+ if (!i_size)
+ goto remove_wof;
+
+ /* Check in advance. */
+ if (cend > wnd_zeroes(&sbi->used.bitmap)) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ frame_bits = ni_ext_compress_bits(ni);
+ frame_size = 1u << frame_bits;
+ pages_per_frame = frame_size >> PAGE_SHIFT;
+ pages = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS);
+ if (!pages) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * Step 1: Decompress data and copy to new allocated clusters.
+ */
+ index = 0;
+ for (vbo = 0; vbo < i_size; vbo += bytes) {
+ u32 nr_pages;
+ bool new;
+
+ if (vbo + frame_size > i_size) {
+ bytes = i_size - vbo;
+ nr_pages = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else {
+ nr_pages = pages_per_frame;
+ bytes = frame_size;
+ }
+
+ end = bytes_to_cluster(sbi, vbo + bytes);
+
+ for (vcn = vbo >> sbi->cluster_bits; vcn < end; vcn += clen) {
+ err = attr_data_get_block(ni, vcn, cend - vcn, &lcn,
+ &clen, &new);
+ if (err)
+ goto out;
+ }
+
+ for (i = 0; i < pages_per_frame; i++, index++) {
+ struct page *pg;
+
+ pg = find_or_create_page(mapping, index, gfp_mask);
+ if (!pg) {
+ while (i--) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+ err = -ENOMEM;
+ goto out;
+ }
+ pages[i] = pg;
+ }
+
+ err = ni_read_frame(ni, vbo, pages, pages_per_frame);
+
+ if (!err) {
+ down_read(&ni->file.run_lock);
+ err = ntfs_bio_pages(sbi, &ni->file.run, pages,
+ nr_pages, vbo, bytes,
+ REQ_OP_WRITE);
+ up_read(&ni->file.run_lock);
+ }
+
+ for (i = 0; i < pages_per_frame; i++) {
+ unlock_page(pages[i]);
+ put_page(pages[i]);
+ }
+
+ if (err)
+ goto out;
+
+ cond_resched();
+ }
+
+remove_wof:
+ /*
+ * Step 2: Deallocate attributes ATTR_DATA::WofCompressedData
+ * and ATTR_REPARSE.
+ */
+ attr = NULL;
+ le = NULL;
+ while ((attr = ni_enum_attr_ex(ni, attr, &le, NULL))) {
+ CLST svcn, evcn;
+ u32 asize, roff;
+
+ if (attr->type == ATTR_REPARSE) {
+ struct MFT_REF ref;
+
+ mi_get_ref(&ni->mi, &ref);
+ ntfs_remove_reparse(sbi, 0, &ref);
+ }
+
+ if (!attr->non_res)
+ continue;
+
+ if (attr->type != ATTR_REPARSE &&
+ (attr->type != ATTR_DATA ||
+ attr->name_len != ARRAY_SIZE(WOF_NAME) ||
+ memcmp(attr_name(attr), WOF_NAME, sizeof(WOF_NAME))))
+ continue;
+
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+
+ if (evcn + 1 <= svcn)
+ continue;
+
+ asize = le32_to_cpu(attr->size);
+ roff = le16_to_cpu(attr->nres.run_off);
+
+ if (roff > asize) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*run==1 Means unpack and deallocate. */
+ run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn,
+ Add2Ptr(attr, roff), asize - roff);
+ }
+
+ /*
+ * Step 3: Remove attribute ATTR_DATA::WofCompressedData.
+ */
+ err = ni_remove_attr(ni, ATTR_DATA, WOF_NAME, ARRAY_SIZE(WOF_NAME),
+ false, NULL);
+ if (err)
+ goto out;
+
+ /*
+ * Step 4: Remove ATTR_REPARSE.
+ */
+ err = ni_remove_attr(ni, ATTR_REPARSE, NULL, 0, false, NULL);
+ if (err)
+ goto out;
+
+ /*
+ * Step 5: Remove sparse flag from data attribute.
+ */
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (attr->non_res && is_attr_sparsed(attr)) {
+ /* Sparsed attribute header is 8 bytes bigger than normal. */
+ struct MFT_REC *rec = mi->mrec;
+ u32 used = le32_to_cpu(rec->used);
+ u32 asize = le32_to_cpu(attr->size);
+ u16 roff = le16_to_cpu(attr->nres.run_off);
+ char *rbuf = Add2Ptr(attr, roff);
+
+ memmove(rbuf - 8, rbuf, used - PtrOffset(rec, rbuf));
+ attr->size = cpu_to_le32(asize - 8);
+ attr->flags &= ~ATTR_FLAG_SPARSED;
+ attr->nres.run_off = cpu_to_le16(roff - 8);
+ attr->nres.c_unit = 0;
+ rec->used = cpu_to_le32(used - 8);
+ mi->dirty = true;
+ ni->std_fa &= ~(FILE_ATTRIBUTE_SPARSE_FILE |
+ FILE_ATTRIBUTE_REPARSE_POINT);
+
+ mark_inode_dirty(inode);
+ }
+
+ /* Clear cached flag. */
+ ni->ni_flags &= ~NI_FLAG_COMPRESSED_MASK;
+ if (ni->file.offs_page) {
+ put_page(ni->file.offs_page);
+ ni->file.offs_page = NULL;
+ }
+ mapping->a_ops = &ntfs_aops;
+
+out:
+ kfree(pages);
+ if (err)
+ _ntfs_bad_inode(inode);
+
+ return err;
+}
+
+/*
+ * decompress_lzx_xpress - External compression LZX/Xpress.
+ */
+static int decompress_lzx_xpress(struct ntfs_sb_info *sbi, const char *cmpr,
+ size_t cmpr_size, void *unc, size_t unc_size,
+ u32 frame_size)
+{
+ int err;
+ void *ctx;
+
+ if (cmpr_size == unc_size) {
+ /* Frame not compressed. */
+ memcpy(unc, cmpr, unc_size);
+ return 0;
+ }
+
+ err = 0;
+ if (frame_size == 0x8000) {
+ mutex_lock(&sbi->compress.mtx_lzx);
+ /* LZX: Frame compressed. */
+ ctx = sbi->compress.lzx;
+ if (!ctx) {
+ /* Lazy initialize LZX decompress context. */
+ ctx = lzx_allocate_decompressor();
+ if (!ctx) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ sbi->compress.lzx = ctx;
+ }
+
+ if (lzx_decompress(ctx, cmpr, cmpr_size, unc, unc_size)) {
+ /* Treat all errors as "invalid argument". */
+ err = -EINVAL;
+ }
+out1:
+ mutex_unlock(&sbi->compress.mtx_lzx);
+ } else {
+ /* XPRESS: Frame compressed. */
+ mutex_lock(&sbi->compress.mtx_xpress);
+ ctx = sbi->compress.xpress;
+ if (!ctx) {
+ /* Lazy initialize Xpress decompress context. */
+ ctx = xpress_allocate_decompressor();
+ if (!ctx) {
+ err = -ENOMEM;
+ goto out2;
+ }
+
+ sbi->compress.xpress = ctx;
+ }
+
+ if (xpress_decompress(ctx, cmpr, cmpr_size, unc, unc_size)) {
+ /* Treat all errors as "invalid argument". */
+ err = -EINVAL;
+ }
+out2:
+ mutex_unlock(&sbi->compress.mtx_xpress);
+ }
+ return err;
+}
+#endif
+
+/*
+ * ni_read_frame
+ *
+ * Pages - Array of locked pages.
+ */
+int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
+ u32 pages_per_frame)
+{
+ int err;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ u8 cluster_bits = sbi->cluster_bits;
+ char *frame_ondisk = NULL;
+ char *frame_mem = NULL;
+ struct page **pages_disk = NULL;
+ struct ATTR_LIST_ENTRY *le = NULL;
+ struct runs_tree *run = &ni->file.run;
+ u64 valid_size = ni->i_valid;
+ u64 vbo_disk;
+ size_t unc_size;
+ u32 frame_size, i, npages_disk, ondisk_size;
+ struct page *pg;
+ struct ATTRIB *attr;
+ CLST frame, clst_data;
+
+ /*
+ * To simplify decompress algorithm do vmap for source
+ * and target pages.
+ */
+ for (i = 0; i < pages_per_frame; i++)
+ kmap(pages[i]);
+
+ frame_size = pages_per_frame << PAGE_SHIFT;
+ frame_mem = vmap(pages, pages_per_frame, VM_MAP, PAGE_KERNEL);
+ if (!frame_mem) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, NULL);
+ if (!attr) {
+ err = -ENOENT;
+ goto out1;
+ }
+
+ if (!attr->non_res) {
+ u32 data_size = le32_to_cpu(attr->res.data_size);
+
+ memset(frame_mem, 0, frame_size);
+ if (frame_vbo < data_size) {
+ ondisk_size = data_size - frame_vbo;
+ memcpy(frame_mem, resident_data(attr) + frame_vbo,
+ min(ondisk_size, frame_size));
+ }
+ err = 0;
+ goto out1;
+ }
+
+ if (frame_vbo >= valid_size) {
+ memset(frame_mem, 0, frame_size);
+ err = 0;
+ goto out1;
+ }
+
+ if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) {
+#ifndef CONFIG_NTFS3_LZX_XPRESS
+ err = -EOPNOTSUPP;
+ goto out1;
+#else
+ u32 frame_bits = ni_ext_compress_bits(ni);
+ u64 frame64 = frame_vbo >> frame_bits;
+ u64 frames, vbo_data;
+
+ if (frame_size != (1u << frame_bits)) {
+ err = -EINVAL;
+ goto out1;
+ }
+ switch (frame_size) {
+ case 0x1000:
+ case 0x2000:
+ case 0x4000:
+ case 0x8000:
+ break;
+ default:
+ /* Unknown compression. */
+ err = -EOPNOTSUPP;
+ goto out1;
+ }
+
+ attr = ni_find_attr(ni, attr, &le, ATTR_DATA, WOF_NAME,
+ ARRAY_SIZE(WOF_NAME), NULL, NULL);
+ if (!attr) {
+ ntfs_inode_err(
+ &ni->vfs_inode,
+ "external compressed file should contains data attribute \"WofCompressedData\"");
+ err = -EINVAL;
+ goto out1;
+ }
+
+ if (!attr->non_res) {
+ run = NULL;
+ } else {
+ run = run_alloc();
+ if (!run) {
+ err = -ENOMEM;
+ goto out1;
+ }
+ }
+
+ frames = (ni->vfs_inode.i_size - 1) >> frame_bits;
+
+ err = attr_wof_frame_info(ni, attr, run, frame64, frames,
+ frame_bits, &ondisk_size, &vbo_data);
+ if (err)
+ goto out2;
+
+ if (frame64 == frames) {
+ unc_size = 1 + ((ni->vfs_inode.i_size - 1) &
+ (frame_size - 1));
+ ondisk_size = attr_size(attr) - vbo_data;
+ } else {
+ unc_size = frame_size;
+ }
+
+ if (ondisk_size > frame_size) {
+ err = -EINVAL;
+ goto out2;
+ }
+
+ if (!attr->non_res) {
+ if (vbo_data + ondisk_size >
+ le32_to_cpu(attr->res.data_size)) {
+ err = -EINVAL;
+ goto out1;
+ }
+
+ err = decompress_lzx_xpress(
+ sbi, Add2Ptr(resident_data(attr), vbo_data),
+ ondisk_size, frame_mem, unc_size, frame_size);
+ goto out1;
+ }
+ vbo_disk = vbo_data;
+ /* Load all runs to read [vbo_disk-vbo_to). */
+ err = attr_load_runs_range(ni, ATTR_DATA, WOF_NAME,
+ ARRAY_SIZE(WOF_NAME), run, vbo_disk,
+ vbo_data + ondisk_size);
+ if (err)
+ goto out2;
+ npages_disk = (ondisk_size + (vbo_disk & (PAGE_SIZE - 1)) +
+ PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+#endif
+ } else if (is_attr_compressed(attr)) {
+ /* LZNT compression. */
+ if (sbi->cluster_size > NTFS_LZNT_MAX_CLUSTER) {
+ err = -EOPNOTSUPP;
+ goto out1;
+ }
+
+ if (attr->nres.c_unit != NTFS_LZNT_CUNIT) {
+ err = -EOPNOTSUPP;
+ goto out1;
+ }
+
+ down_write(&ni->file.run_lock);
+ run_truncate_around(run, le64_to_cpu(attr->nres.svcn));
+ frame = frame_vbo >> (cluster_bits + NTFS_LZNT_CUNIT);
+ err = attr_is_frame_compressed(ni, attr, frame, &clst_data);
+ up_write(&ni->file.run_lock);
+ if (err)
+ goto out1;
+
+ if (!clst_data) {
+ memset(frame_mem, 0, frame_size);
+ goto out1;
+ }
+
+ frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT;
+ ondisk_size = clst_data << cluster_bits;
+
+ if (clst_data >= NTFS_LZNT_CLUSTERS) {
+ /* Frame is not compressed. */
+ down_read(&ni->file.run_lock);
+ err = ntfs_bio_pages(sbi, run, pages, pages_per_frame,
+ frame_vbo, ondisk_size,
+ REQ_OP_READ);
+ up_read(&ni->file.run_lock);
+ goto out1;
+ }
+ vbo_disk = frame_vbo;
+ npages_disk = (ondisk_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ } else {
+ __builtin_unreachable();
+ err = -EINVAL;
+ goto out1;
+ }
+
+ pages_disk = kzalloc(npages_disk * sizeof(struct page *), GFP_NOFS);
+ if (!pages_disk) {
+ err = -ENOMEM;
+ goto out2;
+ }
+
+ for (i = 0; i < npages_disk; i++) {
+ pg = alloc_page(GFP_KERNEL);
+ if (!pg) {
+ err = -ENOMEM;
+ goto out3;
+ }
+ pages_disk[i] = pg;
+ lock_page(pg);
+ kmap(pg);
+ }
+
+ /* Read 'ondisk_size' bytes from disk. */
+ down_read(&ni->file.run_lock);
+ err = ntfs_bio_pages(sbi, run, pages_disk, npages_disk, vbo_disk,
+ ondisk_size, REQ_OP_READ);
+ up_read(&ni->file.run_lock);
+ if (err)
+ goto out3;
+
+ /*
+ * To simplify decompress algorithm do vmap for source and target pages.
+ */
+ frame_ondisk = vmap(pages_disk, npages_disk, VM_MAP, PAGE_KERNEL_RO);
+ if (!frame_ondisk) {
+ err = -ENOMEM;
+ goto out3;
+ }
+
+ /* Decompress: Frame_ondisk -> frame_mem. */
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+ if (run != &ni->file.run) {
+ /* LZX or XPRESS */
+ err = decompress_lzx_xpress(
+ sbi, frame_ondisk + (vbo_disk & (PAGE_SIZE - 1)),
+ ondisk_size, frame_mem, unc_size, frame_size);
+ } else
+#endif
+ {
+ /* LZNT - Native NTFS compression. */
+ unc_size = decompress_lznt(frame_ondisk, ondisk_size, frame_mem,
+ frame_size);
+ if ((ssize_t)unc_size < 0)
+ err = unc_size;
+ else if (!unc_size || unc_size > frame_size)
+ err = -EINVAL;
+ }
+ if (!err && valid_size < frame_vbo + frame_size) {
+ size_t ok = valid_size - frame_vbo;
+
+ memset(frame_mem + ok, 0, frame_size - ok);
+ }
+
+ vunmap(frame_ondisk);
+
+out3:
+ for (i = 0; i < npages_disk; i++) {
+ pg = pages_disk[i];
+ if (pg) {
+ kunmap(pg);
+ unlock_page(pg);
+ put_page(pg);
+ }
+ }
+ kfree(pages_disk);
+
+out2:
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+ if (run != &ni->file.run)
+ run_free(run);
+#endif
+out1:
+ vunmap(frame_mem);
+out:
+ for (i = 0; i < pages_per_frame; i++) {
+ pg = pages[i];
+ kunmap(pg);
+ ClearPageError(pg);
+ SetPageUptodate(pg);
+ }
+
+ return err;
+}
+
+/*
+ * ni_write_frame
+ *
+ * Pages - Array of locked pages.
+ */
+int ni_write_frame(struct ntfs_inode *ni, struct page **pages,
+ u32 pages_per_frame)
+{
+ int err;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
+ u32 frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT;
+ u64 frame_vbo = (u64)pages[0]->index << PAGE_SHIFT;
+ CLST frame = frame_vbo >> frame_bits;
+ char *frame_ondisk = NULL;
+ struct page **pages_disk = NULL;
+ struct ATTR_LIST_ENTRY *le = NULL;
+ char *frame_mem;
+ struct ATTRIB *attr;
+ struct mft_inode *mi;
+ u32 i;
+ struct page *pg;
+ size_t compr_size, ondisk_size;
+ struct lznt *lznt;
+
+ attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi);
+ if (!attr) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (WARN_ON(!is_attr_compressed(attr))) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (sbi->cluster_size > NTFS_LZNT_MAX_CLUSTER) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ if (!attr->non_res) {
+ down_write(&ni->file.run_lock);
+ err = attr_make_nonresident(ni, attr, le, mi,
+ le32_to_cpu(attr->res.data_size),
+ &ni->file.run, &attr, pages[0]);
+ up_write(&ni->file.run_lock);
+ if (err)
+ goto out;
+ }
+
+ if (attr->nres.c_unit != NTFS_LZNT_CUNIT) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ pages_disk = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS);
+ if (!pages_disk) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ for (i = 0; i < pages_per_frame; i++) {
+ pg = alloc_page(GFP_KERNEL);
+ if (!pg) {
+ err = -ENOMEM;
+ goto out1;
+ }
+ pages_disk[i] = pg;
+ lock_page(pg);
+ kmap(pg);
+ }
+
+ /* To simplify compress algorithm do vmap for source and target pages. */
+ frame_ondisk = vmap(pages_disk, pages_per_frame, VM_MAP, PAGE_KERNEL);
+ if (!frame_ondisk) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ for (i = 0; i < pages_per_frame; i++)
+ kmap(pages[i]);
+
+ /* Map in-memory frame for read-only. */
+ frame_mem = vmap(pages, pages_per_frame, VM_MAP, PAGE_KERNEL_RO);
+ if (!frame_mem) {
+ err = -ENOMEM;
+ goto out2;
+ }
+
+ mutex_lock(&sbi->compress.mtx_lznt);
+ lznt = NULL;
+ if (!sbi->compress.lznt) {
+ /*
+ * LZNT implements two levels of compression:
+ * 0 - Standard compression
+ * 1 - Best compression, requires a lot of cpu
+ * use mount option?
+ */
+ lznt = get_lznt_ctx(0);
+ if (!lznt) {
+ mutex_unlock(&sbi->compress.mtx_lznt);
+ err = -ENOMEM;
+ goto out3;
+ }
+
+ sbi->compress.lznt = lznt;
+ lznt = NULL;
+ }
+
+ /* Compress: frame_mem -> frame_ondisk */
+ compr_size = compress_lznt(frame_mem, frame_size, frame_ondisk,
+ frame_size, sbi->compress.lznt);
+ mutex_unlock(&sbi->compress.mtx_lznt);
+ kfree(lznt);
+
+ if (compr_size + sbi->cluster_size > frame_size) {
+ /* Frame is not compressed. */
+ compr_size = frame_size;
+ ondisk_size = frame_size;
+ } else if (compr_size) {
+ /* Frame is compressed. */
+ ondisk_size = ntfs_up_cluster(sbi, compr_size);
+ memset(frame_ondisk + compr_size, 0, ondisk_size - compr_size);
+ } else {
+ /* Frame is sparsed. */
+ ondisk_size = 0;
+ }
+
+ down_write(&ni->file.run_lock);
+ run_truncate_around(&ni->file.run, le64_to_cpu(attr->nres.svcn));
+ err = attr_allocate_frame(ni, frame, compr_size, ni->i_valid);
+ up_write(&ni->file.run_lock);
+ if (err)
+ goto out2;
+
+ if (!ondisk_size)
+ goto out2;
+
+ down_read(&ni->file.run_lock);
+ err = ntfs_bio_pages(sbi, &ni->file.run,
+ ondisk_size < frame_size ? pages_disk : pages,
+ pages_per_frame, frame_vbo, ondisk_size,
+ REQ_OP_WRITE);
+ up_read(&ni->file.run_lock);
+
+out3:
+ vunmap(frame_mem);
+
+out2:
+ for (i = 0; i < pages_per_frame; i++)
+ kunmap(pages[i]);
+
+ vunmap(frame_ondisk);
+out1:
+ for (i = 0; i < pages_per_frame; i++) {
+ pg = pages_disk[i];
+ if (pg) {
+ kunmap(pg);
+ unlock_page(pg);
+ put_page(pg);
+ }
+ }
+ kfree(pages_disk);
+out:
+ return err;
+}
+
+/*
+ * ni_remove_name - Removes name 'de' from MFT and from directory.
+ * 'de2' and 'undo_step' are used to restore MFT/dir, if error occurs.
+ */
+int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
+ struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step)
+{
+ int err;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1);
+ struct ATTR_FILE_NAME *fname;
+ struct ATTR_LIST_ENTRY *le;
+ struct mft_inode *mi;
+ u16 de_key_size = le16_to_cpu(de->key_size);
+ u8 name_type;
+
+ *undo_step = 0;
+
+ /* Find name in record. */
+ mi_get_ref(&dir_ni->mi, &de_name->home);
+
+ fname = ni_fname_name(ni, (struct cpu_str *)&de_name->name_len,
+ &de_name->home, &mi, &le);
+ if (!fname)
+ return -ENOENT;
+
+ memcpy(&de_name->dup, &fname->dup, sizeof(struct NTFS_DUP_INFO));
+ name_type = paired_name(fname->type);
+
+ /* Mark ntfs as dirty. It will be cleared at umount. */
+ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+
+ /* Step 1: Remove name from directory. */
+ err = indx_delete_entry(&dir_ni->dir, dir_ni, fname, de_key_size, sbi);
+ if (err)
+ return err;
+
+ /* Step 2: Remove name from MFT. */
+ ni_remove_attr_le(ni, attr_from_name(fname), mi, le);
+
+ *undo_step = 2;
+
+ /* Get paired name. */
+ fname = ni_fname_type(ni, name_type, &mi, &le);
+ if (fname) {
+ u16 de2_key_size = fname_full_size(fname);
+
+ *de2 = Add2Ptr(de, 1024);
+ (*de2)->key_size = cpu_to_le16(de2_key_size);
+
+ memcpy(*de2 + 1, fname, de2_key_size);
+
+ /* Step 3: Remove paired name from directory. */
+ err = indx_delete_entry(&dir_ni->dir, dir_ni, fname,
+ de2_key_size, sbi);
+ if (err)
+ return err;
+
+ /* Step 4: Remove paired name from MFT. */
+ ni_remove_attr_le(ni, attr_from_name(fname), mi, le);
+
+ *undo_step = 4;
+ }
+ return 0;
+}
+
+/*
+ * ni_remove_name_undo - Paired function for ni_remove_name.
+ *
+ * Return: True if ok
+ */
+bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
+ struct NTFS_DE *de, struct NTFS_DE *de2, int undo_step)
+{
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTRIB *attr;
+ u16 de_key_size = de2 ? le16_to_cpu(de2->key_size) : 0;
+
+ switch (undo_step) {
+ case 4:
+ if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0,
+ &attr, NULL, NULL)) {
+ return false;
+ }
+ memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de2 + 1, de_key_size);
+
+ mi_get_ref(&ni->mi, &de2->ref);
+ de2->size = cpu_to_le16(ALIGN(de_key_size, 8) +
+ sizeof(struct NTFS_DE));
+ de2->flags = 0;
+ de2->res = 0;
+
+ if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL,
+ 1)) {
+ return false;
+ }
+ fallthrough;
+
+ case 2:
+ de_key_size = le16_to_cpu(de->key_size);
+
+ if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0,
+ &attr, NULL, NULL)) {
+ return false;
+ }
+
+ memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size);
+ mi_get_ref(&ni->mi, &de->ref);
+
+ if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * ni_add_name - Add new name into MFT and into directory.
+ */
+int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
+ struct NTFS_DE *de)
+{
+ int err;
+ struct ATTRIB *attr;
+ struct ATTR_LIST_ENTRY *le;
+ struct mft_inode *mi;
+ struct ATTR_FILE_NAME *fname;
+ struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1);
+ u16 de_key_size = le16_to_cpu(de->key_size);
+
+ mi_get_ref(&ni->mi, &de->ref);
+ mi_get_ref(&dir_ni->mi, &de_name->home);
+
+ /* Fill duplicate from any ATTR_NAME. */
+ fname = ni_fname_name(ni, NULL, NULL, NULL, NULL);
+ if (fname)
+ memcpy(&de_name->dup, &fname->dup, sizeof(fname->dup));
+ de_name->dup.fa = ni->std_fa;
+
+ /* Insert new name into MFT. */
+ err = ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, &attr,
+ &mi, &le);
+ if (err)
+ return err;
+
+ memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size);
+
+ /* Insert new name into directory. */
+ err = indx_insert_entry(&dir_ni->dir, dir_ni, de, ni->mi.sbi, NULL, 0);
+ if (err)
+ ni_remove_attr_le(ni, attr, mi, le);
+
+ return err;
+}
+
+/*
+ * ni_rename - Remove one name and insert new name.
+ */
+int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
+ struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de,
+ bool *is_bad)
+{
+ int err;
+ struct NTFS_DE *de2 = NULL;
+ int undo = 0;
+
+ /*
+ * There are two possible ways to rename:
+ * 1) Add new name and remove old name.
+ * 2) Remove old name and add new name.
+ *
+ * In most cases (not all!) adding new name into MFT and into directory can
+ * allocate additional cluster(s).
+ * Second way may result to bad inode if we can't add new name
+ * and then can't restore (add) old name.
+ */
+
+ /*
+ * Way 1 - Add new + remove old.
+ */
+ err = ni_add_name(new_dir_ni, ni, new_de);
+ if (!err) {
+ err = ni_remove_name(dir_ni, ni, de, &de2, &undo);
+ if (err && ni_remove_name(new_dir_ni, ni, new_de, &de2, &undo))
+ *is_bad = true;
+ }
+
+ /*
+ * Way 2 - Remove old + add new.
+ */
+ /*
+ * err = ni_remove_name(dir_ni, ni, de, &de2, &undo);
+ * if (!err) {
+ * err = ni_add_name(new_dir_ni, ni, new_de);
+ * if (err && !ni_remove_name_undo(dir_ni, ni, de, de2, undo))
+ * *is_bad = true;
+ * }
+ */
+
+ return err;
+}
+
+/*
+ * ni_is_dirty - Return: True if 'ni' requires ni_write_inode.
+ */
+bool ni_is_dirty(struct inode *inode)
+{
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct rb_node *node;
+
+ if (ni->mi.dirty || ni->attr_list.dirty ||
+ (ni->ni_flags & NI_FLAG_UPDATE_PARENT))
+ return true;
+
+ for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) {
+ if (rb_entry(node, struct mft_inode, node)->dirty)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * ni_update_parent
+ *
+ * Update duplicate info of ATTR_FILE_NAME in MFT and in parent directories.
+ */
+static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup,
+ int sync)
+{
+ struct ATTRIB *attr;
+ struct mft_inode *mi;
+ struct ATTR_LIST_ENTRY *le = NULL;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct super_block *sb = sbi->sb;
+ bool re_dirty = false;
+
+ if (ni->mi.mrec->flags & RECORD_FLAG_DIR) {
+ dup->fa |= FILE_ATTRIBUTE_DIRECTORY;
+ attr = NULL;
+ dup->alloc_size = 0;
+ dup->data_size = 0;
+ } else {
+ dup->fa &= ~FILE_ATTRIBUTE_DIRECTORY;
+
+ attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL,
+ &mi);
+ if (!attr) {
+ dup->alloc_size = dup->data_size = 0;
+ } else if (!attr->non_res) {
+ u32 data_size = le32_to_cpu(attr->res.data_size);
+
+ dup->alloc_size = cpu_to_le64(ALIGN(data_size, 8));
+ dup->data_size = cpu_to_le64(data_size);
+ } else {
+ u64 new_valid = ni->i_valid;
+ u64 data_size = le64_to_cpu(attr->nres.data_size);
+ __le64 valid_le;
+
+ dup->alloc_size = is_attr_ext(attr)
+ ? attr->nres.total_size
+ : attr->nres.alloc_size;
+ dup->data_size = attr->nres.data_size;
+
+ if (new_valid > data_size)
+ new_valid = data_size;
+
+ valid_le = cpu_to_le64(new_valid);
+ if (valid_le != attr->nres.valid_size) {
+ attr->nres.valid_size = valid_le;
+ mi->dirty = true;
+ }
+ }
+ }
+
+ /* TODO: Fill reparse info. */
+ dup->reparse = 0;
+ dup->ea_size = 0;
+
+ if (ni->ni_flags & NI_FLAG_EA) {
+ attr = ni_find_attr(ni, attr, &le, ATTR_EA_INFO, NULL, 0, NULL,
+ NULL);
+ if (attr) {
+ const struct EA_INFO *info;
+
+ info = resident_data_ex(attr, sizeof(struct EA_INFO));
+ /* If ATTR_EA_INFO exists 'info' can't be NULL. */
+ if (info)
+ dup->ea_size = info->size_pack;
+ }
+ }
+
+ attr = NULL;
+ le = NULL;
+
+ while ((attr = ni_find_attr(ni, attr, &le, ATTR_NAME, NULL, 0, NULL,
+ &mi))) {
+ struct inode *dir;
+ struct ATTR_FILE_NAME *fname;
+
+ fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
+ if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup)))
+ continue;
+
+ /* Check simple case when parent inode equals current inode. */
+ if (ino_get(&fname->home) == ni->vfs_inode.i_ino) {
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ continue;
+ }
+
+ /* ntfs_iget5 may sleep. */
+ dir = ntfs_iget5(sb, &fname->home, NULL);
+ if (IS_ERR(dir)) {
+ ntfs_inode_warn(
+ &ni->vfs_inode,
+ "failed to open parent directory r=%lx to update",
+ (long)ino_get(&fname->home));
+ continue;
+ }
+
+ if (!is_bad_inode(dir)) {
+ struct ntfs_inode *dir_ni = ntfs_i(dir);
+
+ if (!ni_trylock(dir_ni)) {
+ re_dirty = true;
+ } else {
+ indx_update_dup(dir_ni, sbi, fname, dup, sync);
+ ni_unlock(dir_ni);
+ memcpy(&fname->dup, dup, sizeof(fname->dup));
+ mi->dirty = true;
+ }
+ }
+ iput(dir);
+ }
+
+ return re_dirty;
+}
+
+/*
+ * ni_write_inode - Write MFT base record and all subrecords to disk.
+ */
+int ni_write_inode(struct inode *inode, int sync, const char *hint)
+{
+ int err = 0, err2;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct super_block *sb = inode->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ bool re_dirty = false;
+ struct ATTR_STD_INFO *std;
+ struct rb_node *node, *next;
+ struct NTFS_DUP_INFO dup;
+
+ if (is_bad_inode(inode) || sb_rdonly(sb))
+ return 0;
+
+ if (!ni_trylock(ni)) {
+ /* 'ni' is under modification, skip for now. */
+ mark_inode_dirty_sync(inode);
+ return 0;
+ }
+
+ if (is_rec_inuse(ni->mi.mrec) &&
+ !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) {
+ bool modified = false;
+
+ /* Update times in standard attribute. */
+ std = ni_std(ni);
+ if (!std) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Update the access times if they have changed. */
+ dup.m_time = kernel2nt(&inode->i_mtime);
+ if (std->m_time != dup.m_time) {
+ std->m_time = dup.m_time;
+ modified = true;
+ }
+
+ dup.c_time = kernel2nt(&inode->i_ctime);
+ if (std->c_time != dup.c_time) {
+ std->c_time = dup.c_time;
+ modified = true;
+ }
+
+ dup.a_time = kernel2nt(&inode->i_atime);
+ if (std->a_time != dup.a_time) {
+ std->a_time = dup.a_time;
+ modified = true;
+ }
+
+ dup.fa = ni->std_fa;
+ if (std->fa != dup.fa) {
+ std->fa = dup.fa;
+ modified = true;
+ }
+
+ if (modified)
+ ni->mi.dirty = true;
+
+ if (!ntfs_is_meta_file(sbi, inode->i_ino) &&
+ (modified || (ni->ni_flags & NI_FLAG_UPDATE_PARENT))
+ /* Avoid __wait_on_freeing_inode(inode). */
+ && (sb->s_flags & SB_ACTIVE)) {
+ dup.cr_time = std->cr_time;
+ /* Not critical if this function fail. */
+ re_dirty = ni_update_parent(ni, &dup, sync);
+
+ if (re_dirty)
+ ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ else
+ ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT;
+ }
+
+ /* Update attribute list. */
+ if (ni->attr_list.size && ni->attr_list.dirty) {
+ if (inode->i_ino != MFT_REC_MFT || sync) {
+ err = ni_try_remove_attr_list(ni);
+ if (err)
+ goto out;
+ }
+
+ err = al_update(ni, sync);
+ if (err)
+ goto out;
+ }
+ }
+
+ for (node = rb_first(&ni->mi_tree); node; node = next) {
+ struct mft_inode *mi = rb_entry(node, struct mft_inode, node);
+ bool is_empty;
+
+ next = rb_next(node);
+
+ if (!mi->dirty)
+ continue;
+
+ is_empty = !mi_enum_attr(mi, NULL);
+
+ if (is_empty)
+ clear_rec_inuse(mi->mrec);
+
+ err2 = mi_write(mi, sync);
+ if (!err && err2)
+ err = err2;
+
+ if (is_empty) {
+ ntfs_mark_rec_free(sbi, mi->rno, false);
+ rb_erase(node, &ni->mi_tree);
+ mi_put(mi);
+ }
+ }
+
+ if (ni->mi.dirty) {
+ err2 = mi_write(&ni->mi, sync);
+ if (!err && err2)
+ err = err2;
+ }
+out:
+ ni_unlock(ni);
+
+ if (err) {
+ ntfs_err(sb, "%s r=%lx failed, %d.", hint, inode->i_ino, err);
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ return err;
+ }
+
+ if (re_dirty)
+ mark_inode_dirty_sync(inode);
+
+ return 0;
+}
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
new file mode 100644
index 000000000..710cb5aa5
--- /dev/null
+++ b/fs/ntfs3/fslog.c
@@ -0,0 +1,5210 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+/*
+ * LOG FILE structs
+ */
+
+// clang-format off
+
+#define MaxLogFileSize 0x100000000ull
+#define DefaultLogPageSize 4096
+#define MinLogRecordPages 0x30
+
+struct RESTART_HDR {
+ struct NTFS_RECORD_HEADER rhdr; // 'RSTR'
+ __le32 sys_page_size; // 0x10: Page size of the system which initialized the log.
+ __le32 page_size; // 0x14: Log page size used for this log file.
+ __le16 ra_off; // 0x18:
+ __le16 minor_ver; // 0x1A:
+ __le16 major_ver; // 0x1C:
+ __le16 fixups[];
+};
+
+#define LFS_NO_CLIENT 0xffff
+#define LFS_NO_CLIENT_LE cpu_to_le16(0xffff)
+
+struct CLIENT_REC {
+ __le64 oldest_lsn;
+ __le64 restart_lsn; // 0x08:
+ __le16 prev_client; // 0x10:
+ __le16 next_client; // 0x12:
+ __le16 seq_num; // 0x14:
+ u8 align[6]; // 0x16:
+ __le32 name_bytes; // 0x1C: In bytes.
+ __le16 name[32]; // 0x20: Name of client.
+};
+
+static_assert(sizeof(struct CLIENT_REC) == 0x60);
+
+/* Two copies of these will exist at the beginning of the log file */
+struct RESTART_AREA {
+ __le64 current_lsn; // 0x00: Current logical end of log file.
+ __le16 log_clients; // 0x08: Maximum number of clients.
+ __le16 client_idx[2]; // 0x0A: Free/use index into the client record arrays.
+ __le16 flags; // 0x0E: See RESTART_SINGLE_PAGE_IO.
+ __le32 seq_num_bits; // 0x10: The number of bits in sequence number.
+ __le16 ra_len; // 0x14:
+ __le16 client_off; // 0x16:
+ __le64 l_size; // 0x18: Usable log file size.
+ __le32 last_lsn_data_len; // 0x20:
+ __le16 rec_hdr_len; // 0x24: Log page data offset.
+ __le16 data_off; // 0x26: Log page data length.
+ __le32 open_log_count; // 0x28:
+ __le32 align[5]; // 0x2C:
+ struct CLIENT_REC clients[]; // 0x40:
+};
+
+struct LOG_REC_HDR {
+ __le16 redo_op; // 0x00: NTFS_LOG_OPERATION
+ __le16 undo_op; // 0x02: NTFS_LOG_OPERATION
+ __le16 redo_off; // 0x04: Offset to Redo record.
+ __le16 redo_len; // 0x06: Redo length.
+ __le16 undo_off; // 0x08: Offset to Undo record.
+ __le16 undo_len; // 0x0A: Undo length.
+ __le16 target_attr; // 0x0C:
+ __le16 lcns_follow; // 0x0E:
+ __le16 record_off; // 0x10:
+ __le16 attr_off; // 0x12:
+ __le16 cluster_off; // 0x14:
+ __le16 reserved; // 0x16:
+ __le64 target_vcn; // 0x18:
+ __le64 page_lcns[]; // 0x20:
+};
+
+static_assert(sizeof(struct LOG_REC_HDR) == 0x20);
+
+#define RESTART_ENTRY_ALLOCATED 0xFFFFFFFF
+#define RESTART_ENTRY_ALLOCATED_LE cpu_to_le32(0xFFFFFFFF)
+
+struct RESTART_TABLE {
+ __le16 size; // 0x00: In bytes
+ __le16 used; // 0x02: Entries
+ __le16 total; // 0x04: Entries
+ __le16 res[3]; // 0x06:
+ __le32 free_goal; // 0x0C:
+ __le32 first_free; // 0x10:
+ __le32 last_free; // 0x14:
+
+};
+
+static_assert(sizeof(struct RESTART_TABLE) == 0x18);
+
+struct ATTR_NAME_ENTRY {
+ __le16 off; // Offset in the Open attribute Table.
+ __le16 name_bytes;
+ __le16 name[];
+};
+
+struct OPEN_ATTR_ENRTY {
+ __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated
+ __le32 bytes_per_index; // 0x04:
+ enum ATTR_TYPE type; // 0x08:
+ u8 is_dirty_pages; // 0x0C:
+ u8 is_attr_name; // 0x0B: Faked field to manage 'ptr'
+ u8 name_len; // 0x0C: Faked field to manage 'ptr'
+ u8 res;
+ struct MFT_REF ref; // 0x10: File Reference of file containing attribute
+ __le64 open_record_lsn; // 0x18:
+ void *ptr; // 0x20:
+};
+
+/* 32 bit version of 'struct OPEN_ATTR_ENRTY' */
+struct OPEN_ATTR_ENRTY_32 {
+ __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated
+ __le32 ptr; // 0x04:
+ struct MFT_REF ref; // 0x08:
+ __le64 open_record_lsn; // 0x10:
+ u8 is_dirty_pages; // 0x18:
+ u8 is_attr_name; // 0x19:
+ u8 res1[2];
+ enum ATTR_TYPE type; // 0x1C:
+ u8 name_len; // 0x20: In wchar
+ u8 res2[3];
+ __le32 AttributeName; // 0x24:
+ __le32 bytes_per_index; // 0x28:
+};
+
+#define SIZEOF_OPENATTRIBUTEENTRY0 0x2c
+// static_assert( 0x2C == sizeof(struct OPEN_ATTR_ENRTY_32) );
+static_assert(sizeof(struct OPEN_ATTR_ENRTY) < SIZEOF_OPENATTRIBUTEENTRY0);
+
+/*
+ * One entry exists in the Dirty Pages Table for each page which is dirty at
+ * the time the Restart Area is written.
+ */
+struct DIR_PAGE_ENTRY {
+ __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated
+ __le32 target_attr; // 0x04: Index into the Open attribute Table
+ __le32 transfer_len; // 0x08:
+ __le32 lcns_follow; // 0x0C:
+ __le64 vcn; // 0x10: Vcn of dirty page
+ __le64 oldest_lsn; // 0x18:
+ __le64 page_lcns[]; // 0x20:
+};
+
+static_assert(sizeof(struct DIR_PAGE_ENTRY) == 0x20);
+
+/* 32 bit version of 'struct DIR_PAGE_ENTRY' */
+struct DIR_PAGE_ENTRY_32 {
+ __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated
+ __le32 target_attr; // 0x04: Index into the Open attribute Table
+ __le32 transfer_len; // 0x08:
+ __le32 lcns_follow; // 0x0C:
+ __le32 reserved; // 0x10:
+ __le32 vcn_low; // 0x14: Vcn of dirty page
+ __le32 vcn_hi; // 0x18: Vcn of dirty page
+ __le32 oldest_lsn_low; // 0x1C:
+ __le32 oldest_lsn_hi; // 0x1C:
+ __le32 page_lcns_low; // 0x24:
+ __le32 page_lcns_hi; // 0x24:
+};
+
+static_assert(offsetof(struct DIR_PAGE_ENTRY_32, vcn_low) == 0x14);
+static_assert(sizeof(struct DIR_PAGE_ENTRY_32) == 0x2c);
+
+enum transact_state {
+ TransactionUninitialized = 0,
+ TransactionActive,
+ TransactionPrepared,
+ TransactionCommitted
+};
+
+struct TRANSACTION_ENTRY {
+ __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated
+ u8 transact_state; // 0x04:
+ u8 reserved[3]; // 0x05:
+ __le64 first_lsn; // 0x08:
+ __le64 prev_lsn; // 0x10:
+ __le64 undo_next_lsn; // 0x18:
+ __le32 undo_records; // 0x20: Number of undo log records pending abort
+ __le32 undo_len; // 0x24: Total undo size
+};
+
+static_assert(sizeof(struct TRANSACTION_ENTRY) == 0x28);
+
+struct NTFS_RESTART {
+ __le32 major_ver; // 0x00:
+ __le32 minor_ver; // 0x04:
+ __le64 check_point_start; // 0x08:
+ __le64 open_attr_table_lsn; // 0x10:
+ __le64 attr_names_lsn; // 0x18:
+ __le64 dirty_pages_table_lsn; // 0x20:
+ __le64 transact_table_lsn; // 0x28:
+ __le32 open_attr_len; // 0x30: In bytes
+ __le32 attr_names_len; // 0x34: In bytes
+ __le32 dirty_pages_len; // 0x38: In bytes
+ __le32 transact_table_len; // 0x3C: In bytes
+};
+
+static_assert(sizeof(struct NTFS_RESTART) == 0x40);
+
+struct NEW_ATTRIBUTE_SIZES {
+ __le64 alloc_size;
+ __le64 valid_size;
+ __le64 data_size;
+ __le64 total_size;
+};
+
+struct BITMAP_RANGE {
+ __le32 bitmap_off;
+ __le32 bits;
+};
+
+struct LCN_RANGE {
+ __le64 lcn;
+ __le64 len;
+};
+
+/* The following type defines the different log record types. */
+#define LfsClientRecord cpu_to_le32(1)
+#define LfsClientRestart cpu_to_le32(2)
+
+/* This is used to uniquely identify a client for a particular log file. */
+struct CLIENT_ID {
+ __le16 seq_num;
+ __le16 client_idx;
+};
+
+/* This is the header that begins every Log Record in the log file. */
+struct LFS_RECORD_HDR {
+ __le64 this_lsn; // 0x00:
+ __le64 client_prev_lsn; // 0x08:
+ __le64 client_undo_next_lsn; // 0x10:
+ __le32 client_data_len; // 0x18:
+ struct CLIENT_ID client; // 0x1C: Owner of this log record.
+ __le32 record_type; // 0x20: LfsClientRecord or LfsClientRestart.
+ __le32 transact_id; // 0x24:
+ __le16 flags; // 0x28: LOG_RECORD_MULTI_PAGE
+ u8 align[6]; // 0x2A:
+};
+
+#define LOG_RECORD_MULTI_PAGE cpu_to_le16(1)
+
+static_assert(sizeof(struct LFS_RECORD_HDR) == 0x30);
+
+struct LFS_RECORD {
+ __le16 next_record_off; // 0x00: Offset of the free space in the page,
+ u8 align[6]; // 0x02:
+ __le64 last_end_lsn; // 0x08: lsn for the last log record which ends on the page,
+};
+
+static_assert(sizeof(struct LFS_RECORD) == 0x10);
+
+struct RECORD_PAGE_HDR {
+ struct NTFS_RECORD_HEADER rhdr; // 'RCRD'
+ __le32 rflags; // 0x10: See LOG_PAGE_LOG_RECORD_END
+ __le16 page_count; // 0x14:
+ __le16 page_pos; // 0x16:
+ struct LFS_RECORD record_hdr; // 0x18:
+ __le16 fixups[10]; // 0x28:
+ __le32 file_off; // 0x3c: Used when major version >= 2
+};
+
+// clang-format on
+
+// Page contains the end of a log record.
+#define LOG_PAGE_LOG_RECORD_END cpu_to_le32(0x00000001)
+
+static inline bool is_log_record_end(const struct RECORD_PAGE_HDR *hdr)
+{
+ return hdr->rflags & LOG_PAGE_LOG_RECORD_END;
+}
+
+static_assert(offsetof(struct RECORD_PAGE_HDR, file_off) == 0x3c);
+
+/*
+ * END of NTFS LOG structures
+ */
+
+/* Define some tuning parameters to keep the restart tables a reasonable size. */
+#define INITIAL_NUMBER_TRANSACTIONS 5
+
+enum NTFS_LOG_OPERATION {
+
+ Noop = 0x00,
+ CompensationLogRecord = 0x01,
+ InitializeFileRecordSegment = 0x02,
+ DeallocateFileRecordSegment = 0x03,
+ WriteEndOfFileRecordSegment = 0x04,
+ CreateAttribute = 0x05,
+ DeleteAttribute = 0x06,
+ UpdateResidentValue = 0x07,
+ UpdateNonresidentValue = 0x08,
+ UpdateMappingPairs = 0x09,
+ DeleteDirtyClusters = 0x0A,
+ SetNewAttributeSizes = 0x0B,
+ AddIndexEntryRoot = 0x0C,
+ DeleteIndexEntryRoot = 0x0D,
+ AddIndexEntryAllocation = 0x0E,
+ DeleteIndexEntryAllocation = 0x0F,
+ WriteEndOfIndexBuffer = 0x10,
+ SetIndexEntryVcnRoot = 0x11,
+ SetIndexEntryVcnAllocation = 0x12,
+ UpdateFileNameRoot = 0x13,
+ UpdateFileNameAllocation = 0x14,
+ SetBitsInNonresidentBitMap = 0x15,
+ ClearBitsInNonresidentBitMap = 0x16,
+ HotFix = 0x17,
+ EndTopLevelAction = 0x18,
+ PrepareTransaction = 0x19,
+ CommitTransaction = 0x1A,
+ ForgetTransaction = 0x1B,
+ OpenNonresidentAttribute = 0x1C,
+ OpenAttributeTableDump = 0x1D,
+ AttributeNamesDump = 0x1E,
+ DirtyPageTableDump = 0x1F,
+ TransactionTableDump = 0x20,
+ UpdateRecordDataRoot = 0x21,
+ UpdateRecordDataAllocation = 0x22,
+
+ UpdateRelativeDataInIndex =
+ 0x23, // NtOfsRestartUpdateRelativeDataInIndex
+ UpdateRelativeDataInIndex2 = 0x24,
+ ZeroEndOfFileRecord = 0x25,
+};
+
+/*
+ * Array for log records which require a target attribute.
+ * A true indicates that the corresponding restart operation
+ * requires a target attribute.
+ */
+static const u8 AttributeRequired[] = {
+ 0xFC, 0xFB, 0xFF, 0x10, 0x06,
+};
+
+static inline bool is_target_required(u16 op)
+{
+ bool ret = op <= UpdateRecordDataAllocation &&
+ (AttributeRequired[op >> 3] >> (op & 7) & 1);
+ return ret;
+}
+
+static inline bool can_skip_action(enum NTFS_LOG_OPERATION op)
+{
+ switch (op) {
+ case Noop:
+ case DeleteDirtyClusters:
+ case HotFix:
+ case EndTopLevelAction:
+ case PrepareTransaction:
+ case CommitTransaction:
+ case ForgetTransaction:
+ case CompensationLogRecord:
+ case OpenNonresidentAttribute:
+ case OpenAttributeTableDump:
+ case AttributeNamesDump:
+ case DirtyPageTableDump:
+ case TransactionTableDump:
+ return true;
+ default:
+ return false;
+ }
+}
+
+enum { lcb_ctx_undo_next, lcb_ctx_prev, lcb_ctx_next };
+
+/* Bytes per restart table. */
+static inline u32 bytes_per_rt(const struct RESTART_TABLE *rt)
+{
+ return le16_to_cpu(rt->used) * le16_to_cpu(rt->size) +
+ sizeof(struct RESTART_TABLE);
+}
+
+/* Log record length. */
+static inline u32 lrh_length(const struct LOG_REC_HDR *lr)
+{
+ u16 t16 = le16_to_cpu(lr->lcns_follow);
+
+ return struct_size(lr, page_lcns, max_t(u16, 1, t16));
+}
+
+struct lcb {
+ struct LFS_RECORD_HDR *lrh; // Log record header of the current lsn.
+ struct LOG_REC_HDR *log_rec;
+ u32 ctx_mode; // lcb_ctx_undo_next/lcb_ctx_prev/lcb_ctx_next
+ struct CLIENT_ID client;
+ bool alloc; // If true the we should deallocate 'log_rec'.
+};
+
+static void lcb_put(struct lcb *lcb)
+{
+ if (lcb->alloc)
+ kfree(lcb->log_rec);
+ kfree(lcb->lrh);
+ kfree(lcb);
+}
+
+/* Find the oldest lsn from active clients. */
+static inline void oldest_client_lsn(const struct CLIENT_REC *ca,
+ __le16 next_client, u64 *oldest_lsn)
+{
+ while (next_client != LFS_NO_CLIENT_LE) {
+ const struct CLIENT_REC *cr = ca + le16_to_cpu(next_client);
+ u64 lsn = le64_to_cpu(cr->oldest_lsn);
+
+ /* Ignore this block if it's oldest lsn is 0. */
+ if (lsn && lsn < *oldest_lsn)
+ *oldest_lsn = lsn;
+
+ next_client = cr->next_client;
+ }
+}
+
+static inline bool is_rst_page_hdr_valid(u32 file_off,
+ const struct RESTART_HDR *rhdr)
+{
+ u32 sys_page = le32_to_cpu(rhdr->sys_page_size);
+ u32 page_size = le32_to_cpu(rhdr->page_size);
+ u32 end_usa;
+ u16 ro;
+
+ if (sys_page < SECTOR_SIZE || page_size < SECTOR_SIZE ||
+ sys_page & (sys_page - 1) || page_size & (page_size - 1)) {
+ return false;
+ }
+
+ /* Check that if the file offset isn't 0, it is the system page size. */
+ if (file_off && file_off != sys_page)
+ return false;
+
+ /* Check support version 1.1+. */
+ if (le16_to_cpu(rhdr->major_ver) <= 1 && !rhdr->minor_ver)
+ return false;
+
+ if (le16_to_cpu(rhdr->major_ver) > 2)
+ return false;
+
+ ro = le16_to_cpu(rhdr->ra_off);
+ if (!IS_ALIGNED(ro, 8) || ro > sys_page)
+ return false;
+
+ end_usa = ((sys_page >> SECTOR_SHIFT) + 1) * sizeof(short);
+ end_usa += le16_to_cpu(rhdr->rhdr.fix_off);
+
+ if (ro < end_usa)
+ return false;
+
+ return true;
+}
+
+static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr)
+{
+ const struct RESTART_AREA *ra;
+ u16 cl, fl, ul;
+ u32 off, l_size, file_dat_bits, file_size_round;
+ u16 ro = le16_to_cpu(rhdr->ra_off);
+ u32 sys_page = le32_to_cpu(rhdr->sys_page_size);
+
+ if (ro + offsetof(struct RESTART_AREA, l_size) >
+ SECTOR_SIZE - sizeof(short))
+ return false;
+
+ ra = Add2Ptr(rhdr, ro);
+ cl = le16_to_cpu(ra->log_clients);
+
+ if (cl > 1)
+ return false;
+
+ off = le16_to_cpu(ra->client_off);
+
+ if (!IS_ALIGNED(off, 8) || ro + off > SECTOR_SIZE - sizeof(short))
+ return false;
+
+ off += cl * sizeof(struct CLIENT_REC);
+
+ if (off > sys_page)
+ return false;
+
+ /*
+ * Check the restart length field and whether the entire
+ * restart area is contained that length.
+ */
+ if (le16_to_cpu(rhdr->ra_off) + le16_to_cpu(ra->ra_len) > sys_page ||
+ off > le16_to_cpu(ra->ra_len)) {
+ return false;
+ }
+
+ /*
+ * As a final check make sure that the use list and the free list
+ * are either empty or point to a valid client.
+ */
+ fl = le16_to_cpu(ra->client_idx[0]);
+ ul = le16_to_cpu(ra->client_idx[1]);
+ if ((fl != LFS_NO_CLIENT && fl >= cl) ||
+ (ul != LFS_NO_CLIENT && ul >= cl))
+ return false;
+
+ /* Make sure the sequence number bits match the log file size. */
+ l_size = le64_to_cpu(ra->l_size);
+
+ file_dat_bits = sizeof(u64) * 8 - le32_to_cpu(ra->seq_num_bits);
+ file_size_round = 1u << (file_dat_bits + 3);
+ if (file_size_round != l_size &&
+ (file_size_round < l_size || (file_size_round / 2) > l_size)) {
+ return false;
+ }
+
+ /* The log page data offset and record header length must be quad-aligned. */
+ if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) ||
+ !IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8))
+ return false;
+
+ return true;
+}
+
+static inline bool is_client_area_valid(const struct RESTART_HDR *rhdr,
+ bool usa_error)
+{
+ u16 ro = le16_to_cpu(rhdr->ra_off);
+ const struct RESTART_AREA *ra = Add2Ptr(rhdr, ro);
+ u16 ra_len = le16_to_cpu(ra->ra_len);
+ const struct CLIENT_REC *ca;
+ u32 i;
+
+ if (usa_error && ra_len + ro > SECTOR_SIZE - sizeof(short))
+ return false;
+
+ /* Find the start of the client array. */
+ ca = Add2Ptr(ra, le16_to_cpu(ra->client_off));
+
+ /*
+ * Start with the free list.
+ * Check that all the clients are valid and that there isn't a cycle.
+ * Do the in-use list on the second pass.
+ */
+ for (i = 0; i < 2; i++) {
+ u16 client_idx = le16_to_cpu(ra->client_idx[i]);
+ bool first_client = true;
+ u16 clients = le16_to_cpu(ra->log_clients);
+
+ while (client_idx != LFS_NO_CLIENT) {
+ const struct CLIENT_REC *cr;
+
+ if (!clients ||
+ client_idx >= le16_to_cpu(ra->log_clients))
+ return false;
+
+ clients -= 1;
+ cr = ca + client_idx;
+
+ client_idx = le16_to_cpu(cr->next_client);
+
+ if (first_client) {
+ first_client = false;
+ if (cr->prev_client != LFS_NO_CLIENT_LE)
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/*
+ * remove_client
+ *
+ * Remove a client record from a client record list an restart area.
+ */
+static inline void remove_client(struct CLIENT_REC *ca,
+ const struct CLIENT_REC *cr, __le16 *head)
+{
+ if (cr->prev_client == LFS_NO_CLIENT_LE)
+ *head = cr->next_client;
+ else
+ ca[le16_to_cpu(cr->prev_client)].next_client = cr->next_client;
+
+ if (cr->next_client != LFS_NO_CLIENT_LE)
+ ca[le16_to_cpu(cr->next_client)].prev_client = cr->prev_client;
+}
+
+/*
+ * add_client - Add a client record to the start of a list.
+ */
+static inline void add_client(struct CLIENT_REC *ca, u16 index, __le16 *head)
+{
+ struct CLIENT_REC *cr = ca + index;
+
+ cr->prev_client = LFS_NO_CLIENT_LE;
+ cr->next_client = *head;
+
+ if (*head != LFS_NO_CLIENT_LE)
+ ca[le16_to_cpu(*head)].prev_client = cpu_to_le16(index);
+
+ *head = cpu_to_le16(index);
+}
+
+static inline void *enum_rstbl(struct RESTART_TABLE *t, void *c)
+{
+ __le32 *e;
+ u32 bprt;
+ u16 rsize = t ? le16_to_cpu(t->size) : 0;
+
+ if (!c) {
+ if (!t || !t->total)
+ return NULL;
+ e = Add2Ptr(t, sizeof(struct RESTART_TABLE));
+ } else {
+ e = Add2Ptr(c, rsize);
+ }
+
+ /* Loop until we hit the first one allocated, or the end of the list. */
+ for (bprt = bytes_per_rt(t); PtrOffset(t, e) < bprt;
+ e = Add2Ptr(e, rsize)) {
+ if (*e == RESTART_ENTRY_ALLOCATED_LE)
+ return e;
+ }
+ return NULL;
+}
+
+/*
+ * find_dp - Search for a @vcn in Dirty Page Table.
+ */
+static inline struct DIR_PAGE_ENTRY *find_dp(struct RESTART_TABLE *dptbl,
+ u32 target_attr, u64 vcn)
+{
+ __le32 ta = cpu_to_le32(target_attr);
+ struct DIR_PAGE_ENTRY *dp = NULL;
+
+ while ((dp = enum_rstbl(dptbl, dp))) {
+ u64 dp_vcn = le64_to_cpu(dp->vcn);
+
+ if (dp->target_attr == ta && vcn >= dp_vcn &&
+ vcn < dp_vcn + le32_to_cpu(dp->lcns_follow)) {
+ return dp;
+ }
+ }
+ return NULL;
+}
+
+static inline u32 norm_file_page(u32 page_size, u32 *l_size, bool use_default)
+{
+ if (use_default)
+ page_size = DefaultLogPageSize;
+
+ /* Round the file size down to a system page boundary. */
+ *l_size &= ~(page_size - 1);
+
+ /* File should contain at least 2 restart pages and MinLogRecordPages pages. */
+ if (*l_size < (MinLogRecordPages + 2) * page_size)
+ return 0;
+
+ return page_size;
+}
+
+static bool check_log_rec(const struct LOG_REC_HDR *lr, u32 bytes, u32 tr,
+ u32 bytes_per_attr_entry)
+{
+ u16 t16;
+
+ if (bytes < sizeof(struct LOG_REC_HDR))
+ return false;
+ if (!tr)
+ return false;
+
+ if ((tr - sizeof(struct RESTART_TABLE)) %
+ sizeof(struct TRANSACTION_ENTRY))
+ return false;
+
+ if (le16_to_cpu(lr->redo_off) & 7)
+ return false;
+
+ if (le16_to_cpu(lr->undo_off) & 7)
+ return false;
+
+ if (lr->target_attr)
+ goto check_lcns;
+
+ if (is_target_required(le16_to_cpu(lr->redo_op)))
+ return false;
+
+ if (is_target_required(le16_to_cpu(lr->undo_op)))
+ return false;
+
+check_lcns:
+ if (!lr->lcns_follow)
+ goto check_length;
+
+ t16 = le16_to_cpu(lr->target_attr);
+ if ((t16 - sizeof(struct RESTART_TABLE)) % bytes_per_attr_entry)
+ return false;
+
+check_length:
+ if (bytes < lrh_length(lr))
+ return false;
+
+ return true;
+}
+
+static bool check_rstbl(const struct RESTART_TABLE *rt, size_t bytes)
+{
+ u32 ts;
+ u32 i, off;
+ u16 rsize = le16_to_cpu(rt->size);
+ u16 ne = le16_to_cpu(rt->used);
+ u32 ff = le32_to_cpu(rt->first_free);
+ u32 lf = le32_to_cpu(rt->last_free);
+
+ ts = rsize * ne + sizeof(struct RESTART_TABLE);
+
+ if (!rsize || rsize > bytes ||
+ rsize + sizeof(struct RESTART_TABLE) > bytes || bytes < ts ||
+ le16_to_cpu(rt->total) > ne || ff > ts || lf > ts ||
+ (ff && ff < sizeof(struct RESTART_TABLE)) ||
+ (lf && lf < sizeof(struct RESTART_TABLE))) {
+ return false;
+ }
+
+ /*
+ * Verify each entry is either allocated or points
+ * to a valid offset the table.
+ */
+ for (i = 0; i < ne; i++) {
+ off = le32_to_cpu(*(__le32 *)Add2Ptr(
+ rt, i * rsize + sizeof(struct RESTART_TABLE)));
+
+ if (off != RESTART_ENTRY_ALLOCATED && off &&
+ (off < sizeof(struct RESTART_TABLE) ||
+ ((off - sizeof(struct RESTART_TABLE)) % rsize))) {
+ return false;
+ }
+ }
+
+ /*
+ * Walk through the list headed by the first entry to make
+ * sure none of the entries are currently being used.
+ */
+ for (off = ff; off;) {
+ if (off == RESTART_ENTRY_ALLOCATED)
+ return false;
+
+ off = le32_to_cpu(*(__le32 *)Add2Ptr(rt, off));
+ }
+
+ return true;
+}
+
+/*
+ * free_rsttbl_idx - Free a previously allocated index a Restart Table.
+ */
+static inline void free_rsttbl_idx(struct RESTART_TABLE *rt, u32 off)
+{
+ __le32 *e;
+ u32 lf = le32_to_cpu(rt->last_free);
+ __le32 off_le = cpu_to_le32(off);
+
+ e = Add2Ptr(rt, off);
+
+ if (off < le32_to_cpu(rt->free_goal)) {
+ *e = rt->first_free;
+ rt->first_free = off_le;
+ if (!lf)
+ rt->last_free = off_le;
+ } else {
+ if (lf)
+ *(__le32 *)Add2Ptr(rt, lf) = off_le;
+ else
+ rt->first_free = off_le;
+
+ rt->last_free = off_le;
+ *e = 0;
+ }
+
+ le16_sub_cpu(&rt->total, 1);
+}
+
+static inline struct RESTART_TABLE *init_rsttbl(u16 esize, u16 used)
+{
+ __le32 *e, *last_free;
+ u32 off;
+ u32 bytes = esize * used + sizeof(struct RESTART_TABLE);
+ u32 lf = sizeof(struct RESTART_TABLE) + (used - 1) * esize;
+ struct RESTART_TABLE *t = kzalloc(bytes, GFP_NOFS);
+
+ if (!t)
+ return NULL;
+
+ t->size = cpu_to_le16(esize);
+ t->used = cpu_to_le16(used);
+ t->free_goal = cpu_to_le32(~0u);
+ t->first_free = cpu_to_le32(sizeof(struct RESTART_TABLE));
+ t->last_free = cpu_to_le32(lf);
+
+ e = (__le32 *)(t + 1);
+ last_free = Add2Ptr(t, lf);
+
+ for (off = sizeof(struct RESTART_TABLE) + esize; e < last_free;
+ e = Add2Ptr(e, esize), off += esize) {
+ *e = cpu_to_le32(off);
+ }
+ return t;
+}
+
+static inline struct RESTART_TABLE *extend_rsttbl(struct RESTART_TABLE *tbl,
+ u32 add, u32 free_goal)
+{
+ u16 esize = le16_to_cpu(tbl->size);
+ __le32 osize = cpu_to_le32(bytes_per_rt(tbl));
+ u32 used = le16_to_cpu(tbl->used);
+ struct RESTART_TABLE *rt;
+
+ rt = init_rsttbl(esize, used + add);
+ if (!rt)
+ return NULL;
+
+ memcpy(rt + 1, tbl + 1, esize * used);
+
+ rt->free_goal = free_goal == ~0u
+ ? cpu_to_le32(~0u)
+ : cpu_to_le32(sizeof(struct RESTART_TABLE) +
+ free_goal * esize);
+
+ if (tbl->first_free) {
+ rt->first_free = tbl->first_free;
+ *(__le32 *)Add2Ptr(rt, le32_to_cpu(tbl->last_free)) = osize;
+ } else {
+ rt->first_free = osize;
+ }
+
+ rt->total = tbl->total;
+
+ kfree(tbl);
+ return rt;
+}
+
+/*
+ * alloc_rsttbl_idx
+ *
+ * Allocate an index from within a previously initialized Restart Table.
+ */
+static inline void *alloc_rsttbl_idx(struct RESTART_TABLE **tbl)
+{
+ u32 off;
+ __le32 *e;
+ struct RESTART_TABLE *t = *tbl;
+
+ if (!t->first_free) {
+ *tbl = t = extend_rsttbl(t, 16, ~0u);
+ if (!t)
+ return NULL;
+ }
+
+ off = le32_to_cpu(t->first_free);
+
+ /* Dequeue this entry and zero it. */
+ e = Add2Ptr(t, off);
+
+ t->first_free = *e;
+
+ memset(e, 0, le16_to_cpu(t->size));
+
+ *e = RESTART_ENTRY_ALLOCATED_LE;
+
+ /* If list is going empty, then we fix the last_free as well. */
+ if (!t->first_free)
+ t->last_free = 0;
+
+ le16_add_cpu(&t->total, 1);
+
+ return Add2Ptr(t, off);
+}
+
+/*
+ * alloc_rsttbl_from_idx
+ *
+ * Allocate a specific index from within a previously initialized Restart Table.
+ */
+static inline void *alloc_rsttbl_from_idx(struct RESTART_TABLE **tbl, u32 vbo)
+{
+ u32 off;
+ __le32 *e;
+ struct RESTART_TABLE *rt = *tbl;
+ u32 bytes = bytes_per_rt(rt);
+ u16 esize = le16_to_cpu(rt->size);
+
+ /* If the entry is not the table, we will have to extend the table. */
+ if (vbo >= bytes) {
+ /*
+ * Extend the size by computing the number of entries between
+ * the existing size and the desired index and adding 1 to that.
+ */
+ u32 bytes2idx = vbo - bytes;
+
+ /*
+ * There should always be an integral number of entries
+ * being added. Now extend the table.
+ */
+ *tbl = rt = extend_rsttbl(rt, bytes2idx / esize + 1, bytes);
+ if (!rt)
+ return NULL;
+ }
+
+ /* See if the entry is already allocated, and just return if it is. */
+ e = Add2Ptr(rt, vbo);
+
+ if (*e == RESTART_ENTRY_ALLOCATED_LE)
+ return e;
+
+ /*
+ * Walk through the table, looking for the entry we're
+ * interested and the previous entry.
+ */
+ off = le32_to_cpu(rt->first_free);
+ e = Add2Ptr(rt, off);
+
+ if (off == vbo) {
+ /* this is a match */
+ rt->first_free = *e;
+ goto skip_looking;
+ }
+
+ /*
+ * Need to walk through the list looking for the predecessor
+ * of our entry.
+ */
+ for (;;) {
+ /* Remember the entry just found */
+ u32 last_off = off;
+ __le32 *last_e = e;
+
+ /* Should never run of entries. */
+
+ /* Lookup up the next entry the list. */
+ off = le32_to_cpu(*last_e);
+ e = Add2Ptr(rt, off);
+
+ /* If this is our match we are done. */
+ if (off == vbo) {
+ *last_e = *e;
+
+ /*
+ * If this was the last entry, we update that
+ * table as well.
+ */
+ if (le32_to_cpu(rt->last_free) == off)
+ rt->last_free = cpu_to_le32(last_off);
+ break;
+ }
+ }
+
+skip_looking:
+ /* If the list is now empty, we fix the last_free as well. */
+ if (!rt->first_free)
+ rt->last_free = 0;
+
+ /* Zero this entry. */
+ memset(e, 0, esize);
+ *e = RESTART_ENTRY_ALLOCATED_LE;
+
+ le16_add_cpu(&rt->total, 1);
+
+ return e;
+}
+
+#define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001)
+
+#define NTFSLOG_WRAPPED 0x00000001
+#define NTFSLOG_MULTIPLE_PAGE_IO 0x00000002
+#define NTFSLOG_NO_LAST_LSN 0x00000004
+#define NTFSLOG_REUSE_TAIL 0x00000010
+#define NTFSLOG_NO_OLDEST_LSN 0x00000020
+
+/* Helper struct to work with NTFS $LogFile. */
+struct ntfs_log {
+ struct ntfs_inode *ni;
+
+ u32 l_size;
+ u32 sys_page_size;
+ u32 sys_page_mask;
+ u32 page_size;
+ u32 page_mask; // page_size - 1
+ u8 page_bits;
+ struct RECORD_PAGE_HDR *one_page_buf;
+
+ struct RESTART_TABLE *open_attr_tbl;
+ u32 transaction_id;
+ u32 clst_per_page;
+
+ u32 first_page;
+ u32 next_page;
+ u32 ra_off;
+ u32 data_off;
+ u32 restart_size;
+ u32 data_size;
+ u16 record_header_len;
+ u64 seq_num;
+ u32 seq_num_bits;
+ u32 file_data_bits;
+ u32 seq_num_mask; /* (1 << file_data_bits) - 1 */
+
+ struct RESTART_AREA *ra; /* In-memory image of the next restart area. */
+ u32 ra_size; /* The usable size of the restart area. */
+
+ /*
+ * If true, then the in-memory restart area is to be written
+ * to the first position on the disk.
+ */
+ bool init_ra;
+ bool set_dirty; /* True if we need to set dirty flag. */
+
+ u64 oldest_lsn;
+
+ u32 oldest_lsn_off;
+ u64 last_lsn;
+
+ u32 total_avail;
+ u32 total_avail_pages;
+ u32 total_undo_commit;
+ u32 max_current_avail;
+ u32 current_avail;
+ u32 reserved;
+
+ short major_ver;
+ short minor_ver;
+
+ u32 l_flags; /* See NTFSLOG_XXX */
+ u32 current_openlog_count; /* On-disk value for open_log_count. */
+
+ struct CLIENT_ID client_id;
+ u32 client_undo_commit;
+};
+
+static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn)
+{
+ u32 vbo = (lsn << log->seq_num_bits) >> (log->seq_num_bits - 3);
+
+ return vbo;
+}
+
+/* Compute the offset in the log file of the next log page. */
+static inline u32 next_page_off(struct ntfs_log *log, u32 off)
+{
+ off = (off & ~log->sys_page_mask) + log->page_size;
+ return off >= log->l_size ? log->first_page : off;
+}
+
+static inline u32 lsn_to_page_off(struct ntfs_log *log, u64 lsn)
+{
+ return (((u32)lsn) << 3) & log->page_mask;
+}
+
+static inline u64 vbo_to_lsn(struct ntfs_log *log, u32 off, u64 Seq)
+{
+ return (off >> 3) + (Seq << log->file_data_bits);
+}
+
+static inline bool is_lsn_in_file(struct ntfs_log *log, u64 lsn)
+{
+ return lsn >= log->oldest_lsn &&
+ lsn <= le64_to_cpu(log->ra->current_lsn);
+}
+
+static inline u32 hdr_file_off(struct ntfs_log *log,
+ struct RECORD_PAGE_HDR *hdr)
+{
+ if (log->major_ver < 2)
+ return le64_to_cpu(hdr->rhdr.lsn);
+
+ return le32_to_cpu(hdr->file_off);
+}
+
+static inline u64 base_lsn(struct ntfs_log *log,
+ const struct RECORD_PAGE_HDR *hdr, u64 lsn)
+{
+ u64 h_lsn = le64_to_cpu(hdr->rhdr.lsn);
+ u64 ret = (((h_lsn >> log->file_data_bits) +
+ (lsn < (lsn_to_vbo(log, h_lsn) & ~log->page_mask) ? 1 : 0))
+ << log->file_data_bits) +
+ ((((is_log_record_end(hdr) &&
+ h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn))
+ ? le16_to_cpu(hdr->record_hdr.next_record_off)
+ : log->page_size) +
+ lsn) >>
+ 3);
+
+ return ret;
+}
+
+static inline bool verify_client_lsn(struct ntfs_log *log,
+ const struct CLIENT_REC *client, u64 lsn)
+{
+ return lsn >= le64_to_cpu(client->oldest_lsn) &&
+ lsn <= le64_to_cpu(log->ra->current_lsn) && lsn;
+}
+
+struct restart_info {
+ u64 last_lsn;
+ struct RESTART_HDR *r_page;
+ u32 vbo;
+ bool chkdsk_was_run;
+ bool valid_page;
+ bool initialized;
+ bool restart;
+};
+
+static int read_log_page(struct ntfs_log *log, u32 vbo,
+ struct RECORD_PAGE_HDR **buffer, bool *usa_error)
+{
+ int err = 0;
+ u32 page_idx = vbo >> log->page_bits;
+ u32 page_off = vbo & log->page_mask;
+ u32 bytes = log->page_size - page_off;
+ void *to_free = NULL;
+ u32 page_vbo = page_idx << log->page_bits;
+ struct RECORD_PAGE_HDR *page_buf;
+ struct ntfs_inode *ni = log->ni;
+ bool bBAAD;
+
+ if (vbo >= log->l_size)
+ return -EINVAL;
+
+ if (!*buffer) {
+ to_free = kmalloc(log->page_size, GFP_NOFS);
+ if (!to_free)
+ return -ENOMEM;
+ *buffer = to_free;
+ }
+
+ page_buf = page_off ? log->one_page_buf : *buffer;
+
+ err = ntfs_read_run_nb(ni->mi.sbi, &ni->file.run, page_vbo, page_buf,
+ log->page_size, NULL);
+ if (err)
+ goto out;
+
+ if (page_buf->rhdr.sign != NTFS_FFFF_SIGNATURE)
+ ntfs_fix_post_read(&page_buf->rhdr, PAGE_SIZE, false);
+
+ if (page_buf != *buffer)
+ memcpy(*buffer, Add2Ptr(page_buf, page_off), bytes);
+
+ bBAAD = page_buf->rhdr.sign == NTFS_BAAD_SIGNATURE;
+
+ if (usa_error)
+ *usa_error = bBAAD;
+ /* Check that the update sequence array for this page is valid */
+ /* If we don't allow errors, raise an error status */
+ else if (bBAAD)
+ err = -EINVAL;
+
+out:
+ if (err && to_free) {
+ kfree(to_free);
+ *buffer = NULL;
+ }
+
+ return err;
+}
+
+/*
+ * log_read_rst
+ *
+ * It walks through 512 blocks of the file looking for a valid
+ * restart page header. It will stop the first time we find a
+ * valid page header.
+ */
+static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
+ struct restart_info *info)
+{
+ u32 skip, vbo;
+ struct RESTART_HDR *r_page = NULL;
+
+ /* Determine which restart area we are looking for. */
+ if (first) {
+ vbo = 0;
+ skip = 512;
+ } else {
+ vbo = 512;
+ skip = 0;
+ }
+
+ /* Loop continuously until we succeed. */
+ for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) {
+ bool usa_error;
+ bool brst, bchk;
+ struct RESTART_AREA *ra;
+
+ /* Read a page header at the current offset. */
+ if (read_log_page(log, vbo, (struct RECORD_PAGE_HDR **)&r_page,
+ &usa_error)) {
+ /* Ignore any errors. */
+ continue;
+ }
+
+ /* Exit if the signature is a log record page. */
+ if (r_page->rhdr.sign == NTFS_RCRD_SIGNATURE) {
+ info->initialized = true;
+ break;
+ }
+
+ brst = r_page->rhdr.sign == NTFS_RSTR_SIGNATURE;
+ bchk = r_page->rhdr.sign == NTFS_CHKD_SIGNATURE;
+
+ if (!bchk && !brst) {
+ if (r_page->rhdr.sign != NTFS_FFFF_SIGNATURE) {
+ /*
+ * Remember if the signature does not
+ * indicate uninitialized file.
+ */
+ info->initialized = true;
+ }
+ continue;
+ }
+
+ ra = NULL;
+ info->valid_page = false;
+ info->initialized = true;
+ info->vbo = vbo;
+
+ /* Let's check the restart area if this is a valid page. */
+ if (!is_rst_page_hdr_valid(vbo, r_page))
+ goto check_result;
+ ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off));
+
+ if (!is_rst_area_valid(r_page))
+ goto check_result;
+
+ /*
+ * We have a valid restart page header and restart area.
+ * If chkdsk was run or we have no clients then we have
+ * no more checking to do.
+ */
+ if (bchk || ra->client_idx[1] == LFS_NO_CLIENT_LE) {
+ info->valid_page = true;
+ goto check_result;
+ }
+
+ if (is_client_area_valid(r_page, usa_error)) {
+ info->valid_page = true;
+ ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off));
+ }
+
+check_result:
+ /*
+ * If chkdsk was run then update the caller's
+ * values and return.
+ */
+ if (r_page->rhdr.sign == NTFS_CHKD_SIGNATURE) {
+ info->chkdsk_was_run = true;
+ info->last_lsn = le64_to_cpu(r_page->rhdr.lsn);
+ info->restart = true;
+ info->r_page = r_page;
+ return 0;
+ }
+
+ /*
+ * If we have a valid page then copy the values
+ * we need from it.
+ */
+ if (info->valid_page) {
+ info->last_lsn = le64_to_cpu(ra->current_lsn);
+ info->restart = true;
+ info->r_page = r_page;
+ return 0;
+ }
+ }
+
+ kfree(r_page);
+
+ return 0;
+}
+
+/*
+ * Ilog_init_pg_hdr - Init @log from restart page header.
+ */
+static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size,
+ u32 page_size, u16 major_ver, u16 minor_ver)
+{
+ log->sys_page_size = sys_page_size;
+ log->sys_page_mask = sys_page_size - 1;
+ log->page_size = page_size;
+ log->page_mask = page_size - 1;
+ log->page_bits = blksize_bits(page_size);
+
+ log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits;
+ if (!log->clst_per_page)
+ log->clst_per_page = 1;
+
+ log->first_page = major_ver >= 2
+ ? 0x22 * page_size
+ : ((sys_page_size << 1) + (page_size << 1));
+ log->major_ver = major_ver;
+ log->minor_ver = minor_ver;
+}
+
+/*
+ * log_create - Init @log in cases when we don't have a restart area to use.
+ */
+static void log_create(struct ntfs_log *log, u32 l_size, const u64 last_lsn,
+ u32 open_log_count, bool wrapped, bool use_multi_page)
+{
+ log->l_size = l_size;
+ /* All file offsets must be quadword aligned. */
+ log->file_data_bits = blksize_bits(l_size) - 3;
+ log->seq_num_mask = (8 << log->file_data_bits) - 1;
+ log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits;
+ log->seq_num = (last_lsn >> log->file_data_bits) + 2;
+ log->next_page = log->first_page;
+ log->oldest_lsn = log->seq_num << log->file_data_bits;
+ log->oldest_lsn_off = 0;
+ log->last_lsn = log->oldest_lsn;
+
+ log->l_flags |= NTFSLOG_NO_LAST_LSN | NTFSLOG_NO_OLDEST_LSN;
+
+ /* Set the correct flags for the I/O and indicate if we have wrapped. */
+ if (wrapped)
+ log->l_flags |= NTFSLOG_WRAPPED;
+
+ if (use_multi_page)
+ log->l_flags |= NTFSLOG_MULTIPLE_PAGE_IO;
+
+ /* Compute the log page values. */
+ log->data_off = ALIGN(
+ offsetof(struct RECORD_PAGE_HDR, fixups) +
+ sizeof(short) * ((log->page_size >> SECTOR_SHIFT) + 1),
+ 8);
+ log->data_size = log->page_size - log->data_off;
+ log->record_header_len = sizeof(struct LFS_RECORD_HDR);
+
+ /* Remember the different page sizes for reservation. */
+ log->reserved = log->data_size - log->record_header_len;
+
+ /* Compute the restart page values. */
+ log->ra_off = ALIGN(
+ offsetof(struct RESTART_HDR, fixups) +
+ sizeof(short) *
+ ((log->sys_page_size >> SECTOR_SHIFT) + 1),
+ 8);
+ log->restart_size = log->sys_page_size - log->ra_off;
+ log->ra_size = struct_size(log->ra, clients, 1);
+ log->current_openlog_count = open_log_count;
+
+ /*
+ * The total available log file space is the number of
+ * log file pages times the space available on each page.
+ */
+ log->total_avail_pages = log->l_size - log->first_page;
+ log->total_avail = log->total_avail_pages >> log->page_bits;
+
+ /*
+ * We assume that we can't use the end of the page less than
+ * the file record size.
+ * Then we won't need to reserve more than the caller asks for.
+ */
+ log->max_current_avail = log->total_avail * log->reserved;
+ log->total_avail = log->total_avail * log->data_size;
+ log->current_avail = log->max_current_avail;
+}
+
+/*
+ * log_create_ra - Fill a restart area from the values stored in @log.
+ */
+static struct RESTART_AREA *log_create_ra(struct ntfs_log *log)
+{
+ struct CLIENT_REC *cr;
+ struct RESTART_AREA *ra = kzalloc(log->restart_size, GFP_NOFS);
+
+ if (!ra)
+ return NULL;
+
+ ra->current_lsn = cpu_to_le64(log->last_lsn);
+ ra->log_clients = cpu_to_le16(1);
+ ra->client_idx[1] = LFS_NO_CLIENT_LE;
+ if (log->l_flags & NTFSLOG_MULTIPLE_PAGE_IO)
+ ra->flags = RESTART_SINGLE_PAGE_IO;
+ ra->seq_num_bits = cpu_to_le32(log->seq_num_bits);
+ ra->ra_len = cpu_to_le16(log->ra_size);
+ ra->client_off = cpu_to_le16(offsetof(struct RESTART_AREA, clients));
+ ra->l_size = cpu_to_le64(log->l_size);
+ ra->rec_hdr_len = cpu_to_le16(log->record_header_len);
+ ra->data_off = cpu_to_le16(log->data_off);
+ ra->open_log_count = cpu_to_le32(log->current_openlog_count + 1);
+
+ cr = ra->clients;
+
+ cr->prev_client = LFS_NO_CLIENT_LE;
+ cr->next_client = LFS_NO_CLIENT_LE;
+
+ return ra;
+}
+
+static u32 final_log_off(struct ntfs_log *log, u64 lsn, u32 data_len)
+{
+ u32 base_vbo = lsn << 3;
+ u32 final_log_off = (base_vbo & log->seq_num_mask) & ~log->page_mask;
+ u32 page_off = base_vbo & log->page_mask;
+ u32 tail = log->page_size - page_off;
+
+ page_off -= 1;
+
+ /* Add the length of the header. */
+ data_len += log->record_header_len;
+
+ /*
+ * If this lsn is contained this log page we are done.
+ * Otherwise we need to walk through several log pages.
+ */
+ if (data_len > tail) {
+ data_len -= tail;
+ tail = log->data_size;
+ page_off = log->data_off - 1;
+
+ for (;;) {
+ final_log_off = next_page_off(log, final_log_off);
+
+ /*
+ * We are done if the remaining bytes
+ * fit on this page.
+ */
+ if (data_len <= tail)
+ break;
+ data_len -= tail;
+ }
+ }
+
+ /*
+ * We add the remaining bytes to our starting position on this page
+ * and then add that value to the file offset of this log page.
+ */
+ return final_log_off + data_len + page_off;
+}
+
+static int next_log_lsn(struct ntfs_log *log, const struct LFS_RECORD_HDR *rh,
+ u64 *lsn)
+{
+ int err;
+ u64 this_lsn = le64_to_cpu(rh->this_lsn);
+ u32 vbo = lsn_to_vbo(log, this_lsn);
+ u32 end =
+ final_log_off(log, this_lsn, le32_to_cpu(rh->client_data_len));
+ u32 hdr_off = end & ~log->sys_page_mask;
+ u64 seq = this_lsn >> log->file_data_bits;
+ struct RECORD_PAGE_HDR *page = NULL;
+
+ /* Remember if we wrapped. */
+ if (end <= vbo)
+ seq += 1;
+
+ /* Log page header for this page. */
+ err = read_log_page(log, hdr_off, &page, NULL);
+ if (err)
+ return err;
+
+ /*
+ * If the lsn we were given was not the last lsn on this page,
+ * then the starting offset for the next lsn is on a quad word
+ * boundary following the last file offset for the current lsn.
+ * Otherwise the file offset is the start of the data on the next page.
+ */
+ if (this_lsn == le64_to_cpu(page->rhdr.lsn)) {
+ /* If we wrapped, we need to increment the sequence number. */
+ hdr_off = next_page_off(log, hdr_off);
+ if (hdr_off == log->first_page)
+ seq += 1;
+
+ vbo = hdr_off + log->data_off;
+ } else {
+ vbo = ALIGN(end, 8);
+ }
+
+ /* Compute the lsn based on the file offset and the sequence count. */
+ *lsn = vbo_to_lsn(log, vbo, seq);
+
+ /*
+ * If this lsn is within the legal range for the file, we return true.
+ * Otherwise false indicates that there are no more lsn's.
+ */
+ if (!is_lsn_in_file(log, *lsn))
+ *lsn = 0;
+
+ kfree(page);
+
+ return 0;
+}
+
+/*
+ * current_log_avail - Calculate the number of bytes available for log records.
+ */
+static u32 current_log_avail(struct ntfs_log *log)
+{
+ u32 oldest_off, next_free_off, free_bytes;
+
+ if (log->l_flags & NTFSLOG_NO_LAST_LSN) {
+ /* The entire file is available. */
+ return log->max_current_avail;
+ }
+
+ /*
+ * If there is a last lsn the restart area then we know that we will
+ * have to compute the free range.
+ * If there is no oldest lsn then start at the first page of the file.
+ */
+ oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN)
+ ? log->first_page
+ : (log->oldest_lsn_off & ~log->sys_page_mask);
+
+ /*
+ * We will use the next log page offset to compute the next free page.
+ * If we are going to reuse this page go to the next page.
+ * If we are at the first page then use the end of the file.
+ */
+ next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL)
+ ? log->next_page + log->page_size
+ : log->next_page == log->first_page
+ ? log->l_size
+ : log->next_page;
+
+ /* If the two offsets are the same then there is no available space. */
+ if (oldest_off == next_free_off)
+ return 0;
+ /*
+ * If the free offset follows the oldest offset then subtract
+ * this range from the total available pages.
+ */
+ free_bytes =
+ oldest_off < next_free_off
+ ? log->total_avail_pages - (next_free_off - oldest_off)
+ : oldest_off - next_free_off;
+
+ free_bytes >>= log->page_bits;
+ return free_bytes * log->reserved;
+}
+
+static bool check_subseq_log_page(struct ntfs_log *log,
+ const struct RECORD_PAGE_HDR *rp, u32 vbo,
+ u64 seq)
+{
+ u64 lsn_seq;
+ const struct NTFS_RECORD_HEADER *rhdr = &rp->rhdr;
+ u64 lsn = le64_to_cpu(rhdr->lsn);
+
+ if (rhdr->sign == NTFS_FFFF_SIGNATURE || !rhdr->sign)
+ return false;
+
+ /*
+ * If the last lsn on the page occurs was written after the page
+ * that caused the original error then we have a fatal error.
+ */
+ lsn_seq = lsn >> log->file_data_bits;
+
+ /*
+ * If the sequence number for the lsn the page is equal or greater
+ * than lsn we expect, then this is a subsequent write.
+ */
+ return lsn_seq >= seq ||
+ (lsn_seq == seq - 1 && log->first_page == vbo &&
+ vbo != (lsn_to_vbo(log, lsn) & ~log->page_mask));
+}
+
+/*
+ * last_log_lsn
+ *
+ * Walks through the log pages for a file, searching for the
+ * last log page written to the file.
+ */
+static int last_log_lsn(struct ntfs_log *log)
+{
+ int err;
+ bool usa_error = false;
+ bool replace_page = false;
+ bool reuse_page = log->l_flags & NTFSLOG_REUSE_TAIL;
+ bool wrapped_file, wrapped;
+
+ u32 page_cnt = 1, page_pos = 1;
+ u32 page_off = 0, page_off1 = 0, saved_off = 0;
+ u32 final_off, second_off, final_off_prev = 0, second_off_prev = 0;
+ u32 first_file_off = 0, second_file_off = 0;
+ u32 part_io_count = 0;
+ u32 tails = 0;
+ u32 this_off, curpage_off, nextpage_off, remain_pages;
+
+ u64 expected_seq, seq_base = 0, lsn_base = 0;
+ u64 best_lsn, best_lsn1, best_lsn2;
+ u64 lsn_cur, lsn1, lsn2;
+ u64 last_ok_lsn = reuse_page ? log->last_lsn : 0;
+
+ u16 cur_pos, best_page_pos;
+
+ struct RECORD_PAGE_HDR *page = NULL;
+ struct RECORD_PAGE_HDR *tst_page = NULL;
+ struct RECORD_PAGE_HDR *first_tail = NULL;
+ struct RECORD_PAGE_HDR *second_tail = NULL;
+ struct RECORD_PAGE_HDR *tail_page = NULL;
+ struct RECORD_PAGE_HDR *second_tail_prev = NULL;
+ struct RECORD_PAGE_HDR *first_tail_prev = NULL;
+ struct RECORD_PAGE_HDR *page_bufs = NULL;
+ struct RECORD_PAGE_HDR *best_page;
+
+ if (log->major_ver >= 2) {
+ final_off = 0x02 * log->page_size;
+ second_off = 0x12 * log->page_size;
+
+ // 0x10 == 0x12 - 0x2
+ page_bufs = kmalloc(log->page_size * 0x10, GFP_NOFS);
+ if (!page_bufs)
+ return -ENOMEM;
+ } else {
+ second_off = log->first_page - log->page_size;
+ final_off = second_off - log->page_size;
+ }
+
+next_tail:
+ /* Read second tail page (at pos 3/0x12000). */
+ if (read_log_page(log, second_off, &second_tail, &usa_error) ||
+ usa_error || second_tail->rhdr.sign != NTFS_RCRD_SIGNATURE) {
+ kfree(second_tail);
+ second_tail = NULL;
+ second_file_off = 0;
+ lsn2 = 0;
+ } else {
+ second_file_off = hdr_file_off(log, second_tail);
+ lsn2 = le64_to_cpu(second_tail->record_hdr.last_end_lsn);
+ }
+
+ /* Read first tail page (at pos 2/0x2000). */
+ if (read_log_page(log, final_off, &first_tail, &usa_error) ||
+ usa_error || first_tail->rhdr.sign != NTFS_RCRD_SIGNATURE) {
+ kfree(first_tail);
+ first_tail = NULL;
+ first_file_off = 0;
+ lsn1 = 0;
+ } else {
+ first_file_off = hdr_file_off(log, first_tail);
+ lsn1 = le64_to_cpu(first_tail->record_hdr.last_end_lsn);
+ }
+
+ if (log->major_ver < 2) {
+ int best_page;
+
+ first_tail_prev = first_tail;
+ final_off_prev = first_file_off;
+ second_tail_prev = second_tail;
+ second_off_prev = second_file_off;
+ tails = 1;
+
+ if (!first_tail && !second_tail)
+ goto tail_read;
+
+ if (first_tail && second_tail)
+ best_page = lsn1 < lsn2 ? 1 : 0;
+ else if (first_tail)
+ best_page = 0;
+ else
+ best_page = 1;
+
+ page_off = best_page ? second_file_off : first_file_off;
+ seq_base = (best_page ? lsn2 : lsn1) >> log->file_data_bits;
+ goto tail_read;
+ }
+
+ best_lsn1 = first_tail ? base_lsn(log, first_tail, first_file_off) : 0;
+ best_lsn2 =
+ second_tail ? base_lsn(log, second_tail, second_file_off) : 0;
+
+ if (first_tail && second_tail) {
+ if (best_lsn1 > best_lsn2) {
+ best_lsn = best_lsn1;
+ best_page = first_tail;
+ this_off = first_file_off;
+ } else {
+ best_lsn = best_lsn2;
+ best_page = second_tail;
+ this_off = second_file_off;
+ }
+ } else if (first_tail) {
+ best_lsn = best_lsn1;
+ best_page = first_tail;
+ this_off = first_file_off;
+ } else if (second_tail) {
+ best_lsn = best_lsn2;
+ best_page = second_tail;
+ this_off = second_file_off;
+ } else {
+ goto tail_read;
+ }
+
+ best_page_pos = le16_to_cpu(best_page->page_pos);
+
+ if (!tails) {
+ if (best_page_pos == page_pos) {
+ seq_base = best_lsn >> log->file_data_bits;
+ saved_off = page_off = le32_to_cpu(best_page->file_off);
+ lsn_base = best_lsn;
+
+ memmove(page_bufs, best_page, log->page_size);
+
+ page_cnt = le16_to_cpu(best_page->page_count);
+ if (page_cnt > 1)
+ page_pos += 1;
+
+ tails = 1;
+ }
+ } else if (seq_base == (best_lsn >> log->file_data_bits) &&
+ saved_off + log->page_size == this_off &&
+ lsn_base < best_lsn &&
+ (page_pos != page_cnt || best_page_pos == page_pos ||
+ best_page_pos == 1) &&
+ (page_pos >= page_cnt || best_page_pos == page_pos)) {
+ u16 bppc = le16_to_cpu(best_page->page_count);
+
+ saved_off += log->page_size;
+ lsn_base = best_lsn;
+
+ memmove(Add2Ptr(page_bufs, tails * log->page_size), best_page,
+ log->page_size);
+
+ tails += 1;
+
+ if (best_page_pos != bppc) {
+ page_cnt = bppc;
+ page_pos = best_page_pos;
+
+ if (page_cnt > 1)
+ page_pos += 1;
+ } else {
+ page_pos = page_cnt = 1;
+ }
+ } else {
+ kfree(first_tail);
+ kfree(second_tail);
+ goto tail_read;
+ }
+
+ kfree(first_tail_prev);
+ first_tail_prev = first_tail;
+ final_off_prev = first_file_off;
+ first_tail = NULL;
+
+ kfree(second_tail_prev);
+ second_tail_prev = second_tail;
+ second_off_prev = second_file_off;
+ second_tail = NULL;
+
+ final_off += log->page_size;
+ second_off += log->page_size;
+
+ if (tails < 0x10)
+ goto next_tail;
+tail_read:
+ first_tail = first_tail_prev;
+ final_off = final_off_prev;
+
+ second_tail = second_tail_prev;
+ second_off = second_off_prev;
+
+ page_cnt = page_pos = 1;
+
+ curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off)
+ : log->next_page;
+
+ wrapped_file =
+ curpage_off == log->first_page &&
+ !(log->l_flags & (NTFSLOG_NO_LAST_LSN | NTFSLOG_REUSE_TAIL));
+
+ expected_seq = wrapped_file ? (log->seq_num + 1) : log->seq_num;
+
+ nextpage_off = curpage_off;
+
+next_page:
+ tail_page = NULL;
+ /* Read the next log page. */
+ err = read_log_page(log, curpage_off, &page, &usa_error);
+
+ /* Compute the next log page offset the file. */
+ nextpage_off = next_page_off(log, curpage_off);
+ wrapped = nextpage_off == log->first_page;
+
+ if (tails > 1) {
+ struct RECORD_PAGE_HDR *cur_page =
+ Add2Ptr(page_bufs, curpage_off - page_off);
+
+ if (curpage_off == saved_off) {
+ tail_page = cur_page;
+ goto use_tail_page;
+ }
+
+ if (page_off > curpage_off || curpage_off >= saved_off)
+ goto use_tail_page;
+
+ if (page_off1)
+ goto use_cur_page;
+
+ if (!err && !usa_error &&
+ page->rhdr.sign == NTFS_RCRD_SIGNATURE &&
+ cur_page->rhdr.lsn == page->rhdr.lsn &&
+ cur_page->record_hdr.next_record_off ==
+ page->record_hdr.next_record_off &&
+ ((page_pos == page_cnt &&
+ le16_to_cpu(page->page_pos) == 1) ||
+ (page_pos != page_cnt &&
+ le16_to_cpu(page->page_pos) == page_pos + 1 &&
+ le16_to_cpu(page->page_count) == page_cnt))) {
+ cur_page = NULL;
+ goto use_tail_page;
+ }
+
+ page_off1 = page_off;
+
+use_cur_page:
+
+ lsn_cur = le64_to_cpu(cur_page->rhdr.lsn);
+
+ if (last_ok_lsn !=
+ le64_to_cpu(cur_page->record_hdr.last_end_lsn) &&
+ ((lsn_cur >> log->file_data_bits) +
+ ((curpage_off <
+ (lsn_to_vbo(log, lsn_cur) & ~log->page_mask))
+ ? 1
+ : 0)) != expected_seq) {
+ goto check_tail;
+ }
+
+ if (!is_log_record_end(cur_page)) {
+ tail_page = NULL;
+ last_ok_lsn = lsn_cur;
+ goto next_page_1;
+ }
+
+ log->seq_num = expected_seq;
+ log->l_flags &= ~NTFSLOG_NO_LAST_LSN;
+ log->last_lsn = le64_to_cpu(cur_page->record_hdr.last_end_lsn);
+ log->ra->current_lsn = cur_page->record_hdr.last_end_lsn;
+
+ if (log->record_header_len <=
+ log->page_size -
+ le16_to_cpu(cur_page->record_hdr.next_record_off)) {
+ log->l_flags |= NTFSLOG_REUSE_TAIL;
+ log->next_page = curpage_off;
+ } else {
+ log->l_flags &= ~NTFSLOG_REUSE_TAIL;
+ log->next_page = nextpage_off;
+ }
+
+ if (wrapped_file)
+ log->l_flags |= NTFSLOG_WRAPPED;
+
+ last_ok_lsn = le64_to_cpu(cur_page->record_hdr.last_end_lsn);
+ goto next_page_1;
+ }
+
+ /*
+ * If we are at the expected first page of a transfer check to see
+ * if either tail copy is at this offset.
+ * If this page is the last page of a transfer, check if we wrote
+ * a subsequent tail copy.
+ */
+ if (page_cnt == page_pos || page_cnt == page_pos + 1) {
+ /*
+ * Check if the offset matches either the first or second
+ * tail copy. It is possible it will match both.
+ */
+ if (curpage_off == final_off)
+ tail_page = first_tail;
+
+ /*
+ * If we already matched on the first page then
+ * check the ending lsn's.
+ */
+ if (curpage_off == second_off) {
+ if (!tail_page ||
+ (second_tail &&
+ le64_to_cpu(second_tail->record_hdr.last_end_lsn) >
+ le64_to_cpu(first_tail->record_hdr
+ .last_end_lsn))) {
+ tail_page = second_tail;
+ }
+ }
+ }
+
+use_tail_page:
+ if (tail_page) {
+ /* We have a candidate for a tail copy. */
+ lsn_cur = le64_to_cpu(tail_page->record_hdr.last_end_lsn);
+
+ if (last_ok_lsn < lsn_cur) {
+ /*
+ * If the sequence number is not expected,
+ * then don't use the tail copy.
+ */
+ if (expected_seq != (lsn_cur >> log->file_data_bits))
+ tail_page = NULL;
+ } else if (last_ok_lsn > lsn_cur) {
+ /*
+ * If the last lsn is greater than the one on
+ * this page then forget this tail.
+ */
+ tail_page = NULL;
+ }
+ }
+
+ /*
+ *If we have an error on the current page,
+ * we will break of this loop.
+ */
+ if (err || usa_error)
+ goto check_tail;
+
+ /*
+ * Done if the last lsn on this page doesn't match the previous known
+ * last lsn or the sequence number is not expected.
+ */
+ lsn_cur = le64_to_cpu(page->rhdr.lsn);
+ if (last_ok_lsn != lsn_cur &&
+ expected_seq != (lsn_cur >> log->file_data_bits)) {
+ goto check_tail;
+ }
+
+ /*
+ * Check that the page position and page count values are correct.
+ * If this is the first page of a transfer the position must be 1
+ * and the count will be unknown.
+ */
+ if (page_cnt == page_pos) {
+ if (page->page_pos != cpu_to_le16(1) &&
+ (!reuse_page || page->page_pos != page->page_count)) {
+ /*
+ * If the current page is the first page we are
+ * looking at and we are reusing this page then
+ * it can be either the first or last page of a
+ * transfer. Otherwise it can only be the first.
+ */
+ goto check_tail;
+ }
+ } else if (le16_to_cpu(page->page_count) != page_cnt ||
+ le16_to_cpu(page->page_pos) != page_pos + 1) {
+ /*
+ * The page position better be 1 more than the last page
+ * position and the page count better match.
+ */
+ goto check_tail;
+ }
+
+ /*
+ * We have a valid page the file and may have a valid page
+ * the tail copy area.
+ * If the tail page was written after the page the file then
+ * break of the loop.
+ */
+ if (tail_page &&
+ le64_to_cpu(tail_page->record_hdr.last_end_lsn) > lsn_cur) {
+ /* Remember if we will replace the page. */
+ replace_page = true;
+ goto check_tail;
+ }
+
+ tail_page = NULL;
+
+ if (is_log_record_end(page)) {
+ /*
+ * Since we have read this page we know the sequence number
+ * is the same as our expected value.
+ */
+ log->seq_num = expected_seq;
+ log->last_lsn = le64_to_cpu(page->record_hdr.last_end_lsn);
+ log->ra->current_lsn = page->record_hdr.last_end_lsn;
+ log->l_flags &= ~NTFSLOG_NO_LAST_LSN;
+
+ /*
+ * If there is room on this page for another header then
+ * remember we want to reuse the page.
+ */
+ if (log->record_header_len <=
+ log->page_size -
+ le16_to_cpu(page->record_hdr.next_record_off)) {
+ log->l_flags |= NTFSLOG_REUSE_TAIL;
+ log->next_page = curpage_off;
+ } else {
+ log->l_flags &= ~NTFSLOG_REUSE_TAIL;
+ log->next_page = nextpage_off;
+ }
+
+ /* Remember if we wrapped the log file. */
+ if (wrapped_file)
+ log->l_flags |= NTFSLOG_WRAPPED;
+ }
+
+ /*
+ * Remember the last page count and position.
+ * Also remember the last known lsn.
+ */
+ page_cnt = le16_to_cpu(page->page_count);
+ page_pos = le16_to_cpu(page->page_pos);
+ last_ok_lsn = le64_to_cpu(page->rhdr.lsn);
+
+next_page_1:
+
+ if (wrapped) {
+ expected_seq += 1;
+ wrapped_file = 1;
+ }
+
+ curpage_off = nextpage_off;
+ kfree(page);
+ page = NULL;
+ reuse_page = 0;
+ goto next_page;
+
+check_tail:
+ if (tail_page) {
+ log->seq_num = expected_seq;
+ log->last_lsn = le64_to_cpu(tail_page->record_hdr.last_end_lsn);
+ log->ra->current_lsn = tail_page->record_hdr.last_end_lsn;
+ log->l_flags &= ~NTFSLOG_NO_LAST_LSN;
+
+ if (log->page_size -
+ le16_to_cpu(
+ tail_page->record_hdr.next_record_off) >=
+ log->record_header_len) {
+ log->l_flags |= NTFSLOG_REUSE_TAIL;
+ log->next_page = curpage_off;
+ } else {
+ log->l_flags &= ~NTFSLOG_REUSE_TAIL;
+ log->next_page = nextpage_off;
+ }
+
+ if (wrapped)
+ log->l_flags |= NTFSLOG_WRAPPED;
+ }
+
+ /* Remember that the partial IO will start at the next page. */
+ second_off = nextpage_off;
+
+ /*
+ * If the next page is the first page of the file then update
+ * the sequence number for log records which begon the next page.
+ */
+ if (wrapped)
+ expected_seq += 1;
+
+ /*
+ * If we have a tail copy or are performing single page I/O we can
+ * immediately look at the next page.
+ */
+ if (replace_page || (log->ra->flags & RESTART_SINGLE_PAGE_IO)) {
+ page_cnt = 2;
+ page_pos = 1;
+ goto check_valid;
+ }
+
+ if (page_pos != page_cnt)
+ goto check_valid;
+ /*
+ * If the next page causes us to wrap to the beginning of the log
+ * file then we know which page to check next.
+ */
+ if (wrapped) {
+ page_cnt = 2;
+ page_pos = 1;
+ goto check_valid;
+ }
+
+ cur_pos = 2;
+
+next_test_page:
+ kfree(tst_page);
+ tst_page = NULL;
+
+ /* Walk through the file, reading log pages. */
+ err = read_log_page(log, nextpage_off, &tst_page, &usa_error);
+
+ /*
+ * If we get a USA error then assume that we correctly found
+ * the end of the original transfer.
+ */
+ if (usa_error)
+ goto file_is_valid;
+
+ /*
+ * If we were able to read the page, we examine it to see if it
+ * is the same or different Io block.
+ */
+ if (err)
+ goto next_test_page_1;
+
+ if (le16_to_cpu(tst_page->page_pos) == cur_pos &&
+ check_subseq_log_page(log, tst_page, nextpage_off, expected_seq)) {
+ page_cnt = le16_to_cpu(tst_page->page_count) + 1;
+ page_pos = le16_to_cpu(tst_page->page_pos);
+ goto check_valid;
+ } else {
+ goto file_is_valid;
+ }
+
+next_test_page_1:
+
+ nextpage_off = next_page_off(log, curpage_off);
+ wrapped = nextpage_off == log->first_page;
+
+ if (wrapped) {
+ expected_seq += 1;
+ page_cnt = 2;
+ page_pos = 1;
+ }
+
+ cur_pos += 1;
+ part_io_count += 1;
+ if (!wrapped)
+ goto next_test_page;
+
+check_valid:
+ /* Skip over the remaining pages this transfer. */
+ remain_pages = page_cnt - page_pos - 1;
+ part_io_count += remain_pages;
+
+ while (remain_pages--) {
+ nextpage_off = next_page_off(log, curpage_off);
+ wrapped = nextpage_off == log->first_page;
+
+ if (wrapped)
+ expected_seq += 1;
+ }
+
+ /* Call our routine to check this log page. */
+ kfree(tst_page);
+ tst_page = NULL;
+
+ err = read_log_page(log, nextpage_off, &tst_page, &usa_error);
+ if (!err && !usa_error &&
+ check_subseq_log_page(log, tst_page, nextpage_off, expected_seq)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+file_is_valid:
+
+ /* We have a valid file. */
+ if (page_off1 || tail_page) {
+ struct RECORD_PAGE_HDR *tmp_page;
+
+ if (sb_rdonly(log->ni->mi.sbi->sb)) {
+ err = -EROFS;
+ goto out;
+ }
+
+ if (page_off1) {
+ tmp_page = Add2Ptr(page_bufs, page_off1 - page_off);
+ tails -= (page_off1 - page_off) / log->page_size;
+ if (!tail_page)
+ tails -= 1;
+ } else {
+ tmp_page = tail_page;
+ tails = 1;
+ }
+
+ while (tails--) {
+ u64 off = hdr_file_off(log, tmp_page);
+
+ if (!page) {
+ page = kmalloc(log->page_size, GFP_NOFS);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /*
+ * Correct page and copy the data from this page
+ * into it and flush it to disk.
+ */
+ memcpy(page, tmp_page, log->page_size);
+
+ /* Fill last flushed lsn value flush the page. */
+ if (log->major_ver < 2)
+ page->rhdr.lsn = page->record_hdr.last_end_lsn;
+ else
+ page->file_off = 0;
+
+ page->page_pos = page->page_count = cpu_to_le16(1);
+
+ ntfs_fix_pre_write(&page->rhdr, log->page_size);
+
+ err = ntfs_sb_write_run(log->ni->mi.sbi,
+ &log->ni->file.run, off, page,
+ log->page_size, 0);
+
+ if (err)
+ goto out;
+
+ if (part_io_count && second_off == off) {
+ second_off += log->page_size;
+ part_io_count -= 1;
+ }
+
+ tmp_page = Add2Ptr(tmp_page, log->page_size);
+ }
+ }
+
+ if (part_io_count) {
+ if (sb_rdonly(log->ni->mi.sbi->sb)) {
+ err = -EROFS;
+ goto out;
+ }
+ }
+
+out:
+ kfree(second_tail);
+ kfree(first_tail);
+ kfree(page);
+ kfree(tst_page);
+ kfree(page_bufs);
+
+ return err;
+}
+
+/*
+ * read_log_rec_buf - Copy a log record from the file to a buffer.
+ *
+ * The log record may span several log pages and may even wrap the file.
+ */
+static int read_log_rec_buf(struct ntfs_log *log,
+ const struct LFS_RECORD_HDR *rh, void *buffer)
+{
+ int err;
+ struct RECORD_PAGE_HDR *ph = NULL;
+ u64 lsn = le64_to_cpu(rh->this_lsn);
+ u32 vbo = lsn_to_vbo(log, lsn) & ~log->page_mask;
+ u32 off = lsn_to_page_off(log, lsn) + log->record_header_len;
+ u32 data_len = le32_to_cpu(rh->client_data_len);
+
+ /*
+ * While there are more bytes to transfer,
+ * we continue to attempt to perform the read.
+ */
+ for (;;) {
+ bool usa_error;
+ u32 tail = log->page_size - off;
+
+ if (tail >= data_len)
+ tail = data_len;
+
+ data_len -= tail;
+
+ err = read_log_page(log, vbo, &ph, &usa_error);
+ if (err)
+ goto out;
+
+ /*
+ * The last lsn on this page better be greater or equal
+ * to the lsn we are copying.
+ */
+ if (lsn > le64_to_cpu(ph->rhdr.lsn)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ memcpy(buffer, Add2Ptr(ph, off), tail);
+
+ /* If there are no more bytes to transfer, we exit the loop. */
+ if (!data_len) {
+ if (!is_log_record_end(ph) ||
+ lsn > le64_to_cpu(ph->record_hdr.last_end_lsn)) {
+ err = -EINVAL;
+ goto out;
+ }
+ break;
+ }
+
+ if (ph->rhdr.lsn == ph->record_hdr.last_end_lsn ||
+ lsn > le64_to_cpu(ph->rhdr.lsn)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ vbo = next_page_off(log, vbo);
+ off = log->data_off;
+
+ /*
+ * Adjust our pointer the user's buffer to transfer
+ * the next block to.
+ */
+ buffer = Add2Ptr(buffer, tail);
+ }
+
+out:
+ kfree(ph);
+ return err;
+}
+
+static int read_rst_area(struct ntfs_log *log, struct NTFS_RESTART **rst_,
+ u64 *lsn)
+{
+ int err;
+ struct LFS_RECORD_HDR *rh = NULL;
+ const struct CLIENT_REC *cr =
+ Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off));
+ u64 lsnr, lsnc = le64_to_cpu(cr->restart_lsn);
+ u32 len;
+ struct NTFS_RESTART *rst;
+
+ *lsn = 0;
+ *rst_ = NULL;
+
+ /* If the client doesn't have a restart area, go ahead and exit now. */
+ if (!lsnc)
+ return 0;
+
+ err = read_log_page(log, lsn_to_vbo(log, lsnc),
+ (struct RECORD_PAGE_HDR **)&rh, NULL);
+ if (err)
+ return err;
+
+ rst = NULL;
+ lsnr = le64_to_cpu(rh->this_lsn);
+
+ if (lsnc != lsnr) {
+ /* If the lsn values don't match, then the disk is corrupt. */
+ err = -EINVAL;
+ goto out;
+ }
+
+ *lsn = lsnr;
+ len = le32_to_cpu(rh->client_data_len);
+
+ if (!len) {
+ err = 0;
+ goto out;
+ }
+
+ if (len < sizeof(struct NTFS_RESTART)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ rst = kmalloc(len, GFP_NOFS);
+ if (!rst) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Copy the data into the 'rst' buffer. */
+ err = read_log_rec_buf(log, rh, rst);
+ if (err)
+ goto out;
+
+ *rst_ = rst;
+ rst = NULL;
+
+out:
+ kfree(rh);
+ kfree(rst);
+
+ return err;
+}
+
+static int find_log_rec(struct ntfs_log *log, u64 lsn, struct lcb *lcb)
+{
+ int err;
+ struct LFS_RECORD_HDR *rh = lcb->lrh;
+ u32 rec_len, len;
+
+ /* Read the record header for this lsn. */
+ if (!rh) {
+ err = read_log_page(log, lsn_to_vbo(log, lsn),
+ (struct RECORD_PAGE_HDR **)&rh, NULL);
+
+ lcb->lrh = rh;
+ if (err)
+ return err;
+ }
+
+ /*
+ * If the lsn the log record doesn't match the desired
+ * lsn then the disk is corrupt.
+ */
+ if (lsn != le64_to_cpu(rh->this_lsn))
+ return -EINVAL;
+
+ len = le32_to_cpu(rh->client_data_len);
+
+ /*
+ * Check that the length field isn't greater than the total
+ * available space the log file.
+ */
+ rec_len = len + log->record_header_len;
+ if (rec_len >= log->total_avail)
+ return -EINVAL;
+
+ /*
+ * If the entire log record is on this log page,
+ * put a pointer to the log record the context block.
+ */
+ if (rh->flags & LOG_RECORD_MULTI_PAGE) {
+ void *lr = kmalloc(len, GFP_NOFS);
+
+ if (!lr)
+ return -ENOMEM;
+
+ lcb->log_rec = lr;
+ lcb->alloc = true;
+
+ /* Copy the data into the buffer returned. */
+ err = read_log_rec_buf(log, rh, lr);
+ if (err)
+ return err;
+ } else {
+ /* If beyond the end of the current page -> an error. */
+ u32 page_off = lsn_to_page_off(log, lsn);
+
+ if (page_off + len + log->record_header_len > log->page_size)
+ return -EINVAL;
+
+ lcb->log_rec = Add2Ptr(rh, sizeof(struct LFS_RECORD_HDR));
+ lcb->alloc = false;
+ }
+
+ return 0;
+}
+
+/*
+ * read_log_rec_lcb - Init the query operation.
+ */
+static int read_log_rec_lcb(struct ntfs_log *log, u64 lsn, u32 ctx_mode,
+ struct lcb **lcb_)
+{
+ int err;
+ const struct CLIENT_REC *cr;
+ struct lcb *lcb;
+
+ switch (ctx_mode) {
+ case lcb_ctx_undo_next:
+ case lcb_ctx_prev:
+ case lcb_ctx_next:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Check that the given lsn is the legal range for this client. */
+ cr = Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off));
+
+ if (!verify_client_lsn(log, cr, lsn))
+ return -EINVAL;
+
+ lcb = kzalloc(sizeof(struct lcb), GFP_NOFS);
+ if (!lcb)
+ return -ENOMEM;
+ lcb->client = log->client_id;
+ lcb->ctx_mode = ctx_mode;
+
+ /* Find the log record indicated by the given lsn. */
+ err = find_log_rec(log, lsn, lcb);
+ if (err)
+ goto out;
+
+ *lcb_ = lcb;
+ return 0;
+
+out:
+ lcb_put(lcb);
+ *lcb_ = NULL;
+ return err;
+}
+
+/*
+ * find_client_next_lsn
+ *
+ * Attempt to find the next lsn to return to a client based on the context mode.
+ */
+static int find_client_next_lsn(struct ntfs_log *log, struct lcb *lcb, u64 *lsn)
+{
+ int err;
+ u64 next_lsn;
+ struct LFS_RECORD_HDR *hdr;
+
+ hdr = lcb->lrh;
+ *lsn = 0;
+
+ if (lcb_ctx_next != lcb->ctx_mode)
+ goto check_undo_next;
+
+ /* Loop as long as another lsn can be found. */
+ for (;;) {
+ u64 current_lsn;
+
+ err = next_log_lsn(log, hdr, &current_lsn);
+ if (err)
+ goto out;
+
+ if (!current_lsn)
+ break;
+
+ if (hdr != lcb->lrh)
+ kfree(hdr);
+
+ hdr = NULL;
+ err = read_log_page(log, lsn_to_vbo(log, current_lsn),
+ (struct RECORD_PAGE_HDR **)&hdr, NULL);
+ if (err)
+ goto out;
+
+ if (memcmp(&hdr->client, &lcb->client,
+ sizeof(struct CLIENT_ID))) {
+ /*err = -EINVAL; */
+ } else if (LfsClientRecord == hdr->record_type) {
+ kfree(lcb->lrh);
+ lcb->lrh = hdr;
+ *lsn = current_lsn;
+ return 0;
+ }
+ }
+
+out:
+ if (hdr != lcb->lrh)
+ kfree(hdr);
+ return err;
+
+check_undo_next:
+ if (lcb_ctx_undo_next == lcb->ctx_mode)
+ next_lsn = le64_to_cpu(hdr->client_undo_next_lsn);
+ else if (lcb_ctx_prev == lcb->ctx_mode)
+ next_lsn = le64_to_cpu(hdr->client_prev_lsn);
+ else
+ return 0;
+
+ if (!next_lsn)
+ return 0;
+
+ if (!verify_client_lsn(
+ log, Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)),
+ next_lsn))
+ return 0;
+
+ hdr = NULL;
+ err = read_log_page(log, lsn_to_vbo(log, next_lsn),
+ (struct RECORD_PAGE_HDR **)&hdr, NULL);
+ if (err)
+ return err;
+ kfree(lcb->lrh);
+ lcb->lrh = hdr;
+
+ *lsn = next_lsn;
+
+ return 0;
+}
+
+static int read_next_log_rec(struct ntfs_log *log, struct lcb *lcb, u64 *lsn)
+{
+ int err;
+
+ err = find_client_next_lsn(log, lcb, lsn);
+ if (err)
+ return err;
+
+ if (!*lsn)
+ return 0;
+
+ if (lcb->alloc)
+ kfree(lcb->log_rec);
+
+ lcb->log_rec = NULL;
+ lcb->alloc = false;
+ kfree(lcb->lrh);
+ lcb->lrh = NULL;
+
+ return find_log_rec(log, *lsn, lcb);
+}
+
+bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes)
+{
+ __le16 mask;
+ u32 min_de, de_off, used, total;
+ const struct NTFS_DE *e;
+
+ if (hdr_has_subnode(hdr)) {
+ min_de = sizeof(struct NTFS_DE) + sizeof(u64);
+ mask = NTFS_IE_HAS_SUBNODES;
+ } else {
+ min_de = sizeof(struct NTFS_DE);
+ mask = 0;
+ }
+
+ de_off = le32_to_cpu(hdr->de_off);
+ used = le32_to_cpu(hdr->used);
+ total = le32_to_cpu(hdr->total);
+
+ if (de_off > bytes - min_de || used > bytes || total > bytes ||
+ de_off + min_de > used || used > total) {
+ return false;
+ }
+
+ e = Add2Ptr(hdr, de_off);
+ for (;;) {
+ u16 esize = le16_to_cpu(e->size);
+ struct NTFS_DE *next = Add2Ptr(e, esize);
+
+ if (esize < min_de || PtrOffset(hdr, next) > used ||
+ (e->flags & NTFS_IE_HAS_SUBNODES) != mask) {
+ return false;
+ }
+
+ if (de_is_last(e))
+ break;
+
+ e = next;
+ }
+
+ return true;
+}
+
+static inline bool check_index_buffer(const struct INDEX_BUFFER *ib, u32 bytes)
+{
+ u16 fo;
+ const struct NTFS_RECORD_HEADER *r = &ib->rhdr;
+
+ if (r->sign != NTFS_INDX_SIGNATURE)
+ return false;
+
+ fo = (SECTOR_SIZE - ((bytes >> SECTOR_SHIFT) + 1) * sizeof(short));
+
+ if (le16_to_cpu(r->fix_off) > fo)
+ return false;
+
+ if ((le16_to_cpu(r->fix_num) - 1) * SECTOR_SIZE != bytes)
+ return false;
+
+ return check_index_header(&ib->ihdr,
+ bytes - offsetof(struct INDEX_BUFFER, ihdr));
+}
+
+static inline bool check_index_root(const struct ATTRIB *attr,
+ struct ntfs_sb_info *sbi)
+{
+ bool ret;
+ const struct INDEX_ROOT *root = resident_data(attr);
+ u8 index_bits = le32_to_cpu(root->index_block_size) >= sbi->cluster_size
+ ? sbi->cluster_bits
+ : SECTOR_SHIFT;
+ u8 block_clst = root->index_block_clst;
+
+ if (le32_to_cpu(attr->res.data_size) < sizeof(struct INDEX_ROOT) ||
+ (root->type != ATTR_NAME && root->type != ATTR_ZERO) ||
+ (root->type == ATTR_NAME &&
+ root->rule != NTFS_COLLATION_TYPE_FILENAME) ||
+ (le32_to_cpu(root->index_block_size) !=
+ (block_clst << index_bits)) ||
+ (block_clst != 1 && block_clst != 2 && block_clst != 4 &&
+ block_clst != 8 && block_clst != 0x10 && block_clst != 0x20 &&
+ block_clst != 0x40 && block_clst != 0x80)) {
+ return false;
+ }
+
+ ret = check_index_header(&root->ihdr,
+ le32_to_cpu(attr->res.data_size) -
+ offsetof(struct INDEX_ROOT, ihdr));
+ return ret;
+}
+
+static inline bool check_attr(const struct MFT_REC *rec,
+ const struct ATTRIB *attr,
+ struct ntfs_sb_info *sbi)
+{
+ u32 asize = le32_to_cpu(attr->size);
+ u32 rsize = 0;
+ u64 dsize, svcn, evcn;
+ u16 run_off;
+
+ /* Check the fixed part of the attribute record header. */
+ if (asize >= sbi->record_size ||
+ asize + PtrOffset(rec, attr) >= sbi->record_size ||
+ (attr->name_len &&
+ le16_to_cpu(attr->name_off) + attr->name_len * sizeof(short) >
+ asize)) {
+ return false;
+ }
+
+ /* Check the attribute fields. */
+ switch (attr->non_res) {
+ case 0:
+ rsize = le32_to_cpu(attr->res.data_size);
+ if (rsize >= asize ||
+ le16_to_cpu(attr->res.data_off) + rsize > asize) {
+ return false;
+ }
+ break;
+
+ case 1:
+ dsize = le64_to_cpu(attr->nres.data_size);
+ svcn = le64_to_cpu(attr->nres.svcn);
+ evcn = le64_to_cpu(attr->nres.evcn);
+ run_off = le16_to_cpu(attr->nres.run_off);
+
+ if (svcn > evcn + 1 || run_off >= asize ||
+ le64_to_cpu(attr->nres.valid_size) > dsize ||
+ dsize > le64_to_cpu(attr->nres.alloc_size)) {
+ return false;
+ }
+
+ if (run_off > asize)
+ return false;
+
+ if (run_unpack(NULL, sbi, 0, svcn, evcn, svcn,
+ Add2Ptr(attr, run_off), asize - run_off) < 0) {
+ return false;
+ }
+
+ return true;
+
+ default:
+ return false;
+ }
+
+ switch (attr->type) {
+ case ATTR_NAME:
+ if (fname_full_size(Add2Ptr(
+ attr, le16_to_cpu(attr->res.data_off))) > asize) {
+ return false;
+ }
+ break;
+
+ case ATTR_ROOT:
+ return check_index_root(attr, sbi);
+
+ case ATTR_STD:
+ if (rsize < sizeof(struct ATTR_STD_INFO5) &&
+ rsize != sizeof(struct ATTR_STD_INFO)) {
+ return false;
+ }
+ break;
+
+ case ATTR_LIST:
+ case ATTR_ID:
+ case ATTR_SECURE:
+ case ATTR_LABEL:
+ case ATTR_VOL_INFO:
+ case ATTR_DATA:
+ case ATTR_ALLOC:
+ case ATTR_BITMAP:
+ case ATTR_REPARSE:
+ case ATTR_EA_INFO:
+ case ATTR_EA:
+ case ATTR_PROPERTYSET:
+ case ATTR_LOGGED_UTILITY_STREAM:
+ break;
+
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static inline bool check_file_record(const struct MFT_REC *rec,
+ const struct MFT_REC *rec2,
+ struct ntfs_sb_info *sbi)
+{
+ const struct ATTRIB *attr;
+ u16 fo = le16_to_cpu(rec->rhdr.fix_off);
+ u16 fn = le16_to_cpu(rec->rhdr.fix_num);
+ u16 ao = le16_to_cpu(rec->attr_off);
+ u32 rs = sbi->record_size;
+
+ /* Check the file record header for consistency. */
+ if (rec->rhdr.sign != NTFS_FILE_SIGNATURE ||
+ fo > (SECTOR_SIZE - ((rs >> SECTOR_SHIFT) + 1) * sizeof(short)) ||
+ (fn - 1) * SECTOR_SIZE != rs || ao < MFTRECORD_FIXUP_OFFSET_1 ||
+ ao > sbi->record_size - SIZEOF_RESIDENT || !is_rec_inuse(rec) ||
+ le32_to_cpu(rec->total) != rs) {
+ return false;
+ }
+
+ /* Loop to check all of the attributes. */
+ for (attr = Add2Ptr(rec, ao); attr->type != ATTR_END;
+ attr = Add2Ptr(attr, le32_to_cpu(attr->size))) {
+ if (check_attr(rec, attr, sbi))
+ continue;
+ return false;
+ }
+
+ return true;
+}
+
+static inline int check_lsn(const struct NTFS_RECORD_HEADER *hdr,
+ const u64 *rlsn)
+{
+ u64 lsn;
+
+ if (!rlsn)
+ return true;
+
+ lsn = le64_to_cpu(hdr->lsn);
+
+ if (hdr->sign == NTFS_HOLE_SIGNATURE)
+ return false;
+
+ if (*rlsn > lsn)
+ return true;
+
+ return false;
+}
+
+static inline bool check_if_attr(const struct MFT_REC *rec,
+ const struct LOG_REC_HDR *lrh)
+{
+ u16 ro = le16_to_cpu(lrh->record_off);
+ u16 o = le16_to_cpu(rec->attr_off);
+ const struct ATTRIB *attr = Add2Ptr(rec, o);
+
+ while (o < ro) {
+ u32 asize;
+
+ if (attr->type == ATTR_END)
+ break;
+
+ asize = le32_to_cpu(attr->size);
+ if (!asize)
+ break;
+
+ o += asize;
+ attr = Add2Ptr(attr, asize);
+ }
+
+ return o == ro;
+}
+
+static inline bool check_if_index_root(const struct MFT_REC *rec,
+ const struct LOG_REC_HDR *lrh)
+{
+ u16 ro = le16_to_cpu(lrh->record_off);
+ u16 o = le16_to_cpu(rec->attr_off);
+ const struct ATTRIB *attr = Add2Ptr(rec, o);
+
+ while (o < ro) {
+ u32 asize;
+
+ if (attr->type == ATTR_END)
+ break;
+
+ asize = le32_to_cpu(attr->size);
+ if (!asize)
+ break;
+
+ o += asize;
+ attr = Add2Ptr(attr, asize);
+ }
+
+ return o == ro && attr->type == ATTR_ROOT;
+}
+
+static inline bool check_if_root_index(const struct ATTRIB *attr,
+ const struct INDEX_HDR *hdr,
+ const struct LOG_REC_HDR *lrh)
+{
+ u16 ao = le16_to_cpu(lrh->attr_off);
+ u32 de_off = le32_to_cpu(hdr->de_off);
+ u32 o = PtrOffset(attr, hdr) + de_off;
+ const struct NTFS_DE *e = Add2Ptr(hdr, de_off);
+ u32 asize = le32_to_cpu(attr->size);
+
+ while (o < ao) {
+ u16 esize;
+
+ if (o >= asize)
+ break;
+
+ esize = le16_to_cpu(e->size);
+ if (!esize)
+ break;
+
+ o += esize;
+ e = Add2Ptr(e, esize);
+ }
+
+ return o == ao;
+}
+
+static inline bool check_if_alloc_index(const struct INDEX_HDR *hdr,
+ u32 attr_off)
+{
+ u32 de_off = le32_to_cpu(hdr->de_off);
+ u32 o = offsetof(struct INDEX_BUFFER, ihdr) + de_off;
+ const struct NTFS_DE *e = Add2Ptr(hdr, de_off);
+ u32 used = le32_to_cpu(hdr->used);
+
+ while (o < attr_off) {
+ u16 esize;
+
+ if (de_off >= used)
+ break;
+
+ esize = le16_to_cpu(e->size);
+ if (!esize)
+ break;
+
+ o += esize;
+ de_off += esize;
+ e = Add2Ptr(e, esize);
+ }
+
+ return o == attr_off;
+}
+
+static inline void change_attr_size(struct MFT_REC *rec, struct ATTRIB *attr,
+ u32 nsize)
+{
+ u32 asize = le32_to_cpu(attr->size);
+ int dsize = nsize - asize;
+ u8 *next = Add2Ptr(attr, asize);
+ u32 used = le32_to_cpu(rec->used);
+
+ memmove(Add2Ptr(attr, nsize), next, used - PtrOffset(rec, next));
+
+ rec->used = cpu_to_le32(used + dsize);
+ attr->size = cpu_to_le32(nsize);
+}
+
+struct OpenAttr {
+ struct ATTRIB *attr;
+ struct runs_tree *run1;
+ struct runs_tree run0;
+ struct ntfs_inode *ni;
+ // CLST rno;
+};
+
+/*
+ * cmp_type_and_name
+ *
+ * Return: 0 if 'attr' has the same type and name.
+ */
+static inline int cmp_type_and_name(const struct ATTRIB *a1,
+ const struct ATTRIB *a2)
+{
+ return a1->type != a2->type || a1->name_len != a2->name_len ||
+ (a1->name_len && memcmp(attr_name(a1), attr_name(a2),
+ a1->name_len * sizeof(short)));
+}
+
+static struct OpenAttr *find_loaded_attr(struct ntfs_log *log,
+ const struct ATTRIB *attr, CLST rno)
+{
+ struct OPEN_ATTR_ENRTY *oe = NULL;
+
+ while ((oe = enum_rstbl(log->open_attr_tbl, oe))) {
+ struct OpenAttr *op_attr;
+
+ if (ino_get(&oe->ref) != rno)
+ continue;
+
+ op_attr = (struct OpenAttr *)oe->ptr;
+ if (!cmp_type_and_name(op_attr->attr, attr))
+ return op_attr;
+ }
+ return NULL;
+}
+
+static struct ATTRIB *attr_create_nonres_log(struct ntfs_sb_info *sbi,
+ enum ATTR_TYPE type, u64 size,
+ const u16 *name, size_t name_len,
+ __le16 flags)
+{
+ struct ATTRIB *attr;
+ u32 name_size = ALIGN(name_len * sizeof(short), 8);
+ bool is_ext = flags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED);
+ u32 asize = name_size +
+ (is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT);
+
+ attr = kzalloc(asize, GFP_NOFS);
+ if (!attr)
+ return NULL;
+
+ attr->type = type;
+ attr->size = cpu_to_le32(asize);
+ attr->flags = flags;
+ attr->non_res = 1;
+ attr->name_len = name_len;
+
+ attr->nres.evcn = cpu_to_le64((u64)bytes_to_cluster(sbi, size) - 1);
+ attr->nres.alloc_size = cpu_to_le64(ntfs_up_cluster(sbi, size));
+ attr->nres.data_size = cpu_to_le64(size);
+ attr->nres.valid_size = attr->nres.data_size;
+ if (is_ext) {
+ attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
+ if (is_attr_compressed(attr))
+ attr->nres.c_unit = COMPRESSION_UNIT;
+
+ attr->nres.run_off =
+ cpu_to_le16(SIZEOF_NONRESIDENT_EX + name_size);
+ memcpy(Add2Ptr(attr, SIZEOF_NONRESIDENT_EX), name,
+ name_len * sizeof(short));
+ } else {
+ attr->name_off = SIZEOF_NONRESIDENT_LE;
+ attr->nres.run_off =
+ cpu_to_le16(SIZEOF_NONRESIDENT + name_size);
+ memcpy(Add2Ptr(attr, SIZEOF_NONRESIDENT), name,
+ name_len * sizeof(short));
+ }
+
+ return attr;
+}
+
+/*
+ * do_action - Common routine for the Redo and Undo Passes.
+ * @rlsn: If it is NULL then undo.
+ */
+static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe,
+ const struct LOG_REC_HDR *lrh, u32 op, void *data,
+ u32 dlen, u32 rec_len, const u64 *rlsn)
+{
+ int err = 0;
+ struct ntfs_sb_info *sbi = log->ni->mi.sbi;
+ struct inode *inode = NULL, *inode_parent;
+ struct mft_inode *mi = NULL, *mi2_child = NULL;
+ CLST rno = 0, rno_base = 0;
+ struct INDEX_BUFFER *ib = NULL;
+ struct MFT_REC *rec = NULL;
+ struct ATTRIB *attr = NULL, *attr2;
+ struct INDEX_HDR *hdr;
+ struct INDEX_ROOT *root;
+ struct NTFS_DE *e, *e1, *e2;
+ struct NEW_ATTRIBUTE_SIZES *new_sz;
+ struct ATTR_FILE_NAME *fname;
+ struct OpenAttr *oa, *oa2;
+ u32 nsize, t32, asize, used, esize, bmp_off, bmp_bits;
+ u16 id, id2;
+ u32 record_size = sbi->record_size;
+ u64 t64;
+ u16 roff = le16_to_cpu(lrh->record_off);
+ u16 aoff = le16_to_cpu(lrh->attr_off);
+ u64 lco = 0;
+ u64 cbo = (u64)le16_to_cpu(lrh->cluster_off) << SECTOR_SHIFT;
+ u64 tvo = le64_to_cpu(lrh->target_vcn) << sbi->cluster_bits;
+ u64 vbo = cbo + tvo;
+ void *buffer_le = NULL;
+ u32 bytes = 0;
+ bool a_dirty = false;
+ u16 data_off;
+
+ oa = oe->ptr;
+
+ /* Big switch to prepare. */
+ switch (op) {
+ /* ============================================================
+ * Process MFT records, as described by the current log record.
+ * ============================================================
+ */
+ case InitializeFileRecordSegment:
+ case DeallocateFileRecordSegment:
+ case WriteEndOfFileRecordSegment:
+ case CreateAttribute:
+ case DeleteAttribute:
+ case UpdateResidentValue:
+ case UpdateMappingPairs:
+ case SetNewAttributeSizes:
+ case AddIndexEntryRoot:
+ case DeleteIndexEntryRoot:
+ case SetIndexEntryVcnRoot:
+ case UpdateFileNameRoot:
+ case UpdateRecordDataRoot:
+ case ZeroEndOfFileRecord:
+ rno = vbo >> sbi->record_bits;
+ inode = ilookup(sbi->sb, rno);
+ if (inode) {
+ mi = &ntfs_i(inode)->mi;
+ } else if (op == InitializeFileRecordSegment) {
+ mi = kzalloc(sizeof(struct mft_inode), GFP_NOFS);
+ if (!mi)
+ return -ENOMEM;
+ err = mi_format_new(mi, sbi, rno, 0, false);
+ if (err)
+ goto out;
+ } else {
+ /* Read from disk. */
+ err = mi_get(sbi, rno, &mi);
+ if (err)
+ return err;
+ }
+ rec = mi->mrec;
+
+ if (op == DeallocateFileRecordSegment)
+ goto skip_load_parent;
+
+ if (InitializeFileRecordSegment != op) {
+ if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE)
+ goto dirty_vol;
+ if (!check_lsn(&rec->rhdr, rlsn))
+ goto out;
+ if (!check_file_record(rec, NULL, sbi))
+ goto dirty_vol;
+ attr = Add2Ptr(rec, roff);
+ }
+
+ if (is_rec_base(rec) || InitializeFileRecordSegment == op) {
+ rno_base = rno;
+ goto skip_load_parent;
+ }
+
+ rno_base = ino_get(&rec->parent_ref);
+ inode_parent = ntfs_iget5(sbi->sb, &rec->parent_ref, NULL);
+ if (IS_ERR(inode_parent))
+ goto skip_load_parent;
+
+ if (is_bad_inode(inode_parent)) {
+ iput(inode_parent);
+ goto skip_load_parent;
+ }
+
+ if (ni_load_mi_ex(ntfs_i(inode_parent), rno, &mi2_child)) {
+ iput(inode_parent);
+ } else {
+ if (mi2_child->mrec != mi->mrec)
+ memcpy(mi2_child->mrec, mi->mrec,
+ sbi->record_size);
+
+ if (inode)
+ iput(inode);
+ else if (mi)
+ mi_put(mi);
+
+ inode = inode_parent;
+ mi = mi2_child;
+ rec = mi2_child->mrec;
+ attr = Add2Ptr(rec, roff);
+ }
+
+skip_load_parent:
+ inode_parent = NULL;
+ break;
+
+ /*
+ * Process attributes, as described by the current log record.
+ */
+ case UpdateNonresidentValue:
+ case AddIndexEntryAllocation:
+ case DeleteIndexEntryAllocation:
+ case WriteEndOfIndexBuffer:
+ case SetIndexEntryVcnAllocation:
+ case UpdateFileNameAllocation:
+ case SetBitsInNonresidentBitMap:
+ case ClearBitsInNonresidentBitMap:
+ case UpdateRecordDataAllocation:
+ attr = oa->attr;
+ bytes = UpdateNonresidentValue == op ? dlen : 0;
+ lco = (u64)le16_to_cpu(lrh->lcns_follow) << sbi->cluster_bits;
+
+ if (attr->type == ATTR_ALLOC) {
+ t32 = le32_to_cpu(oe->bytes_per_index);
+ if (bytes < t32)
+ bytes = t32;
+ }
+
+ if (!bytes)
+ bytes = lco - cbo;
+
+ bytes += roff;
+ if (attr->type == ATTR_ALLOC)
+ bytes = (bytes + 511) & ~511; // align
+
+ buffer_le = kmalloc(bytes, GFP_NOFS);
+ if (!buffer_le)
+ return -ENOMEM;
+
+ err = ntfs_read_run_nb(sbi, oa->run1, vbo, buffer_le, bytes,
+ NULL);
+ if (err)
+ goto out;
+
+ if (attr->type == ATTR_ALLOC && *(int *)buffer_le)
+ ntfs_fix_post_read(buffer_le, bytes, false);
+ break;
+
+ default:
+ WARN_ON(1);
+ }
+
+ /* Big switch to do operation. */
+ switch (op) {
+ case InitializeFileRecordSegment:
+ if (roff + dlen > record_size)
+ goto dirty_vol;
+
+ memcpy(Add2Ptr(rec, roff), data, dlen);
+ mi->dirty = true;
+ break;
+
+ case DeallocateFileRecordSegment:
+ clear_rec_inuse(rec);
+ le16_add_cpu(&rec->seq, 1);
+ mi->dirty = true;
+ break;
+
+ case WriteEndOfFileRecordSegment:
+ attr2 = (struct ATTRIB *)data;
+ if (!check_if_attr(rec, lrh) || roff + dlen > record_size)
+ goto dirty_vol;
+
+ memmove(attr, attr2, dlen);
+ rec->used = cpu_to_le32(ALIGN(roff + dlen, 8));
+
+ mi->dirty = true;
+ break;
+
+ case CreateAttribute:
+ attr2 = (struct ATTRIB *)data;
+ asize = le32_to_cpu(attr2->size);
+ used = le32_to_cpu(rec->used);
+
+ if (!check_if_attr(rec, lrh) || dlen < SIZEOF_RESIDENT ||
+ !IS_ALIGNED(asize, 8) ||
+ Add2Ptr(attr2, asize) > Add2Ptr(lrh, rec_len) ||
+ dlen > record_size - used) {
+ goto dirty_vol;
+ }
+
+ memmove(Add2Ptr(attr, asize), attr, used - roff);
+ memcpy(attr, attr2, asize);
+
+ rec->used = cpu_to_le32(used + asize);
+ id = le16_to_cpu(rec->next_attr_id);
+ id2 = le16_to_cpu(attr2->id);
+ if (id <= id2)
+ rec->next_attr_id = cpu_to_le16(id2 + 1);
+ if (is_attr_indexed(attr))
+ le16_add_cpu(&rec->hard_links, 1);
+
+ oa2 = find_loaded_attr(log, attr, rno_base);
+ if (oa2) {
+ void *p2 = kmemdup(attr, le32_to_cpu(attr->size),
+ GFP_NOFS);
+ if (p2) {
+ // run_close(oa2->run1);
+ kfree(oa2->attr);
+ oa2->attr = p2;
+ }
+ }
+
+ mi->dirty = true;
+ break;
+
+ case DeleteAttribute:
+ asize = le32_to_cpu(attr->size);
+ used = le32_to_cpu(rec->used);
+
+ if (!check_if_attr(rec, lrh))
+ goto dirty_vol;
+
+ rec->used = cpu_to_le32(used - asize);
+ if (is_attr_indexed(attr))
+ le16_add_cpu(&rec->hard_links, -1);
+
+ memmove(attr, Add2Ptr(attr, asize), used - asize - roff);
+
+ mi->dirty = true;
+ break;
+
+ case UpdateResidentValue:
+ nsize = aoff + dlen;
+
+ if (!check_if_attr(rec, lrh))
+ goto dirty_vol;
+
+ asize = le32_to_cpu(attr->size);
+ used = le32_to_cpu(rec->used);
+
+ if (lrh->redo_len == lrh->undo_len) {
+ if (nsize > asize)
+ goto dirty_vol;
+ goto move_data;
+ }
+
+ if (nsize > asize && nsize - asize > record_size - used)
+ goto dirty_vol;
+
+ nsize = ALIGN(nsize, 8);
+ data_off = le16_to_cpu(attr->res.data_off);
+
+ if (nsize < asize) {
+ memmove(Add2Ptr(attr, aoff), data, dlen);
+ data = NULL; // To skip below memmove().
+ }
+
+ memmove(Add2Ptr(attr, nsize), Add2Ptr(attr, asize),
+ used - le16_to_cpu(lrh->record_off) - asize);
+
+ rec->used = cpu_to_le32(used + nsize - asize);
+ attr->size = cpu_to_le32(nsize);
+ attr->res.data_size = cpu_to_le32(aoff + dlen - data_off);
+
+move_data:
+ if (data)
+ memmove(Add2Ptr(attr, aoff), data, dlen);
+
+ oa2 = find_loaded_attr(log, attr, rno_base);
+ if (oa2) {
+ void *p2 = kmemdup(attr, le32_to_cpu(attr->size),
+ GFP_NOFS);
+ if (p2) {
+ // run_close(&oa2->run0);
+ oa2->run1 = &oa2->run0;
+ kfree(oa2->attr);
+ oa2->attr = p2;
+ }
+ }
+
+ mi->dirty = true;
+ break;
+
+ case UpdateMappingPairs:
+ nsize = aoff + dlen;
+ asize = le32_to_cpu(attr->size);
+ used = le32_to_cpu(rec->used);
+
+ if (!check_if_attr(rec, lrh) || !attr->non_res ||
+ aoff < le16_to_cpu(attr->nres.run_off) || aoff > asize ||
+ (nsize > asize && nsize - asize > record_size - used)) {
+ goto dirty_vol;
+ }
+
+ nsize = ALIGN(nsize, 8);
+
+ memmove(Add2Ptr(attr, nsize), Add2Ptr(attr, asize),
+ used - le16_to_cpu(lrh->record_off) - asize);
+ rec->used = cpu_to_le32(used + nsize - asize);
+ attr->size = cpu_to_le32(nsize);
+ memmove(Add2Ptr(attr, aoff), data, dlen);
+
+ if (run_get_highest_vcn(le64_to_cpu(attr->nres.svcn),
+ attr_run(attr), &t64)) {
+ goto dirty_vol;
+ }
+
+ attr->nres.evcn = cpu_to_le64(t64);
+ oa2 = find_loaded_attr(log, attr, rno_base);
+ if (oa2 && oa2->attr->non_res)
+ oa2->attr->nres.evcn = attr->nres.evcn;
+
+ mi->dirty = true;
+ break;
+
+ case SetNewAttributeSizes:
+ new_sz = data;
+ if (!check_if_attr(rec, lrh) || !attr->non_res)
+ goto dirty_vol;
+
+ attr->nres.alloc_size = new_sz->alloc_size;
+ attr->nres.data_size = new_sz->data_size;
+ attr->nres.valid_size = new_sz->valid_size;
+
+ if (dlen >= sizeof(struct NEW_ATTRIBUTE_SIZES))
+ attr->nres.total_size = new_sz->total_size;
+
+ oa2 = find_loaded_attr(log, attr, rno_base);
+ if (oa2) {
+ void *p2 = kmemdup(attr, le32_to_cpu(attr->size),
+ GFP_NOFS);
+ if (p2) {
+ kfree(oa2->attr);
+ oa2->attr = p2;
+ }
+ }
+ mi->dirty = true;
+ break;
+
+ case AddIndexEntryRoot:
+ e = (struct NTFS_DE *)data;
+ esize = le16_to_cpu(e->size);
+ root = resident_data(attr);
+ hdr = &root->ihdr;
+ used = le32_to_cpu(hdr->used);
+
+ if (!check_if_index_root(rec, lrh) ||
+ !check_if_root_index(attr, hdr, lrh) ||
+ Add2Ptr(data, esize) > Add2Ptr(lrh, rec_len) ||
+ esize > le32_to_cpu(rec->total) - le32_to_cpu(rec->used)) {
+ goto dirty_vol;
+ }
+
+ e1 = Add2Ptr(attr, le16_to_cpu(lrh->attr_off));
+
+ change_attr_size(rec, attr, le32_to_cpu(attr->size) + esize);
+
+ memmove(Add2Ptr(e1, esize), e1,
+ PtrOffset(e1, Add2Ptr(hdr, used)));
+ memmove(e1, e, esize);
+
+ le32_add_cpu(&attr->res.data_size, esize);
+ hdr->used = cpu_to_le32(used + esize);
+ le32_add_cpu(&hdr->total, esize);
+
+ mi->dirty = true;
+ break;
+
+ case DeleteIndexEntryRoot:
+ root = resident_data(attr);
+ hdr = &root->ihdr;
+ used = le32_to_cpu(hdr->used);
+
+ if (!check_if_index_root(rec, lrh) ||
+ !check_if_root_index(attr, hdr, lrh)) {
+ goto dirty_vol;
+ }
+
+ e1 = Add2Ptr(attr, le16_to_cpu(lrh->attr_off));
+ esize = le16_to_cpu(e1->size);
+ e2 = Add2Ptr(e1, esize);
+
+ memmove(e1, e2, PtrOffset(e2, Add2Ptr(hdr, used)));
+
+ le32_sub_cpu(&attr->res.data_size, esize);
+ hdr->used = cpu_to_le32(used - esize);
+ le32_sub_cpu(&hdr->total, esize);
+
+ change_attr_size(rec, attr, le32_to_cpu(attr->size) - esize);
+
+ mi->dirty = true;
+ break;
+
+ case SetIndexEntryVcnRoot:
+ root = resident_data(attr);
+ hdr = &root->ihdr;
+
+ if (!check_if_index_root(rec, lrh) ||
+ !check_if_root_index(attr, hdr, lrh)) {
+ goto dirty_vol;
+ }
+
+ e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off));
+
+ de_set_vbn_le(e, *(__le64 *)data);
+ mi->dirty = true;
+ break;
+
+ case UpdateFileNameRoot:
+ root = resident_data(attr);
+ hdr = &root->ihdr;
+
+ if (!check_if_index_root(rec, lrh) ||
+ !check_if_root_index(attr, hdr, lrh)) {
+ goto dirty_vol;
+ }
+
+ e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off));
+ fname = (struct ATTR_FILE_NAME *)(e + 1);
+ memmove(&fname->dup, data, sizeof(fname->dup)); //
+ mi->dirty = true;
+ break;
+
+ case UpdateRecordDataRoot:
+ root = resident_data(attr);
+ hdr = &root->ihdr;
+
+ if (!check_if_index_root(rec, lrh) ||
+ !check_if_root_index(attr, hdr, lrh)) {
+ goto dirty_vol;
+ }
+
+ e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off));
+
+ memmove(Add2Ptr(e, le16_to_cpu(e->view.data_off)), data, dlen);
+
+ mi->dirty = true;
+ break;
+
+ case ZeroEndOfFileRecord:
+ if (roff + dlen > record_size)
+ goto dirty_vol;
+
+ memset(attr, 0, dlen);
+ mi->dirty = true;
+ break;
+
+ case UpdateNonresidentValue:
+ if (lco < cbo + roff + dlen)
+ goto dirty_vol;
+
+ memcpy(Add2Ptr(buffer_le, roff), data, dlen);
+
+ a_dirty = true;
+ if (attr->type == ATTR_ALLOC)
+ ntfs_fix_pre_write(buffer_le, bytes);
+ break;
+
+ case AddIndexEntryAllocation:
+ ib = Add2Ptr(buffer_le, roff);
+ hdr = &ib->ihdr;
+ e = data;
+ esize = le16_to_cpu(e->size);
+ e1 = Add2Ptr(ib, aoff);
+
+ if (is_baad(&ib->rhdr))
+ goto dirty_vol;
+ if (!check_lsn(&ib->rhdr, rlsn))
+ goto out;
+
+ used = le32_to_cpu(hdr->used);
+
+ if (!check_index_buffer(ib, bytes) ||
+ !check_if_alloc_index(hdr, aoff) ||
+ Add2Ptr(e, esize) > Add2Ptr(lrh, rec_len) ||
+ used + esize > le32_to_cpu(hdr->total)) {
+ goto dirty_vol;
+ }
+
+ memmove(Add2Ptr(e1, esize), e1,
+ PtrOffset(e1, Add2Ptr(hdr, used)));
+ memcpy(e1, e, esize);
+
+ hdr->used = cpu_to_le32(used + esize);
+
+ a_dirty = true;
+
+ ntfs_fix_pre_write(&ib->rhdr, bytes);
+ break;
+
+ case DeleteIndexEntryAllocation:
+ ib = Add2Ptr(buffer_le, roff);
+ hdr = &ib->ihdr;
+ e = Add2Ptr(ib, aoff);
+ esize = le16_to_cpu(e->size);
+
+ if (is_baad(&ib->rhdr))
+ goto dirty_vol;
+ if (!check_lsn(&ib->rhdr, rlsn))
+ goto out;
+
+ if (!check_index_buffer(ib, bytes) ||
+ !check_if_alloc_index(hdr, aoff)) {
+ goto dirty_vol;
+ }
+
+ e1 = Add2Ptr(e, esize);
+ nsize = esize;
+ used = le32_to_cpu(hdr->used);
+
+ memmove(e, e1, PtrOffset(e1, Add2Ptr(hdr, used)));
+
+ hdr->used = cpu_to_le32(used - nsize);
+
+ a_dirty = true;
+
+ ntfs_fix_pre_write(&ib->rhdr, bytes);
+ break;
+
+ case WriteEndOfIndexBuffer:
+ ib = Add2Ptr(buffer_le, roff);
+ hdr = &ib->ihdr;
+ e = Add2Ptr(ib, aoff);
+
+ if (is_baad(&ib->rhdr))
+ goto dirty_vol;
+ if (!check_lsn(&ib->rhdr, rlsn))
+ goto out;
+ if (!check_index_buffer(ib, bytes) ||
+ !check_if_alloc_index(hdr, aoff) ||
+ aoff + dlen > offsetof(struct INDEX_BUFFER, ihdr) +
+ le32_to_cpu(hdr->total)) {
+ goto dirty_vol;
+ }
+
+ hdr->used = cpu_to_le32(dlen + PtrOffset(hdr, e));
+ memmove(e, data, dlen);
+
+ a_dirty = true;
+ ntfs_fix_pre_write(&ib->rhdr, bytes);
+ break;
+
+ case SetIndexEntryVcnAllocation:
+ ib = Add2Ptr(buffer_le, roff);
+ hdr = &ib->ihdr;
+ e = Add2Ptr(ib, aoff);
+
+ if (is_baad(&ib->rhdr))
+ goto dirty_vol;
+
+ if (!check_lsn(&ib->rhdr, rlsn))
+ goto out;
+ if (!check_index_buffer(ib, bytes) ||
+ !check_if_alloc_index(hdr, aoff)) {
+ goto dirty_vol;
+ }
+
+ de_set_vbn_le(e, *(__le64 *)data);
+
+ a_dirty = true;
+ ntfs_fix_pre_write(&ib->rhdr, bytes);
+ break;
+
+ case UpdateFileNameAllocation:
+ ib = Add2Ptr(buffer_le, roff);
+ hdr = &ib->ihdr;
+ e = Add2Ptr(ib, aoff);
+
+ if (is_baad(&ib->rhdr))
+ goto dirty_vol;
+
+ if (!check_lsn(&ib->rhdr, rlsn))
+ goto out;
+ if (!check_index_buffer(ib, bytes) ||
+ !check_if_alloc_index(hdr, aoff)) {
+ goto dirty_vol;
+ }
+
+ fname = (struct ATTR_FILE_NAME *)(e + 1);
+ memmove(&fname->dup, data, sizeof(fname->dup));
+
+ a_dirty = true;
+ ntfs_fix_pre_write(&ib->rhdr, bytes);
+ break;
+
+ case SetBitsInNonresidentBitMap:
+ bmp_off =
+ le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
+ bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);
+
+ if (cbo + (bmp_off + 7) / 8 > lco ||
+ cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) {
+ goto dirty_vol;
+ }
+
+ __bitmap_set(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits);
+ a_dirty = true;
+ break;
+
+ case ClearBitsInNonresidentBitMap:
+ bmp_off =
+ le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off);
+ bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits);
+
+ if (cbo + (bmp_off + 7) / 8 > lco ||
+ cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) {
+ goto dirty_vol;
+ }
+
+ __bitmap_clear(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits);
+ a_dirty = true;
+ break;
+
+ case UpdateRecordDataAllocation:
+ ib = Add2Ptr(buffer_le, roff);
+ hdr = &ib->ihdr;
+ e = Add2Ptr(ib, aoff);
+
+ if (is_baad(&ib->rhdr))
+ goto dirty_vol;
+
+ if (!check_lsn(&ib->rhdr, rlsn))
+ goto out;
+ if (!check_index_buffer(ib, bytes) ||
+ !check_if_alloc_index(hdr, aoff)) {
+ goto dirty_vol;
+ }
+
+ memmove(Add2Ptr(e, le16_to_cpu(e->view.data_off)), data, dlen);
+
+ a_dirty = true;
+ ntfs_fix_pre_write(&ib->rhdr, bytes);
+ break;
+
+ default:
+ WARN_ON(1);
+ }
+
+ if (rlsn) {
+ __le64 t64 = cpu_to_le64(*rlsn);
+
+ if (rec)
+ rec->rhdr.lsn = t64;
+ if (ib)
+ ib->rhdr.lsn = t64;
+ }
+
+ if (mi && mi->dirty) {
+ err = mi_write(mi, 0);
+ if (err)
+ goto out;
+ }
+
+ if (a_dirty) {
+ attr = oa->attr;
+ err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0);
+ if (err)
+ goto out;
+ }
+
+out:
+
+ if (inode)
+ iput(inode);
+ else if (mi != mi2_child)
+ mi_put(mi);
+
+ kfree(buffer_le);
+
+ return err;
+
+dirty_vol:
+ log->set_dirty = true;
+ goto out;
+}
+
+/*
+ * log_replay - Replays log and empties it.
+ *
+ * This function is called during mount operation.
+ * It replays log and empties it.
+ * Initialized is set false if logfile contains '-1'.
+ */
+int log_replay(struct ntfs_inode *ni, bool *initialized)
+{
+ int err;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ntfs_log *log;
+
+ struct restart_info rst_info, rst_info2;
+ u64 rec_lsn, ra_lsn, checkpt_lsn = 0, rlsn = 0;
+ struct ATTR_NAME_ENTRY *attr_names = NULL;
+ struct ATTR_NAME_ENTRY *ane;
+ struct RESTART_TABLE *dptbl = NULL;
+ struct RESTART_TABLE *trtbl = NULL;
+ const struct RESTART_TABLE *rt;
+ struct RESTART_TABLE *oatbl = NULL;
+ struct inode *inode;
+ struct OpenAttr *oa;
+ struct ntfs_inode *ni_oe;
+ struct ATTRIB *attr = NULL;
+ u64 size, vcn, undo_next_lsn;
+ CLST rno, lcn, lcn0, len0, clen;
+ void *data;
+ struct NTFS_RESTART *rst = NULL;
+ struct lcb *lcb = NULL;
+ struct OPEN_ATTR_ENRTY *oe;
+ struct TRANSACTION_ENTRY *tr;
+ struct DIR_PAGE_ENTRY *dp;
+ u32 i, bytes_per_attr_entry;
+ u32 l_size = ni->vfs_inode.i_size;
+ u32 orig_file_size = l_size;
+ u32 page_size, vbo, tail, off, dlen;
+ u32 saved_len, rec_len, transact_id;
+ bool use_second_page;
+ struct RESTART_AREA *ra2, *ra = NULL;
+ struct CLIENT_REC *ca, *cr;
+ __le16 client;
+ struct RESTART_HDR *rh;
+ const struct LFS_RECORD_HDR *frh;
+ const struct LOG_REC_HDR *lrh;
+ bool is_mapped;
+ bool is_ro = sb_rdonly(sbi->sb);
+ u64 t64;
+ u16 t16;
+ u32 t32;
+
+ /* Get the size of page. NOTE: To replay we can use default page. */
+#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2
+ page_size = norm_file_page(PAGE_SIZE, &l_size, true);
+#else
+ page_size = norm_file_page(PAGE_SIZE, &l_size, false);
+#endif
+ if (!page_size)
+ return -EINVAL;
+
+ log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS);
+ if (!log)
+ return -ENOMEM;
+
+ memset(&rst_info, 0, sizeof(struct restart_info));
+
+ log->ni = ni;
+ log->l_size = l_size;
+ log->one_page_buf = kmalloc(page_size, GFP_NOFS);
+ if (!log->one_page_buf) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ log->page_size = page_size;
+ log->page_mask = page_size - 1;
+ log->page_bits = blksize_bits(page_size);
+
+ /* Look for a restart area on the disk. */
+ err = log_read_rst(log, l_size, true, &rst_info);
+ if (err)
+ goto out;
+
+ /* remember 'initialized' */
+ *initialized = rst_info.initialized;
+
+ if (!rst_info.restart) {
+ if (rst_info.initialized) {
+ /* No restart area but the file is not initialized. */
+ err = -EINVAL;
+ goto out;
+ }
+
+ log_init_pg_hdr(log, page_size, page_size, 1, 1);
+ log_create(log, l_size, 0, get_random_u32(), false, false);
+
+ log->ra = ra;
+
+ ra = log_create_ra(log);
+ if (!ra) {
+ err = -ENOMEM;
+ goto out;
+ }
+ log->ra = ra;
+ log->init_ra = true;
+
+ goto process_log;
+ }
+
+ /*
+ * If the restart offset above wasn't zero then we won't
+ * look for a second restart.
+ */
+ if (rst_info.vbo)
+ goto check_restart_area;
+
+ memset(&rst_info2, 0, sizeof(struct restart_info));
+ err = log_read_rst(log, l_size, false, &rst_info2);
+ if (err)
+ goto out;
+
+ /* Determine which restart area to use. */
+ if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn)
+ goto use_first_page;
+
+ use_second_page = true;
+
+ if (rst_info.chkdsk_was_run && page_size != rst_info.vbo) {
+ struct RECORD_PAGE_HDR *sp = NULL;
+ bool usa_error;
+
+ if (!read_log_page(log, page_size, &sp, &usa_error) &&
+ sp->rhdr.sign == NTFS_CHKD_SIGNATURE) {
+ use_second_page = false;
+ }
+ kfree(sp);
+ }
+
+ if (use_second_page) {
+ kfree(rst_info.r_page);
+ memcpy(&rst_info, &rst_info2, sizeof(struct restart_info));
+ rst_info2.r_page = NULL;
+ }
+
+use_first_page:
+ kfree(rst_info2.r_page);
+
+check_restart_area:
+ /*
+ * If the restart area is at offset 0, we want
+ * to write the second restart area first.
+ */
+ log->init_ra = !!rst_info.vbo;
+
+ /* If we have a valid page then grab a pointer to the restart area. */
+ ra2 = rst_info.valid_page
+ ? Add2Ptr(rst_info.r_page,
+ le16_to_cpu(rst_info.r_page->ra_off))
+ : NULL;
+
+ if (rst_info.chkdsk_was_run ||
+ (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) {
+ bool wrapped = false;
+ bool use_multi_page = false;
+ u32 open_log_count;
+
+ /* Do some checks based on whether we have a valid log page. */
+ if (!rst_info.valid_page) {
+ open_log_count = get_random_u32();
+ goto init_log_instance;
+ }
+ open_log_count = le32_to_cpu(ra2->open_log_count);
+
+ /*
+ * If the restart page size isn't changing then we want to
+ * check how much work we need to do.
+ */
+ if (page_size != le32_to_cpu(rst_info.r_page->sys_page_size))
+ goto init_log_instance;
+
+init_log_instance:
+ log_init_pg_hdr(log, page_size, page_size, 1, 1);
+
+ log_create(log, l_size, rst_info.last_lsn, open_log_count,
+ wrapped, use_multi_page);
+
+ ra = log_create_ra(log);
+ if (!ra) {
+ err = -ENOMEM;
+ goto out;
+ }
+ log->ra = ra;
+
+ /* Put the restart areas and initialize
+ * the log file as required.
+ */
+ goto process_log;
+ }
+
+ if (!ra2) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * If the log page or the system page sizes have changed, we can't
+ * use the log file. We must use the system page size instead of the
+ * default size if there is not a clean shutdown.
+ */
+ t32 = le32_to_cpu(rst_info.r_page->sys_page_size);
+ if (page_size != t32) {
+ l_size = orig_file_size;
+ page_size =
+ norm_file_page(t32, &l_size, t32 == DefaultLogPageSize);
+ }
+
+ if (page_size != t32 ||
+ page_size != le32_to_cpu(rst_info.r_page->page_size)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* If the file size has shrunk then we won't mount it. */
+ if (l_size < le64_to_cpu(ra2->l_size)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ log_init_pg_hdr(log, page_size, page_size,
+ le16_to_cpu(rst_info.r_page->major_ver),
+ le16_to_cpu(rst_info.r_page->minor_ver));
+
+ log->l_size = le64_to_cpu(ra2->l_size);
+ log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits);
+ log->file_data_bits = sizeof(u64) * 8 - log->seq_num_bits;
+ log->seq_num_mask = (8 << log->file_data_bits) - 1;
+ log->last_lsn = le64_to_cpu(ra2->current_lsn);
+ log->seq_num = log->last_lsn >> log->file_data_bits;
+ log->ra_off = le16_to_cpu(rst_info.r_page->ra_off);
+ log->restart_size = log->sys_page_size - log->ra_off;
+ log->record_header_len = le16_to_cpu(ra2->rec_hdr_len);
+ log->ra_size = le16_to_cpu(ra2->ra_len);
+ log->data_off = le16_to_cpu(ra2->data_off);
+ log->data_size = log->page_size - log->data_off;
+ log->reserved = log->data_size - log->record_header_len;
+
+ vbo = lsn_to_vbo(log, log->last_lsn);
+
+ if (vbo < log->first_page) {
+ /* This is a pseudo lsn. */
+ log->l_flags |= NTFSLOG_NO_LAST_LSN;
+ log->next_page = log->first_page;
+ goto find_oldest;
+ }
+
+ /* Find the end of this log record. */
+ off = final_log_off(log, log->last_lsn,
+ le32_to_cpu(ra2->last_lsn_data_len));
+
+ /* If we wrapped the file then increment the sequence number. */
+ if (off <= vbo) {
+ log->seq_num += 1;
+ log->l_flags |= NTFSLOG_WRAPPED;
+ }
+
+ /* Now compute the next log page to use. */
+ vbo &= ~log->sys_page_mask;
+ tail = log->page_size - (off & log->page_mask) - 1;
+
+ /*
+ *If we can fit another log record on the page,
+ * move back a page the log file.
+ */
+ if (tail >= log->record_header_len) {
+ log->l_flags |= NTFSLOG_REUSE_TAIL;
+ log->next_page = vbo;
+ } else {
+ log->next_page = next_page_off(log, vbo);
+ }
+
+find_oldest:
+ /*
+ * Find the oldest client lsn. Use the last
+ * flushed lsn as a starting point.
+ */
+ log->oldest_lsn = log->last_lsn;
+ oldest_client_lsn(Add2Ptr(ra2, le16_to_cpu(ra2->client_off)),
+ ra2->client_idx[1], &log->oldest_lsn);
+ log->oldest_lsn_off = lsn_to_vbo(log, log->oldest_lsn);
+
+ if (log->oldest_lsn_off < log->first_page)
+ log->l_flags |= NTFSLOG_NO_OLDEST_LSN;
+
+ if (!(ra2->flags & RESTART_SINGLE_PAGE_IO))
+ log->l_flags |= NTFSLOG_WRAPPED | NTFSLOG_MULTIPLE_PAGE_IO;
+
+ log->current_openlog_count = le32_to_cpu(ra2->open_log_count);
+ log->total_avail_pages = log->l_size - log->first_page;
+ log->total_avail = log->total_avail_pages >> log->page_bits;
+ log->max_current_avail = log->total_avail * log->reserved;
+ log->total_avail = log->total_avail * log->data_size;
+
+ log->current_avail = current_log_avail(log);
+
+ ra = kzalloc(log->restart_size, GFP_NOFS);
+ if (!ra) {
+ err = -ENOMEM;
+ goto out;
+ }
+ log->ra = ra;
+
+ t16 = le16_to_cpu(ra2->client_off);
+ if (t16 == offsetof(struct RESTART_AREA, clients)) {
+ memcpy(ra, ra2, log->ra_size);
+ } else {
+ memcpy(ra, ra2, offsetof(struct RESTART_AREA, clients));
+ memcpy(ra->clients, Add2Ptr(ra2, t16),
+ le16_to_cpu(ra2->ra_len) - t16);
+
+ log->current_openlog_count = get_random_u32();
+ ra->open_log_count = cpu_to_le32(log->current_openlog_count);
+ log->ra_size = offsetof(struct RESTART_AREA, clients) +
+ sizeof(struct CLIENT_REC);
+ ra->client_off =
+ cpu_to_le16(offsetof(struct RESTART_AREA, clients));
+ ra->ra_len = cpu_to_le16(log->ra_size);
+ }
+
+ le32_add_cpu(&ra->open_log_count, 1);
+
+ /* Now we need to walk through looking for the last lsn. */
+ err = last_log_lsn(log);
+ if (err)
+ goto out;
+
+ log->current_avail = current_log_avail(log);
+
+ /* Remember which restart area to write first. */
+ log->init_ra = rst_info.vbo;
+
+process_log:
+ /* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */
+ switch ((log->major_ver << 16) + log->minor_ver) {
+ case 0x10000:
+ case 0x10001:
+ case 0x20000:
+ break;
+ default:
+ ntfs_warn(sbi->sb, "\x24LogFile version %d.%d is not supported",
+ log->major_ver, log->minor_ver);
+ err = -EOPNOTSUPP;
+ log->set_dirty = true;
+ goto out;
+ }
+
+ /* One client "NTFS" per logfile. */
+ ca = Add2Ptr(ra, le16_to_cpu(ra->client_off));
+
+ for (client = ra->client_idx[1];; client = cr->next_client) {
+ if (client == LFS_NO_CLIENT_LE) {
+ /* Insert "NTFS" client LogFile. */
+ client = ra->client_idx[0];
+ if (client == LFS_NO_CLIENT_LE) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ t16 = le16_to_cpu(client);
+ cr = ca + t16;
+
+ remove_client(ca, cr, &ra->client_idx[0]);
+
+ cr->restart_lsn = 0;
+ cr->oldest_lsn = cpu_to_le64(log->oldest_lsn);
+ cr->name_bytes = cpu_to_le32(8);
+ cr->name[0] = cpu_to_le16('N');
+ cr->name[1] = cpu_to_le16('T');
+ cr->name[2] = cpu_to_le16('F');
+ cr->name[3] = cpu_to_le16('S');
+
+ add_client(ca, t16, &ra->client_idx[1]);
+ break;
+ }
+
+ cr = ca + le16_to_cpu(client);
+
+ if (cpu_to_le32(8) == cr->name_bytes &&
+ cpu_to_le16('N') == cr->name[0] &&
+ cpu_to_le16('T') == cr->name[1] &&
+ cpu_to_le16('F') == cr->name[2] &&
+ cpu_to_le16('S') == cr->name[3])
+ break;
+ }
+
+ /* Update the client handle with the client block information. */
+ log->client_id.seq_num = cr->seq_num;
+ log->client_id.client_idx = client;
+
+ err = read_rst_area(log, &rst, &ra_lsn);
+ if (err)
+ goto out;
+
+ if (!rst)
+ goto out;
+
+ bytes_per_attr_entry = !rst->major_ver ? 0x2C : 0x28;
+
+ checkpt_lsn = le64_to_cpu(rst->check_point_start);
+ if (!checkpt_lsn)
+ checkpt_lsn = ra_lsn;
+
+ /* Allocate and Read the Transaction Table. */
+ if (!rst->transact_table_len)
+ goto check_dirty_page_table;
+
+ t64 = le64_to_cpu(rst->transact_table_lsn);
+ err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb);
+ if (err)
+ goto out;
+
+ lrh = lcb->log_rec;
+ frh = lcb->lrh;
+ rec_len = le32_to_cpu(frh->client_data_len);
+
+ if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id),
+ bytes_per_attr_entry)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ t16 = le16_to_cpu(lrh->redo_off);
+
+ rt = Add2Ptr(lrh, t16);
+ t32 = rec_len - t16;
+
+ /* Now check that this is a valid restart table. */
+ if (!check_rstbl(rt, t32)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ trtbl = kmemdup(rt, t32, GFP_NOFS);
+ if (!trtbl) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ lcb_put(lcb);
+ lcb = NULL;
+
+check_dirty_page_table:
+ /* The next record back should be the Dirty Pages Table. */
+ if (!rst->dirty_pages_len)
+ goto check_attribute_names;
+
+ t64 = le64_to_cpu(rst->dirty_pages_table_lsn);
+ err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb);
+ if (err)
+ goto out;
+
+ lrh = lcb->log_rec;
+ frh = lcb->lrh;
+ rec_len = le32_to_cpu(frh->client_data_len);
+
+ if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id),
+ bytes_per_attr_entry)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ t16 = le16_to_cpu(lrh->redo_off);
+
+ rt = Add2Ptr(lrh, t16);
+ t32 = rec_len - t16;
+
+ /* Now check that this is a valid restart table. */
+ if (!check_rstbl(rt, t32)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ dptbl = kmemdup(rt, t32, GFP_NOFS);
+ if (!dptbl) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Convert Ra version '0' into version '1'. */
+ if (rst->major_ver)
+ goto end_conv_1;
+
+ dp = NULL;
+ while ((dp = enum_rstbl(dptbl, dp))) {
+ struct DIR_PAGE_ENTRY_32 *dp0 = (struct DIR_PAGE_ENTRY_32 *)dp;
+ // NOTE: Danger. Check for of boundary.
+ memmove(&dp->vcn, &dp0->vcn_low,
+ 2 * sizeof(u64) +
+ le32_to_cpu(dp->lcns_follow) * sizeof(u64));
+ }
+
+end_conv_1:
+ lcb_put(lcb);
+ lcb = NULL;
+
+ /*
+ * Go through the table and remove the duplicates,
+ * remembering the oldest lsn values.
+ */
+ if (sbi->cluster_size <= log->page_size)
+ goto trace_dp_table;
+
+ dp = NULL;
+ while ((dp = enum_rstbl(dptbl, dp))) {
+ struct DIR_PAGE_ENTRY *next = dp;
+
+ while ((next = enum_rstbl(dptbl, next))) {
+ if (next->target_attr == dp->target_attr &&
+ next->vcn == dp->vcn) {
+ if (le64_to_cpu(next->oldest_lsn) <
+ le64_to_cpu(dp->oldest_lsn)) {
+ dp->oldest_lsn = next->oldest_lsn;
+ }
+
+ free_rsttbl_idx(dptbl, PtrOffset(dptbl, next));
+ }
+ }
+ }
+trace_dp_table:
+check_attribute_names:
+ /* The next record should be the Attribute Names. */
+ if (!rst->attr_names_len)
+ goto check_attr_table;
+
+ t64 = le64_to_cpu(rst->attr_names_lsn);
+ err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb);
+ if (err)
+ goto out;
+
+ lrh = lcb->log_rec;
+ frh = lcb->lrh;
+ rec_len = le32_to_cpu(frh->client_data_len);
+
+ if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id),
+ bytes_per_attr_entry)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ t32 = lrh_length(lrh);
+ rec_len -= t32;
+
+ attr_names = kmemdup(Add2Ptr(lrh, t32), rec_len, GFP_NOFS);
+ if (!attr_names) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ lcb_put(lcb);
+ lcb = NULL;
+
+check_attr_table:
+ /* The next record should be the attribute Table. */
+ if (!rst->open_attr_len)
+ goto check_attribute_names2;
+
+ t64 = le64_to_cpu(rst->open_attr_table_lsn);
+ err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb);
+ if (err)
+ goto out;
+
+ lrh = lcb->log_rec;
+ frh = lcb->lrh;
+ rec_len = le32_to_cpu(frh->client_data_len);
+
+ if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id),
+ bytes_per_attr_entry)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ t16 = le16_to_cpu(lrh->redo_off);
+
+ rt = Add2Ptr(lrh, t16);
+ t32 = rec_len - t16;
+
+ if (!check_rstbl(rt, t32)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ oatbl = kmemdup(rt, t32, GFP_NOFS);
+ if (!oatbl) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ log->open_attr_tbl = oatbl;
+
+ /* Clear all of the Attr pointers. */
+ oe = NULL;
+ while ((oe = enum_rstbl(oatbl, oe))) {
+ if (!rst->major_ver) {
+ struct OPEN_ATTR_ENRTY_32 oe0;
+
+ /* Really 'oe' points to OPEN_ATTR_ENRTY_32. */
+ memcpy(&oe0, oe, SIZEOF_OPENATTRIBUTEENTRY0);
+
+ oe->bytes_per_index = oe0.bytes_per_index;
+ oe->type = oe0.type;
+ oe->is_dirty_pages = oe0.is_dirty_pages;
+ oe->name_len = 0;
+ oe->ref = oe0.ref;
+ oe->open_record_lsn = oe0.open_record_lsn;
+ }
+
+ oe->is_attr_name = 0;
+ oe->ptr = NULL;
+ }
+
+ lcb_put(lcb);
+ lcb = NULL;
+
+check_attribute_names2:
+ if (!rst->attr_names_len)
+ goto trace_attribute_table;
+
+ ane = attr_names;
+ if (!oatbl)
+ goto trace_attribute_table;
+ while (ane->off) {
+ /* TODO: Clear table on exit! */
+ oe = Add2Ptr(oatbl, le16_to_cpu(ane->off));
+ t16 = le16_to_cpu(ane->name_bytes);
+ oe->name_len = t16 / sizeof(short);
+ oe->ptr = ane->name;
+ oe->is_attr_name = 2;
+ ane = Add2Ptr(ane, sizeof(struct ATTR_NAME_ENTRY) + t16);
+ }
+
+trace_attribute_table:
+ /*
+ * If the checkpt_lsn is zero, then this is a freshly
+ * formatted disk and we have no work to do.
+ */
+ if (!checkpt_lsn) {
+ err = 0;
+ goto out;
+ }
+
+ if (!oatbl) {
+ oatbl = init_rsttbl(bytes_per_attr_entry, 8);
+ if (!oatbl) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ log->open_attr_tbl = oatbl;
+
+ /* Start the analysis pass from the Checkpoint lsn. */
+ rec_lsn = checkpt_lsn;
+
+ /* Read the first lsn. */
+ err = read_log_rec_lcb(log, checkpt_lsn, lcb_ctx_next, &lcb);
+ if (err)
+ goto out;
+
+ /* Loop to read all subsequent records to the end of the log file. */
+next_log_record_analyze:
+ err = read_next_log_rec(log, lcb, &rec_lsn);
+ if (err)
+ goto out;
+
+ if (!rec_lsn)
+ goto end_log_records_enumerate;
+
+ frh = lcb->lrh;
+ transact_id = le32_to_cpu(frh->transact_id);
+ rec_len = le32_to_cpu(frh->client_data_len);
+ lrh = lcb->log_rec;
+
+ if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * The first lsn after the previous lsn remembered
+ * the checkpoint is the first candidate for the rlsn.
+ */
+ if (!rlsn)
+ rlsn = rec_lsn;
+
+ if (LfsClientRecord != frh->record_type)
+ goto next_log_record_analyze;
+
+ /*
+ * Now update the Transaction Table for this transaction. If there
+ * is no entry present or it is unallocated we allocate the entry.
+ */
+ if (!trtbl) {
+ trtbl = init_rsttbl(sizeof(struct TRANSACTION_ENTRY),
+ INITIAL_NUMBER_TRANSACTIONS);
+ if (!trtbl) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ tr = Add2Ptr(trtbl, transact_id);
+
+ if (transact_id >= bytes_per_rt(trtbl) ||
+ tr->next != RESTART_ENTRY_ALLOCATED_LE) {
+ tr = alloc_rsttbl_from_idx(&trtbl, transact_id);
+ if (!tr) {
+ err = -ENOMEM;
+ goto out;
+ }
+ tr->transact_state = TransactionActive;
+ tr->first_lsn = cpu_to_le64(rec_lsn);
+ }
+
+ tr->prev_lsn = tr->undo_next_lsn = cpu_to_le64(rec_lsn);
+
+ /*
+ * If this is a compensation log record, then change
+ * the undo_next_lsn to be the undo_next_lsn of this record.
+ */
+ if (lrh->undo_op == cpu_to_le16(CompensationLogRecord))
+ tr->undo_next_lsn = frh->client_undo_next_lsn;
+
+ /* Dispatch to handle log record depending on type. */
+ switch (le16_to_cpu(lrh->redo_op)) {
+ case InitializeFileRecordSegment:
+ case DeallocateFileRecordSegment:
+ case WriteEndOfFileRecordSegment:
+ case CreateAttribute:
+ case DeleteAttribute:
+ case UpdateResidentValue:
+ case UpdateNonresidentValue:
+ case UpdateMappingPairs:
+ case SetNewAttributeSizes:
+ case AddIndexEntryRoot:
+ case DeleteIndexEntryRoot:
+ case AddIndexEntryAllocation:
+ case DeleteIndexEntryAllocation:
+ case WriteEndOfIndexBuffer:
+ case SetIndexEntryVcnRoot:
+ case SetIndexEntryVcnAllocation:
+ case UpdateFileNameRoot:
+ case UpdateFileNameAllocation:
+ case SetBitsInNonresidentBitMap:
+ case ClearBitsInNonresidentBitMap:
+ case UpdateRecordDataRoot:
+ case UpdateRecordDataAllocation:
+ case ZeroEndOfFileRecord:
+ t16 = le16_to_cpu(lrh->target_attr);
+ t64 = le64_to_cpu(lrh->target_vcn);
+ dp = find_dp(dptbl, t16, t64);
+
+ if (dp)
+ goto copy_lcns;
+
+ /*
+ * Calculate the number of clusters per page the system
+ * which wrote the checkpoint, possibly creating the table.
+ */
+ if (dptbl) {
+ t32 = (le16_to_cpu(dptbl->size) -
+ sizeof(struct DIR_PAGE_ENTRY)) /
+ sizeof(u64);
+ } else {
+ t32 = log->clst_per_page;
+ kfree(dptbl);
+ dptbl = init_rsttbl(struct_size(dp, page_lcns, t32),
+ 32);
+ if (!dptbl) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ dp = alloc_rsttbl_idx(&dptbl);
+ if (!dp) {
+ err = -ENOMEM;
+ goto out;
+ }
+ dp->target_attr = cpu_to_le32(t16);
+ dp->transfer_len = cpu_to_le32(t32 << sbi->cluster_bits);
+ dp->lcns_follow = cpu_to_le32(t32);
+ dp->vcn = cpu_to_le64(t64 & ~((u64)t32 - 1));
+ dp->oldest_lsn = cpu_to_le64(rec_lsn);
+
+copy_lcns:
+ /*
+ * Copy the Lcns from the log record into the Dirty Page Entry.
+ * TODO: For different page size support, must somehow make
+ * whole routine a loop, case Lcns do not fit below.
+ */
+ t16 = le16_to_cpu(lrh->lcns_follow);
+ for (i = 0; i < t16; i++) {
+ size_t j = (size_t)(le64_to_cpu(lrh->target_vcn) -
+ le64_to_cpu(dp->vcn));
+ dp->page_lcns[j + i] = lrh->page_lcns[i];
+ }
+
+ goto next_log_record_analyze;
+
+ case DeleteDirtyClusters: {
+ u32 range_count =
+ le16_to_cpu(lrh->redo_len) / sizeof(struct LCN_RANGE);
+ const struct LCN_RANGE *r =
+ Add2Ptr(lrh, le16_to_cpu(lrh->redo_off));
+
+ /* Loop through all of the Lcn ranges this log record. */
+ for (i = 0; i < range_count; i++, r++) {
+ u64 lcn0 = le64_to_cpu(r->lcn);
+ u64 lcn_e = lcn0 + le64_to_cpu(r->len) - 1;
+
+ dp = NULL;
+ while ((dp = enum_rstbl(dptbl, dp))) {
+ u32 j;
+
+ t32 = le32_to_cpu(dp->lcns_follow);
+ for (j = 0; j < t32; j++) {
+ t64 = le64_to_cpu(dp->page_lcns[j]);
+ if (t64 >= lcn0 && t64 <= lcn_e)
+ dp->page_lcns[j] = 0;
+ }
+ }
+ }
+ goto next_log_record_analyze;
+ ;
+ }
+
+ case OpenNonresidentAttribute:
+ t16 = le16_to_cpu(lrh->target_attr);
+ if (t16 >= bytes_per_rt(oatbl)) {
+ /*
+ * Compute how big the table needs to be.
+ * Add 10 extra entries for some cushion.
+ */
+ u32 new_e = t16 / le16_to_cpu(oatbl->size);
+
+ new_e += 10 - le16_to_cpu(oatbl->used);
+
+ oatbl = extend_rsttbl(oatbl, new_e, ~0u);
+ log->open_attr_tbl = oatbl;
+ if (!oatbl) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ /* Point to the entry being opened. */
+ oe = alloc_rsttbl_from_idx(&oatbl, t16);
+ log->open_attr_tbl = oatbl;
+ if (!oe) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Initialize this entry from the log record. */
+ t16 = le16_to_cpu(lrh->redo_off);
+ if (!rst->major_ver) {
+ /* Convert version '0' into version '1'. */
+ struct OPEN_ATTR_ENRTY_32 *oe0 = Add2Ptr(lrh, t16);
+
+ oe->bytes_per_index = oe0->bytes_per_index;
+ oe->type = oe0->type;
+ oe->is_dirty_pages = oe0->is_dirty_pages;
+ oe->name_len = 0; //oe0.name_len;
+ oe->ref = oe0->ref;
+ oe->open_record_lsn = oe0->open_record_lsn;
+ } else {
+ memcpy(oe, Add2Ptr(lrh, t16), bytes_per_attr_entry);
+ }
+
+ t16 = le16_to_cpu(lrh->undo_len);
+ if (t16) {
+ oe->ptr = kmalloc(t16, GFP_NOFS);
+ if (!oe->ptr) {
+ err = -ENOMEM;
+ goto out;
+ }
+ oe->name_len = t16 / sizeof(short);
+ memcpy(oe->ptr,
+ Add2Ptr(lrh, le16_to_cpu(lrh->undo_off)), t16);
+ oe->is_attr_name = 1;
+ } else {
+ oe->ptr = NULL;
+ oe->is_attr_name = 0;
+ }
+
+ goto next_log_record_analyze;
+
+ case HotFix:
+ t16 = le16_to_cpu(lrh->target_attr);
+ t64 = le64_to_cpu(lrh->target_vcn);
+ dp = find_dp(dptbl, t16, t64);
+ if (dp) {
+ size_t j = le64_to_cpu(lrh->target_vcn) -
+ le64_to_cpu(dp->vcn);
+ if (dp->page_lcns[j])
+ dp->page_lcns[j] = lrh->page_lcns[0];
+ }
+ goto next_log_record_analyze;
+
+ case EndTopLevelAction:
+ tr = Add2Ptr(trtbl, transact_id);
+ tr->prev_lsn = cpu_to_le64(rec_lsn);
+ tr->undo_next_lsn = frh->client_undo_next_lsn;
+ goto next_log_record_analyze;
+
+ case PrepareTransaction:
+ tr = Add2Ptr(trtbl, transact_id);
+ tr->transact_state = TransactionPrepared;
+ goto next_log_record_analyze;
+
+ case CommitTransaction:
+ tr = Add2Ptr(trtbl, transact_id);
+ tr->transact_state = TransactionCommitted;
+ goto next_log_record_analyze;
+
+ case ForgetTransaction:
+ free_rsttbl_idx(trtbl, transact_id);
+ goto next_log_record_analyze;
+
+ case Noop:
+ case OpenAttributeTableDump:
+ case AttributeNamesDump:
+ case DirtyPageTableDump:
+ case TransactionTableDump:
+ /* The following cases require no action the Analysis Pass. */
+ goto next_log_record_analyze;
+
+ default:
+ /*
+ * All codes will be explicitly handled.
+ * If we see a code we do not expect, then we are trouble.
+ */
+ goto next_log_record_analyze;
+ }
+
+end_log_records_enumerate:
+ lcb_put(lcb);
+ lcb = NULL;
+
+ /*
+ * Scan the Dirty Page Table and Transaction Table for
+ * the lowest lsn, and return it as the Redo lsn.
+ */
+ dp = NULL;
+ while ((dp = enum_rstbl(dptbl, dp))) {
+ t64 = le64_to_cpu(dp->oldest_lsn);
+ if (t64 && t64 < rlsn)
+ rlsn = t64;
+ }
+
+ tr = NULL;
+ while ((tr = enum_rstbl(trtbl, tr))) {
+ t64 = le64_to_cpu(tr->first_lsn);
+ if (t64 && t64 < rlsn)
+ rlsn = t64;
+ }
+
+ /*
+ * Only proceed if the Dirty Page Table or Transaction
+ * table are not empty.
+ */
+ if ((!dptbl || !dptbl->total) && (!trtbl || !trtbl->total))
+ goto end_reply;
+
+ sbi->flags |= NTFS_FLAGS_NEED_REPLAY;
+ if (is_ro)
+ goto out;
+
+ /* Reopen all of the attributes with dirty pages. */
+ oe = NULL;
+next_open_attribute:
+
+ oe = enum_rstbl(oatbl, oe);
+ if (!oe) {
+ err = 0;
+ dp = NULL;
+ goto next_dirty_page;
+ }
+
+ oa = kzalloc(sizeof(struct OpenAttr), GFP_NOFS);
+ if (!oa) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ inode = ntfs_iget5(sbi->sb, &oe->ref, NULL);
+ if (IS_ERR(inode))
+ goto fake_attr;
+
+ if (is_bad_inode(inode)) {
+ iput(inode);
+fake_attr:
+ if (oa->ni) {
+ iput(&oa->ni->vfs_inode);
+ oa->ni = NULL;
+ }
+
+ attr = attr_create_nonres_log(sbi, oe->type, 0, oe->ptr,
+ oe->name_len, 0);
+ if (!attr) {
+ kfree(oa);
+ err = -ENOMEM;
+ goto out;
+ }
+ oa->attr = attr;
+ oa->run1 = &oa->run0;
+ goto final_oe;
+ }
+
+ ni_oe = ntfs_i(inode);
+ oa->ni = ni_oe;
+
+ attr = ni_find_attr(ni_oe, NULL, NULL, oe->type, oe->ptr, oe->name_len,
+ NULL, NULL);
+
+ if (!attr)
+ goto fake_attr;
+
+ t32 = le32_to_cpu(attr->size);
+ oa->attr = kmemdup(attr, t32, GFP_NOFS);
+ if (!oa->attr)
+ goto fake_attr;
+
+ if (!S_ISDIR(inode->i_mode)) {
+ if (attr->type == ATTR_DATA && !attr->name_len) {
+ oa->run1 = &ni_oe->file.run;
+ goto final_oe;
+ }
+ } else {
+ if (attr->type == ATTR_ALLOC &&
+ attr->name_len == ARRAY_SIZE(I30_NAME) &&
+ !memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) {
+ oa->run1 = &ni_oe->dir.alloc_run;
+ goto final_oe;
+ }
+ }
+
+ if (attr->non_res) {
+ u16 roff = le16_to_cpu(attr->nres.run_off);
+ CLST svcn = le64_to_cpu(attr->nres.svcn);
+
+ if (roff > t32) {
+ kfree(oa->attr);
+ oa->attr = NULL;
+ goto fake_attr;
+ }
+
+ err = run_unpack(&oa->run0, sbi, inode->i_ino, svcn,
+ le64_to_cpu(attr->nres.evcn), svcn,
+ Add2Ptr(attr, roff), t32 - roff);
+ if (err < 0) {
+ kfree(oa->attr);
+ oa->attr = NULL;
+ goto fake_attr;
+ }
+ err = 0;
+ }
+ oa->run1 = &oa->run0;
+ attr = oa->attr;
+
+final_oe:
+ if (oe->is_attr_name == 1)
+ kfree(oe->ptr);
+ oe->is_attr_name = 0;
+ oe->ptr = oa;
+ oe->name_len = attr->name_len;
+
+ goto next_open_attribute;
+
+ /*
+ * Now loop through the dirty page table to extract all of the Vcn/Lcn.
+ * Mapping that we have, and insert it into the appropriate run.
+ */
+next_dirty_page:
+ dp = enum_rstbl(dptbl, dp);
+ if (!dp)
+ goto do_redo_1;
+
+ oe = Add2Ptr(oatbl, le32_to_cpu(dp->target_attr));
+
+ if (oe->next != RESTART_ENTRY_ALLOCATED_LE)
+ goto next_dirty_page;
+
+ oa = oe->ptr;
+ if (!oa)
+ goto next_dirty_page;
+
+ i = -1;
+next_dirty_page_vcn:
+ i += 1;
+ if (i >= le32_to_cpu(dp->lcns_follow))
+ goto next_dirty_page;
+
+ vcn = le64_to_cpu(dp->vcn) + i;
+ size = (vcn + 1) << sbi->cluster_bits;
+
+ if (!dp->page_lcns[i])
+ goto next_dirty_page_vcn;
+
+ rno = ino_get(&oe->ref);
+ if (rno <= MFT_REC_MIRR &&
+ size < (MFT_REC_VOL + 1) * sbi->record_size &&
+ oe->type == ATTR_DATA) {
+ goto next_dirty_page_vcn;
+ }
+
+ lcn = le64_to_cpu(dp->page_lcns[i]);
+
+ if ((!run_lookup_entry(oa->run1, vcn, &lcn0, &len0, NULL) ||
+ lcn0 != lcn) &&
+ !run_add_entry(oa->run1, vcn, lcn, 1, false)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ attr = oa->attr;
+ t64 = le64_to_cpu(attr->nres.alloc_size);
+ if (size > t64) {
+ attr->nres.valid_size = attr->nres.data_size =
+ attr->nres.alloc_size = cpu_to_le64(size);
+ }
+ goto next_dirty_page_vcn;
+
+do_redo_1:
+ /*
+ * Perform the Redo Pass, to restore all of the dirty pages to the same
+ * contents that they had immediately before the crash. If the dirty
+ * page table is empty, then we can skip the entire Redo Pass.
+ */
+ if (!dptbl || !dptbl->total)
+ goto do_undo_action;
+
+ rec_lsn = rlsn;
+
+ /*
+ * Read the record at the Redo lsn, before falling
+ * into common code to handle each record.
+ */
+ err = read_log_rec_lcb(log, rlsn, lcb_ctx_next, &lcb);
+ if (err)
+ goto out;
+
+ /*
+ * Now loop to read all of our log records forwards, until
+ * we hit the end of the file, cleaning up at the end.
+ */
+do_action_next:
+ frh = lcb->lrh;
+
+ if (LfsClientRecord != frh->record_type)
+ goto read_next_log_do_action;
+
+ transact_id = le32_to_cpu(frh->transact_id);
+ rec_len = le32_to_cpu(frh->client_data_len);
+ lrh = lcb->log_rec;
+
+ if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Ignore log records that do not update pages. */
+ if (lrh->lcns_follow)
+ goto find_dirty_page;
+
+ goto read_next_log_do_action;
+
+find_dirty_page:
+ t16 = le16_to_cpu(lrh->target_attr);
+ t64 = le64_to_cpu(lrh->target_vcn);
+ dp = find_dp(dptbl, t16, t64);
+
+ if (!dp)
+ goto read_next_log_do_action;
+
+ if (rec_lsn < le64_to_cpu(dp->oldest_lsn))
+ goto read_next_log_do_action;
+
+ t16 = le16_to_cpu(lrh->target_attr);
+ if (t16 >= bytes_per_rt(oatbl)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ oe = Add2Ptr(oatbl, t16);
+
+ if (oe->next != RESTART_ENTRY_ALLOCATED_LE) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ oa = oe->ptr;
+
+ if (!oa) {
+ err = -EINVAL;
+ goto out;
+ }
+ attr = oa->attr;
+
+ vcn = le64_to_cpu(lrh->target_vcn);
+
+ if (!run_lookup_entry(oa->run1, vcn, &lcn, NULL, NULL) ||
+ lcn == SPARSE_LCN) {
+ goto read_next_log_do_action;
+ }
+
+ /* Point to the Redo data and get its length. */
+ data = Add2Ptr(lrh, le16_to_cpu(lrh->redo_off));
+ dlen = le16_to_cpu(lrh->redo_len);
+
+ /* Shorten length by any Lcns which were deleted. */
+ saved_len = dlen;
+
+ for (i = le16_to_cpu(lrh->lcns_follow); i; i--) {
+ size_t j;
+ u32 alen, voff;
+
+ voff = le16_to_cpu(lrh->record_off) +
+ le16_to_cpu(lrh->attr_off);
+ voff += le16_to_cpu(lrh->cluster_off) << SECTOR_SHIFT;
+
+ /* If the Vcn question is allocated, we can just get out. */
+ j = le64_to_cpu(lrh->target_vcn) - le64_to_cpu(dp->vcn);
+ if (dp->page_lcns[j + i - 1])
+ break;
+
+ if (!saved_len)
+ saved_len = 1;
+
+ /*
+ * Calculate the allocated space left relative to the
+ * log record Vcn, after removing this unallocated Vcn.
+ */
+ alen = (i - 1) << sbi->cluster_bits;
+
+ /*
+ * If the update described this log record goes beyond
+ * the allocated space, then we will have to reduce the length.
+ */
+ if (voff >= alen)
+ dlen = 0;
+ else if (voff + dlen > alen)
+ dlen = alen - voff;
+ }
+
+ /*
+ * If the resulting dlen from above is now zero,
+ * we can skip this log record.
+ */
+ if (!dlen && saved_len)
+ goto read_next_log_do_action;
+
+ t16 = le16_to_cpu(lrh->redo_op);
+ if (can_skip_action(t16))
+ goto read_next_log_do_action;
+
+ /* Apply the Redo operation a common routine. */
+ err = do_action(log, oe, lrh, t16, data, dlen, rec_len, &rec_lsn);
+ if (err)
+ goto out;
+
+ /* Keep reading and looping back until end of file. */
+read_next_log_do_action:
+ err = read_next_log_rec(log, lcb, &rec_lsn);
+ if (!err && rec_lsn)
+ goto do_action_next;
+
+ lcb_put(lcb);
+ lcb = NULL;
+
+do_undo_action:
+ /* Scan Transaction Table. */
+ tr = NULL;
+transaction_table_next:
+ tr = enum_rstbl(trtbl, tr);
+ if (!tr)
+ goto undo_action_done;
+
+ if (TransactionActive != tr->transact_state || !tr->undo_next_lsn) {
+ free_rsttbl_idx(trtbl, PtrOffset(trtbl, tr));
+ goto transaction_table_next;
+ }
+
+ log->transaction_id = PtrOffset(trtbl, tr);
+ undo_next_lsn = le64_to_cpu(tr->undo_next_lsn);
+
+ /*
+ * We only have to do anything if the transaction has
+ * something its undo_next_lsn field.
+ */
+ if (!undo_next_lsn)
+ goto commit_undo;
+
+ /* Read the first record to be undone by this transaction. */
+ err = read_log_rec_lcb(log, undo_next_lsn, lcb_ctx_undo_next, &lcb);
+ if (err)
+ goto out;
+
+ /*
+ * Now loop to read all of our log records forwards,
+ * until we hit the end of the file, cleaning up at the end.
+ */
+undo_action_next:
+
+ lrh = lcb->log_rec;
+ frh = lcb->lrh;
+ transact_id = le32_to_cpu(frh->transact_id);
+ rec_len = le32_to_cpu(frh->client_data_len);
+
+ if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (lrh->undo_op == cpu_to_le16(Noop))
+ goto read_next_log_undo_action;
+
+ oe = Add2Ptr(oatbl, le16_to_cpu(lrh->target_attr));
+ oa = oe->ptr;
+
+ t16 = le16_to_cpu(lrh->lcns_follow);
+ if (!t16)
+ goto add_allocated_vcns;
+
+ is_mapped = run_lookup_entry(oa->run1, le64_to_cpu(lrh->target_vcn),
+ &lcn, &clen, NULL);
+
+ /*
+ * If the mapping isn't already the table or the mapping
+ * corresponds to a hole the mapping, we need to make sure
+ * there is no partial page already memory.
+ */
+ if (is_mapped && lcn != SPARSE_LCN && clen >= t16)
+ goto add_allocated_vcns;
+
+ vcn = le64_to_cpu(lrh->target_vcn);
+ vcn &= ~(u64)(log->clst_per_page - 1);
+
+add_allocated_vcns:
+ for (i = 0, vcn = le64_to_cpu(lrh->target_vcn),
+ size = (vcn + 1) << sbi->cluster_bits;
+ i < t16; i++, vcn += 1, size += sbi->cluster_size) {
+ attr = oa->attr;
+ if (!attr->non_res) {
+ if (size > le32_to_cpu(attr->res.data_size))
+ attr->res.data_size = cpu_to_le32(size);
+ } else {
+ if (size > le64_to_cpu(attr->nres.data_size))
+ attr->nres.valid_size = attr->nres.data_size =
+ attr->nres.alloc_size =
+ cpu_to_le64(size);
+ }
+ }
+
+ t16 = le16_to_cpu(lrh->undo_op);
+ if (can_skip_action(t16))
+ goto read_next_log_undo_action;
+
+ /* Point to the Redo data and get its length. */
+ data = Add2Ptr(lrh, le16_to_cpu(lrh->undo_off));
+ dlen = le16_to_cpu(lrh->undo_len);
+
+ /* It is time to apply the undo action. */
+ err = do_action(log, oe, lrh, t16, data, dlen, rec_len, NULL);
+
+read_next_log_undo_action:
+ /*
+ * Keep reading and looping back until we have read the
+ * last record for this transaction.
+ */
+ err = read_next_log_rec(log, lcb, &rec_lsn);
+ if (err)
+ goto out;
+
+ if (rec_lsn)
+ goto undo_action_next;
+
+ lcb_put(lcb);
+ lcb = NULL;
+
+commit_undo:
+ free_rsttbl_idx(trtbl, log->transaction_id);
+
+ log->transaction_id = 0;
+
+ goto transaction_table_next;
+
+undo_action_done:
+
+ ntfs_update_mftmirr(sbi, 0);
+
+ sbi->flags &= ~NTFS_FLAGS_NEED_REPLAY;
+
+end_reply:
+
+ err = 0;
+ if (is_ro)
+ goto out;
+
+ rh = kzalloc(log->page_size, GFP_NOFS);
+ if (!rh) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ rh->rhdr.sign = NTFS_RSTR_SIGNATURE;
+ rh->rhdr.fix_off = cpu_to_le16(offsetof(struct RESTART_HDR, fixups));
+ t16 = (log->page_size >> SECTOR_SHIFT) + 1;
+ rh->rhdr.fix_num = cpu_to_le16(t16);
+ rh->sys_page_size = cpu_to_le32(log->page_size);
+ rh->page_size = cpu_to_le32(log->page_size);
+
+ t16 = ALIGN(offsetof(struct RESTART_HDR, fixups) + sizeof(short) * t16,
+ 8);
+ rh->ra_off = cpu_to_le16(t16);
+ rh->minor_ver = cpu_to_le16(1); // 0x1A:
+ rh->major_ver = cpu_to_le16(1); // 0x1C:
+
+ ra2 = Add2Ptr(rh, t16);
+ memcpy(ra2, ra, sizeof(struct RESTART_AREA));
+
+ ra2->client_idx[0] = 0;
+ ra2->client_idx[1] = LFS_NO_CLIENT_LE;
+ ra2->flags = cpu_to_le16(2);
+
+ le32_add_cpu(&ra2->open_log_count, 1);
+
+ ntfs_fix_pre_write(&rh->rhdr, log->page_size);
+
+ err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size, 0);
+ if (!err)
+ err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size,
+ rh, log->page_size, 0);
+
+ kfree(rh);
+ if (err)
+ goto out;
+
+out:
+ kfree(rst);
+ if (lcb)
+ lcb_put(lcb);
+
+ /*
+ * Scan the Open Attribute Table to close all of
+ * the open attributes.
+ */
+ oe = NULL;
+ while ((oe = enum_rstbl(oatbl, oe))) {
+ rno = ino_get(&oe->ref);
+
+ if (oe->is_attr_name == 1) {
+ kfree(oe->ptr);
+ oe->ptr = NULL;
+ continue;
+ }
+
+ if (oe->is_attr_name)
+ continue;
+
+ oa = oe->ptr;
+ if (!oa)
+ continue;
+
+ run_close(&oa->run0);
+ kfree(oa->attr);
+ if (oa->ni)
+ iput(&oa->ni->vfs_inode);
+ kfree(oa);
+ }
+
+ kfree(trtbl);
+ kfree(oatbl);
+ kfree(dptbl);
+ kfree(attr_names);
+ kfree(rst_info.r_page);
+
+ kfree(ra);
+ kfree(log->one_page_buf);
+
+ if (err)
+ sbi->flags |= NTFS_FLAGS_NEED_REPLAY;
+
+ if (err == -EROFS)
+ err = 0;
+ else if (log->set_dirty)
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+
+ kfree(log);
+
+ return err;
+}
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
new file mode 100644
index 000000000..873b1434a
--- /dev/null
+++ b/fs/ntfs3/fsntfs.c
@@ -0,0 +1,2503 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+// clang-format off
+const struct cpu_str NAME_MFT = {
+ 4, 0, { '$', 'M', 'F', 'T' },
+};
+const struct cpu_str NAME_MIRROR = {
+ 8, 0, { '$', 'M', 'F', 'T', 'M', 'i', 'r', 'r' },
+};
+const struct cpu_str NAME_LOGFILE = {
+ 8, 0, { '$', 'L', 'o', 'g', 'F', 'i', 'l', 'e' },
+};
+const struct cpu_str NAME_VOLUME = {
+ 7, 0, { '$', 'V', 'o', 'l', 'u', 'm', 'e' },
+};
+const struct cpu_str NAME_ATTRDEF = {
+ 8, 0, { '$', 'A', 't', 't', 'r', 'D', 'e', 'f' },
+};
+const struct cpu_str NAME_ROOT = {
+ 1, 0, { '.' },
+};
+const struct cpu_str NAME_BITMAP = {
+ 7, 0, { '$', 'B', 'i', 't', 'm', 'a', 'p' },
+};
+const struct cpu_str NAME_BOOT = {
+ 5, 0, { '$', 'B', 'o', 'o', 't' },
+};
+const struct cpu_str NAME_BADCLUS = {
+ 8, 0, { '$', 'B', 'a', 'd', 'C', 'l', 'u', 's' },
+};
+const struct cpu_str NAME_QUOTA = {
+ 6, 0, { '$', 'Q', 'u', 'o', 't', 'a' },
+};
+const struct cpu_str NAME_SECURE = {
+ 7, 0, { '$', 'S', 'e', 'c', 'u', 'r', 'e' },
+};
+const struct cpu_str NAME_UPCASE = {
+ 7, 0, { '$', 'U', 'p', 'C', 'a', 's', 'e' },
+};
+const struct cpu_str NAME_EXTEND = {
+ 7, 0, { '$', 'E', 'x', 't', 'e', 'n', 'd' },
+};
+const struct cpu_str NAME_OBJID = {
+ 6, 0, { '$', 'O', 'b', 'j', 'I', 'd' },
+};
+const struct cpu_str NAME_REPARSE = {
+ 8, 0, { '$', 'R', 'e', 'p', 'a', 'r', 's', 'e' },
+};
+const struct cpu_str NAME_USNJRNL = {
+ 8, 0, { '$', 'U', 's', 'n', 'J', 'r', 'n', 'l' },
+};
+const __le16 BAD_NAME[4] = {
+ cpu_to_le16('$'), cpu_to_le16('B'), cpu_to_le16('a'), cpu_to_le16('d'),
+};
+const __le16 I30_NAME[4] = {
+ cpu_to_le16('$'), cpu_to_le16('I'), cpu_to_le16('3'), cpu_to_le16('0'),
+};
+const __le16 SII_NAME[4] = {
+ cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('I'), cpu_to_le16('I'),
+};
+const __le16 SDH_NAME[4] = {
+ cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('D'), cpu_to_le16('H'),
+};
+const __le16 SDS_NAME[4] = {
+ cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('D'), cpu_to_le16('S'),
+};
+const __le16 SO_NAME[2] = {
+ cpu_to_le16('$'), cpu_to_le16('O'),
+};
+const __le16 SQ_NAME[2] = {
+ cpu_to_le16('$'), cpu_to_le16('Q'),
+};
+const __le16 SR_NAME[2] = {
+ cpu_to_le16('$'), cpu_to_le16('R'),
+};
+
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+const __le16 WOF_NAME[17] = {
+ cpu_to_le16('W'), cpu_to_le16('o'), cpu_to_le16('f'), cpu_to_le16('C'),
+ cpu_to_le16('o'), cpu_to_le16('m'), cpu_to_le16('p'), cpu_to_le16('r'),
+ cpu_to_le16('e'), cpu_to_le16('s'), cpu_to_le16('s'), cpu_to_le16('e'),
+ cpu_to_le16('d'), cpu_to_le16('D'), cpu_to_le16('a'), cpu_to_le16('t'),
+ cpu_to_le16('a'),
+};
+#endif
+
+// clang-format on
+
+/*
+ * ntfs_fix_pre_write - Insert fixups into @rhdr before writing to disk.
+ */
+bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes)
+{
+ u16 *fixup, *ptr;
+ u16 sample;
+ u16 fo = le16_to_cpu(rhdr->fix_off);
+ u16 fn = le16_to_cpu(rhdr->fix_num);
+
+ if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- ||
+ fn * SECTOR_SIZE > bytes) {
+ return false;
+ }
+
+ /* Get fixup pointer. */
+ fixup = Add2Ptr(rhdr, fo);
+
+ if (*fixup >= 0x7FFF)
+ *fixup = 1;
+ else
+ *fixup += 1;
+
+ sample = *fixup;
+
+ ptr = Add2Ptr(rhdr, SECTOR_SIZE - sizeof(short));
+
+ while (fn--) {
+ *++fixup = *ptr;
+ *ptr = sample;
+ ptr += SECTOR_SIZE / sizeof(short);
+ }
+ return true;
+}
+
+/*
+ * ntfs_fix_post_read - Remove fixups after reading from disk.
+ *
+ * Return: < 0 if error, 0 if ok, 1 if need to update fixups.
+ */
+int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
+ bool simple)
+{
+ int ret;
+ u16 *fixup, *ptr;
+ u16 sample, fo, fn;
+
+ fo = le16_to_cpu(rhdr->fix_off);
+ fn = simple ? ((bytes >> SECTOR_SHIFT) + 1)
+ : le16_to_cpu(rhdr->fix_num);
+
+ /* Check errors. */
+ if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- ||
+ fn * SECTOR_SIZE > bytes) {
+ return -E_NTFS_CORRUPT;
+ }
+
+ /* Get fixup pointer. */
+ fixup = Add2Ptr(rhdr, fo);
+ sample = *fixup;
+ ptr = Add2Ptr(rhdr, SECTOR_SIZE - sizeof(short));
+ ret = 0;
+
+ while (fn--) {
+ /* Test current word. */
+ if (*ptr != sample) {
+ /* Fixup does not match! Is it serious error? */
+ ret = -E_NTFS_FIXUP;
+ }
+
+ /* Replace fixup. */
+ *ptr = *++fixup;
+ ptr += SECTOR_SIZE / sizeof(short);
+ }
+
+ return ret;
+}
+
+/*
+ * ntfs_extend_init - Load $Extend file.
+ */
+int ntfs_extend_init(struct ntfs_sb_info *sbi)
+{
+ int err;
+ struct super_block *sb = sbi->sb;
+ struct inode *inode, *inode2;
+ struct MFT_REF ref;
+
+ if (sbi->volume.major_ver < 3) {
+ ntfs_notice(sb, "Skip $Extend 'cause NTFS version");
+ return 0;
+ }
+
+ ref.low = cpu_to_le32(MFT_REC_EXTEND);
+ ref.high = 0;
+ ref.seq = cpu_to_le16(MFT_REC_EXTEND);
+ inode = ntfs_iget5(sb, &ref, &NAME_EXTEND);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load $Extend.");
+ inode = NULL;
+ goto out;
+ }
+
+ /* If ntfs_iget5() reads from disk it never returns bad inode. */
+ if (!S_ISDIR(inode->i_mode)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Try to find $ObjId */
+ inode2 = dir_search_u(inode, &NAME_OBJID, NULL);
+ if (inode2 && !IS_ERR(inode2)) {
+ if (is_bad_inode(inode2)) {
+ iput(inode2);
+ } else {
+ sbi->objid.ni = ntfs_i(inode2);
+ sbi->objid_no = inode2->i_ino;
+ }
+ }
+
+ /* Try to find $Quota */
+ inode2 = dir_search_u(inode, &NAME_QUOTA, NULL);
+ if (inode2 && !IS_ERR(inode2)) {
+ sbi->quota_no = inode2->i_ino;
+ iput(inode2);
+ }
+
+ /* Try to find $Reparse */
+ inode2 = dir_search_u(inode, &NAME_REPARSE, NULL);
+ if (inode2 && !IS_ERR(inode2)) {
+ sbi->reparse.ni = ntfs_i(inode2);
+ sbi->reparse_no = inode2->i_ino;
+ }
+
+ /* Try to find $UsnJrnl */
+ inode2 = dir_search_u(inode, &NAME_USNJRNL, NULL);
+ if (inode2 && !IS_ERR(inode2)) {
+ sbi->usn_jrnl_no = inode2->i_ino;
+ iput(inode2);
+ }
+
+ err = 0;
+out:
+ iput(inode);
+ return err;
+}
+
+int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi)
+{
+ int err = 0;
+ struct super_block *sb = sbi->sb;
+ bool initialized = false;
+ struct MFT_REF ref;
+ struct inode *inode;
+
+ /* Check for 4GB. */
+ if (ni->vfs_inode.i_size >= 0x100000000ull) {
+ ntfs_err(sb, "\x24LogFile is too big");
+ err = -EINVAL;
+ goto out;
+ }
+
+ sbi->flags |= NTFS_FLAGS_LOG_REPLAYING;
+
+ ref.low = cpu_to_le32(MFT_REC_MFT);
+ ref.high = 0;
+ ref.seq = cpu_to_le16(1);
+
+ inode = ntfs_iget5(sb, &ref, NULL);
+
+ if (IS_ERR(inode))
+ inode = NULL;
+
+ if (!inode) {
+ /* Try to use MFT copy. */
+ u64 t64 = sbi->mft.lbo;
+
+ sbi->mft.lbo = sbi->mft.lbo2;
+ inode = ntfs_iget5(sb, &ref, NULL);
+ sbi->mft.lbo = t64;
+ if (IS_ERR(inode))
+ inode = NULL;
+ }
+
+ if (!inode) {
+ err = -EINVAL;
+ ntfs_err(sb, "Failed to load $MFT.");
+ goto out;
+ }
+
+ sbi->mft.ni = ntfs_i(inode);
+
+ /* LogFile should not contains attribute list. */
+ err = ni_load_all_mi(sbi->mft.ni);
+ if (!err)
+ err = log_replay(ni, &initialized);
+
+ iput(inode);
+ sbi->mft.ni = NULL;
+
+ sync_blockdev(sb->s_bdev);
+ invalidate_bdev(sb->s_bdev);
+
+ if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) {
+ err = 0;
+ goto out;
+ }
+
+ if (sb_rdonly(sb) || !initialized)
+ goto out;
+
+ /* Fill LogFile by '-1' if it is initialized. */
+ err = ntfs_bio_fill_1(sbi, &ni->file.run);
+
+out:
+ sbi->flags &= ~NTFS_FLAGS_LOG_REPLAYING;
+
+ return err;
+}
+
+/*
+ * ntfs_query_def
+ *
+ * Return: Current ATTR_DEF_ENTRY for given attribute type.
+ */
+const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi,
+ enum ATTR_TYPE type)
+{
+ int type_in = le32_to_cpu(type);
+ size_t min_idx = 0;
+ size_t max_idx = sbi->def_entries - 1;
+
+ while (min_idx <= max_idx) {
+ size_t i = min_idx + ((max_idx - min_idx) >> 1);
+ const struct ATTR_DEF_ENTRY *entry = sbi->def_table + i;
+ int diff = le32_to_cpu(entry->type) - type_in;
+
+ if (!diff)
+ return entry;
+ if (diff < 0)
+ min_idx = i + 1;
+ else if (i)
+ max_idx = i - 1;
+ else
+ return NULL;
+ }
+ return NULL;
+}
+
+/*
+ * ntfs_look_for_free_space - Look for a free space in bitmap.
+ */
+int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
+ CLST *new_lcn, CLST *new_len,
+ enum ALLOCATE_OPT opt)
+{
+ int err;
+ CLST alen;
+ struct super_block *sb = sbi->sb;
+ size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen;
+ struct wnd_bitmap *wnd = &sbi->used.bitmap;
+
+ down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
+ if (opt & ALLOCATE_MFT) {
+ zlen = wnd_zone_len(wnd);
+
+ if (!zlen) {
+ err = ntfs_refresh_zone(sbi);
+ if (err)
+ goto up_write;
+
+ zlen = wnd_zone_len(wnd);
+ }
+
+ if (!zlen) {
+ ntfs_err(sbi->sb, "no free space to extend mft");
+ err = -ENOSPC;
+ goto up_write;
+ }
+
+ lcn = wnd_zone_bit(wnd);
+ alen = min_t(CLST, len, zlen);
+
+ wnd_zone_set(wnd, lcn + alen, zlen - alen);
+
+ err = wnd_set_used(wnd, lcn, alen);
+ if (err)
+ goto up_write;
+
+ alcn = lcn;
+ goto space_found;
+ }
+ /*
+ * 'Cause cluster 0 is always used this value means that we should use
+ * cached value of 'next_free_lcn' to improve performance.
+ */
+ if (!lcn)
+ lcn = sbi->used.next_free_lcn;
+
+ if (lcn >= wnd->nbits)
+ lcn = 0;
+
+ alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn);
+ if (alen)
+ goto space_found;
+
+ /* Try to use clusters from MftZone. */
+ zlen = wnd_zone_len(wnd);
+ zeroes = wnd_zeroes(wnd);
+
+ /* Check too big request */
+ if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) {
+ err = -ENOSPC;
+ goto up_write;
+ }
+
+ /* How many clusters to cat from zone. */
+ zlcn = wnd_zone_bit(wnd);
+ zlen2 = zlen >> 1;
+ ztrim = clamp_val(len, zlen2, zlen);
+ new_zlen = max_t(size_t, zlen - ztrim, NTFS_MIN_MFT_ZONE);
+
+ wnd_zone_set(wnd, zlcn, new_zlen);
+
+ /* Allocate continues clusters. */
+ alen = wnd_find(wnd, len, 0,
+ BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn);
+ if (!alen) {
+ err = -ENOSPC;
+ goto up_write;
+ }
+
+space_found:
+ err = 0;
+ *new_len = alen;
+ *new_lcn = alcn;
+
+ ntfs_unmap_meta(sb, alcn, alen);
+
+ /* Set hint for next requests. */
+ if (!(opt & ALLOCATE_MFT))
+ sbi->used.next_free_lcn = alcn + alen;
+up_write:
+ up_write(&wnd->rw_lock);
+ return err;
+}
+
+/*
+ * ntfs_extend_mft - Allocate additional MFT records.
+ *
+ * sbi->mft.bitmap is locked for write.
+ *
+ * NOTE: recursive:
+ * ntfs_look_free_mft ->
+ * ntfs_extend_mft ->
+ * attr_set_size ->
+ * ni_insert_nonresident ->
+ * ni_insert_attr ->
+ * ni_ins_attr_ext ->
+ * ntfs_look_free_mft ->
+ * ntfs_extend_mft
+ *
+ * To avoid recursive always allocate space for two new MFT records
+ * see attrib.c: "at least two MFT to avoid recursive loop".
+ */
+static int ntfs_extend_mft(struct ntfs_sb_info *sbi)
+{
+ int err;
+ struct ntfs_inode *ni = sbi->mft.ni;
+ size_t new_mft_total;
+ u64 new_mft_bytes, new_bitmap_bytes;
+ struct ATTRIB *attr;
+ struct wnd_bitmap *wnd = &sbi->mft.bitmap;
+
+ new_mft_total = (wnd->nbits + MFT_INCREASE_CHUNK + 127) & (CLST)~127;
+ new_mft_bytes = (u64)new_mft_total << sbi->record_bits;
+
+ /* Step 1: Resize $MFT::DATA. */
+ down_write(&ni->file.run_lock);
+ err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run,
+ new_mft_bytes, NULL, false, &attr);
+
+ if (err) {
+ up_write(&ni->file.run_lock);
+ goto out;
+ }
+
+ attr->nres.valid_size = attr->nres.data_size;
+ new_mft_total = le64_to_cpu(attr->nres.alloc_size) >> sbi->record_bits;
+ ni->mi.dirty = true;
+
+ /* Step 2: Resize $MFT::BITMAP. */
+ new_bitmap_bytes = bitmap_size(new_mft_total);
+
+ err = attr_set_size(ni, ATTR_BITMAP, NULL, 0, &sbi->mft.bitmap.run,
+ new_bitmap_bytes, &new_bitmap_bytes, true, NULL);
+
+ /* Refresh MFT Zone if necessary. */
+ down_write_nested(&sbi->used.bitmap.rw_lock, BITMAP_MUTEX_CLUSTERS);
+
+ ntfs_refresh_zone(sbi);
+
+ up_write(&sbi->used.bitmap.rw_lock);
+ up_write(&ni->file.run_lock);
+
+ if (err)
+ goto out;
+
+ err = wnd_extend(wnd, new_mft_total);
+
+ if (err)
+ goto out;
+
+ ntfs_clear_mft_tail(sbi, sbi->mft.used, new_mft_total);
+
+ err = _ni_write_inode(&ni->vfs_inode, 0);
+out:
+ return err;
+}
+
+/*
+ * ntfs_look_free_mft - Look for a free MFT record.
+ */
+int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft,
+ struct ntfs_inode *ni, struct mft_inode **mi)
+{
+ int err = 0;
+ size_t zbit, zlen, from, to, fr;
+ size_t mft_total;
+ struct MFT_REF ref;
+ struct super_block *sb = sbi->sb;
+ struct wnd_bitmap *wnd = &sbi->mft.bitmap;
+ u32 ir;
+
+ static_assert(sizeof(sbi->mft.reserved_bitmap) * 8 >=
+ MFT_REC_FREE - MFT_REC_RESERVED);
+
+ if (!mft)
+ down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT);
+
+ zlen = wnd_zone_len(wnd);
+
+ /* Always reserve space for MFT. */
+ if (zlen) {
+ if (mft) {
+ zbit = wnd_zone_bit(wnd);
+ *rno = zbit;
+ wnd_zone_set(wnd, zbit + 1, zlen - 1);
+ }
+ goto found;
+ }
+
+ /* No MFT zone. Find the nearest to '0' free MFT. */
+ if (!wnd_find(wnd, 1, MFT_REC_FREE, 0, &zbit)) {
+ /* Resize MFT */
+ mft_total = wnd->nbits;
+
+ err = ntfs_extend_mft(sbi);
+ if (!err) {
+ zbit = mft_total;
+ goto reserve_mft;
+ }
+
+ if (!mft || MFT_REC_FREE == sbi->mft.next_reserved)
+ goto out;
+
+ err = 0;
+
+ /*
+ * Look for free record reserved area [11-16) ==
+ * [MFT_REC_RESERVED, MFT_REC_FREE ) MFT bitmap always
+ * marks it as used.
+ */
+ if (!sbi->mft.reserved_bitmap) {
+ /* Once per session create internal bitmap for 5 bits. */
+ sbi->mft.reserved_bitmap = 0xFF;
+
+ ref.high = 0;
+ for (ir = MFT_REC_RESERVED; ir < MFT_REC_FREE; ir++) {
+ struct inode *i;
+ struct ntfs_inode *ni;
+ struct MFT_REC *mrec;
+
+ ref.low = cpu_to_le32(ir);
+ ref.seq = cpu_to_le16(ir);
+
+ i = ntfs_iget5(sb, &ref, NULL);
+ if (IS_ERR(i)) {
+next:
+ ntfs_notice(
+ sb,
+ "Invalid reserved record %x",
+ ref.low);
+ continue;
+ }
+ if (is_bad_inode(i)) {
+ iput(i);
+ goto next;
+ }
+
+ ni = ntfs_i(i);
+
+ mrec = ni->mi.mrec;
+
+ if (!is_rec_base(mrec))
+ goto next;
+
+ if (mrec->hard_links)
+ goto next;
+
+ if (!ni_std(ni))
+ goto next;
+
+ if (ni_find_attr(ni, NULL, NULL, ATTR_NAME,
+ NULL, 0, NULL, NULL))
+ goto next;
+
+ __clear_bit(ir - MFT_REC_RESERVED,
+ &sbi->mft.reserved_bitmap);
+ }
+ }
+
+ /* Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED */
+ zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap,
+ MFT_REC_FREE, MFT_REC_RESERVED);
+ if (zbit >= MFT_REC_FREE) {
+ sbi->mft.next_reserved = MFT_REC_FREE;
+ goto out;
+ }
+
+ zlen = 1;
+ sbi->mft.next_reserved = zbit;
+ } else {
+reserve_mft:
+ zlen = zbit == MFT_REC_FREE ? (MFT_REC_USER - MFT_REC_FREE) : 4;
+ if (zbit + zlen > wnd->nbits)
+ zlen = wnd->nbits - zbit;
+
+ while (zlen > 1 && !wnd_is_free(wnd, zbit, zlen))
+ zlen -= 1;
+
+ /* [zbit, zbit + zlen) will be used for MFT itself. */
+ from = sbi->mft.used;
+ if (from < zbit)
+ from = zbit;
+ to = zbit + zlen;
+ if (from < to) {
+ ntfs_clear_mft_tail(sbi, from, to);
+ sbi->mft.used = to;
+ }
+ }
+
+ if (mft) {
+ *rno = zbit;
+ zbit += 1;
+ zlen -= 1;
+ }
+
+ wnd_zone_set(wnd, zbit, zlen);
+
+found:
+ if (!mft) {
+ /* The request to get record for general purpose. */
+ if (sbi->mft.next_free < MFT_REC_USER)
+ sbi->mft.next_free = MFT_REC_USER;
+
+ for (;;) {
+ if (sbi->mft.next_free >= sbi->mft.bitmap.nbits) {
+ } else if (!wnd_find(wnd, 1, MFT_REC_USER, 0, &fr)) {
+ sbi->mft.next_free = sbi->mft.bitmap.nbits;
+ } else {
+ *rno = fr;
+ sbi->mft.next_free = *rno + 1;
+ break;
+ }
+
+ err = ntfs_extend_mft(sbi);
+ if (err)
+ goto out;
+ }
+ }
+
+ if (ni && !ni_add_subrecord(ni, *rno, mi)) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* We have found a record that are not reserved for next MFT. */
+ if (*rno >= MFT_REC_FREE)
+ wnd_set_used(wnd, *rno, 1);
+ else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited)
+ __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
+
+out:
+ if (!mft)
+ up_write(&wnd->rw_lock);
+
+ return err;
+}
+
+/*
+ * ntfs_mark_rec_free - Mark record as free.
+ * is_mft - true if we are changing MFT
+ */
+void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft)
+{
+ struct wnd_bitmap *wnd = &sbi->mft.bitmap;
+
+ if (!is_mft)
+ down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT);
+ if (rno >= wnd->nbits)
+ goto out;
+
+ if (rno >= MFT_REC_FREE) {
+ if (!wnd_is_used(wnd, rno, 1))
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ else
+ wnd_set_free(wnd, rno, 1);
+ } else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) {
+ __clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap);
+ }
+
+ if (rno < wnd_zone_bit(wnd))
+ wnd_zone_set(wnd, rno, 1);
+ else if (rno < sbi->mft.next_free && rno >= MFT_REC_USER)
+ sbi->mft.next_free = rno;
+
+out:
+ if (!is_mft)
+ up_write(&wnd->rw_lock);
+}
+
+/*
+ * ntfs_clear_mft_tail - Format empty records [from, to).
+ *
+ * sbi->mft.bitmap is locked for write.
+ */
+int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to)
+{
+ int err;
+ u32 rs;
+ u64 vbo;
+ struct runs_tree *run;
+ struct ntfs_inode *ni;
+
+ if (from >= to)
+ return 0;
+
+ rs = sbi->record_size;
+ ni = sbi->mft.ni;
+ run = &ni->file.run;
+
+ down_read(&ni->file.run_lock);
+ vbo = (u64)from * rs;
+ for (; from < to; from++, vbo += rs) {
+ struct ntfs_buffers nb;
+
+ err = ntfs_get_bh(sbi, run, vbo, rs, &nb);
+ if (err)
+ goto out;
+
+ err = ntfs_write_bh(sbi, &sbi->new_rec->rhdr, &nb, 0);
+ nb_put(&nb);
+ if (err)
+ goto out;
+ }
+
+out:
+ sbi->mft.used = from;
+ up_read(&ni->file.run_lock);
+ return err;
+}
+
+/*
+ * ntfs_refresh_zone - Refresh MFT zone.
+ *
+ * sbi->used.bitmap is locked for rw.
+ * sbi->mft.bitmap is locked for write.
+ * sbi->mft.ni->file.run_lock for write.
+ */
+int ntfs_refresh_zone(struct ntfs_sb_info *sbi)
+{
+ CLST lcn, vcn, len;
+ size_t lcn_s, zlen;
+ struct wnd_bitmap *wnd = &sbi->used.bitmap;
+ struct ntfs_inode *ni = sbi->mft.ni;
+
+ /* Do not change anything unless we have non empty MFT zone. */
+ if (wnd_zone_len(wnd))
+ return 0;
+
+ vcn = bytes_to_cluster(sbi,
+ (u64)sbi->mft.bitmap.nbits << sbi->record_bits);
+
+ if (!run_lookup_entry(&ni->file.run, vcn - 1, &lcn, &len, NULL))
+ lcn = SPARSE_LCN;
+
+ /* We should always find Last Lcn for MFT. */
+ if (lcn == SPARSE_LCN)
+ return -EINVAL;
+
+ lcn_s = lcn + 1;
+
+ /* Try to allocate clusters after last MFT run. */
+ zlen = wnd_find(wnd, sbi->zone_max, lcn_s, 0, &lcn_s);
+ wnd_zone_set(wnd, lcn_s, zlen);
+
+ return 0;
+}
+
+/*
+ * ntfs_update_mftmirr - Update $MFTMirr data.
+ */
+void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
+{
+ int err;
+ struct super_block *sb = sbi->sb;
+ u32 blocksize;
+ sector_t block1, block2;
+ u32 bytes;
+
+ if (!sb)
+ return;
+
+ blocksize = sb->s_blocksize;
+
+ if (!(sbi->flags & NTFS_FLAGS_MFTMIRR))
+ return;
+
+ err = 0;
+ bytes = sbi->mft.recs_mirr << sbi->record_bits;
+ block1 = sbi->mft.lbo >> sb->s_blocksize_bits;
+ block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits;
+
+ for (; bytes >= blocksize; bytes -= blocksize) {
+ struct buffer_head *bh1, *bh2;
+
+ bh1 = sb_bread(sb, block1++);
+ if (!bh1)
+ return;
+
+ bh2 = sb_getblk(sb, block2++);
+ if (!bh2) {
+ put_bh(bh1);
+ return;
+ }
+
+ if (buffer_locked(bh2))
+ __wait_on_buffer(bh2);
+
+ lock_buffer(bh2);
+ memcpy(bh2->b_data, bh1->b_data, blocksize);
+ set_buffer_uptodate(bh2);
+ mark_buffer_dirty(bh2);
+ unlock_buffer(bh2);
+
+ put_bh(bh1);
+ bh1 = NULL;
+
+ if (wait)
+ err = sync_dirty_buffer(bh2);
+
+ put_bh(bh2);
+ if (err)
+ return;
+ }
+
+ sbi->flags &= ~NTFS_FLAGS_MFTMIRR;
+}
+
+/*
+ * ntfs_bad_inode
+ *
+ * Marks inode as bad and marks fs as 'dirty'
+ */
+void ntfs_bad_inode(struct inode *inode, const char *hint)
+{
+ struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+
+ ntfs_inode_err(inode, "%s", hint);
+ make_bad_inode(inode);
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+}
+
+/*
+ * ntfs_set_state
+ *
+ * Mount: ntfs_set_state(NTFS_DIRTY_DIRTY)
+ * Umount: ntfs_set_state(NTFS_DIRTY_CLEAR)
+ * NTFS error: ntfs_set_state(NTFS_DIRTY_ERROR)
+ */
+int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty)
+{
+ int err;
+ struct ATTRIB *attr;
+ struct VOLUME_INFO *info;
+ struct mft_inode *mi;
+ struct ntfs_inode *ni;
+
+ /*
+ * Do not change state if fs was real_dirty.
+ * Do not change state if fs already dirty(clear).
+ * Do not change any thing if mounted read only.
+ */
+ if (sbi->volume.real_dirty || sb_rdonly(sbi->sb))
+ return 0;
+
+ /* Check cached value. */
+ if ((dirty == NTFS_DIRTY_CLEAR ? 0 : VOLUME_FLAG_DIRTY) ==
+ (sbi->volume.flags & VOLUME_FLAG_DIRTY))
+ return 0;
+
+ ni = sbi->volume.ni;
+ if (!ni)
+ return -EINVAL;
+
+ mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_DIRTY);
+
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_VOL_INFO, NULL, 0, NULL, &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO);
+ if (!info) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ switch (dirty) {
+ case NTFS_DIRTY_ERROR:
+ ntfs_notice(sbi->sb, "Mark volume as dirty due to NTFS errors");
+ sbi->volume.real_dirty = true;
+ fallthrough;
+ case NTFS_DIRTY_DIRTY:
+ info->flags |= VOLUME_FLAG_DIRTY;
+ break;
+ case NTFS_DIRTY_CLEAR:
+ info->flags &= ~VOLUME_FLAG_DIRTY;
+ break;
+ }
+ /* Cache current volume flags. */
+ sbi->volume.flags = info->flags;
+ mi->dirty = true;
+ err = 0;
+
+out:
+ ni_unlock(ni);
+ if (err)
+ return err;
+
+ mark_inode_dirty_sync(&ni->vfs_inode);
+ /* verify(!ntfs_update_mftmirr()); */
+
+ /* write mft record on disk. */
+ err = _ni_write_inode(&ni->vfs_inode, 1);
+
+ return err;
+}
+
+/*
+ * security_hash - Calculates a hash of security descriptor.
+ */
+static inline __le32 security_hash(const void *sd, size_t bytes)
+{
+ u32 hash = 0;
+ const __le32 *ptr = sd;
+
+ bytes >>= 2;
+ while (bytes--)
+ hash = ((hash >> 0x1D) | (hash << 3)) + le32_to_cpu(*ptr++);
+ return cpu_to_le32(hash);
+}
+
+int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer)
+{
+ struct block_device *bdev = sb->s_bdev;
+ u32 blocksize = sb->s_blocksize;
+ u64 block = lbo >> sb->s_blocksize_bits;
+ u32 off = lbo & (blocksize - 1);
+ u32 op = blocksize - off;
+
+ for (; bytes; block += 1, off = 0, op = blocksize) {
+ struct buffer_head *bh = __bread(bdev, block, blocksize);
+
+ if (!bh)
+ return -EIO;
+
+ if (op > bytes)
+ op = bytes;
+
+ memcpy(buffer, bh->b_data + off, op);
+
+ put_bh(bh);
+
+ bytes -= op;
+ buffer = Add2Ptr(buffer, op);
+ }
+
+ return 0;
+}
+
+int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
+ const void *buf, int wait)
+{
+ u32 blocksize = sb->s_blocksize;
+ struct block_device *bdev = sb->s_bdev;
+ sector_t block = lbo >> sb->s_blocksize_bits;
+ u32 off = lbo & (blocksize - 1);
+ u32 op = blocksize - off;
+ struct buffer_head *bh;
+
+ if (!wait && (sb->s_flags & SB_SYNCHRONOUS))
+ wait = 1;
+
+ for (; bytes; block += 1, off = 0, op = blocksize) {
+ if (op > bytes)
+ op = bytes;
+
+ if (op < blocksize) {
+ bh = __bread(bdev, block, blocksize);
+ if (!bh) {
+ ntfs_err(sb, "failed to read block %llx",
+ (u64)block);
+ return -EIO;
+ }
+ } else {
+ bh = __getblk(bdev, block, blocksize);
+ if (!bh)
+ return -ENOMEM;
+ }
+
+ if (buffer_locked(bh))
+ __wait_on_buffer(bh);
+
+ lock_buffer(bh);
+ if (buf) {
+ memcpy(bh->b_data + off, buf, op);
+ buf = Add2Ptr(buf, op);
+ } else {
+ memset(bh->b_data + off, -1, op);
+ }
+
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ if (wait) {
+ int err = sync_dirty_buffer(bh);
+
+ if (err) {
+ ntfs_err(
+ sb,
+ "failed to sync buffer at block %llx, error %d",
+ (u64)block, err);
+ put_bh(bh);
+ return err;
+ }
+ }
+
+ put_bh(bh);
+
+ bytes -= op;
+ }
+ return 0;
+}
+
+int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
+ u64 vbo, const void *buf, size_t bytes, int sync)
+{
+ struct super_block *sb = sbi->sb;
+ u8 cluster_bits = sbi->cluster_bits;
+ u32 off = vbo & sbi->cluster_mask;
+ CLST lcn, clen, vcn = vbo >> cluster_bits, vcn_next;
+ u64 lbo, len;
+ size_t idx;
+
+ if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx))
+ return -ENOENT;
+
+ if (lcn == SPARSE_LCN)
+ return -EINVAL;
+
+ lbo = ((u64)lcn << cluster_bits) + off;
+ len = ((u64)clen << cluster_bits) - off;
+
+ for (;;) {
+ u32 op = min_t(u64, len, bytes);
+ int err = ntfs_sb_write(sb, lbo, op, buf, sync);
+
+ if (err)
+ return err;
+
+ bytes -= op;
+ if (!bytes)
+ break;
+
+ vcn_next = vcn + clen;
+ if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) ||
+ vcn != vcn_next)
+ return -ENOENT;
+
+ if (lcn == SPARSE_LCN)
+ return -EINVAL;
+
+ if (buf)
+ buf = Add2Ptr(buf, op);
+
+ lbo = ((u64)lcn << cluster_bits);
+ len = ((u64)clen << cluster_bits);
+ }
+
+ return 0;
+}
+
+struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi,
+ const struct runs_tree *run, u64 vbo)
+{
+ struct super_block *sb = sbi->sb;
+ u8 cluster_bits = sbi->cluster_bits;
+ CLST lcn;
+ u64 lbo;
+
+ if (!run_lookup_entry(run, vbo >> cluster_bits, &lcn, NULL, NULL))
+ return ERR_PTR(-ENOENT);
+
+ lbo = ((u64)lcn << cluster_bits) + (vbo & sbi->cluster_mask);
+
+ return ntfs_bread(sb, lbo >> sb->s_blocksize_bits);
+}
+
+int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run,
+ u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb)
+{
+ int err;
+ struct super_block *sb = sbi->sb;
+ u32 blocksize = sb->s_blocksize;
+ u8 cluster_bits = sbi->cluster_bits;
+ u32 off = vbo & sbi->cluster_mask;
+ u32 nbh = 0;
+ CLST vcn_next, vcn = vbo >> cluster_bits;
+ CLST lcn, clen;
+ u64 lbo, len;
+ size_t idx;
+ struct buffer_head *bh;
+
+ if (!run) {
+ /* First reading of $Volume + $MFTMirr + $LogFile goes here. */
+ if (vbo > MFT_REC_VOL * sbi->record_size) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ /* Use absolute boot's 'MFTCluster' to read record. */
+ lbo = vbo + sbi->mft.lbo;
+ len = sbi->record_size;
+ } else if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) {
+ err = -ENOENT;
+ goto out;
+ } else {
+ if (lcn == SPARSE_LCN) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ lbo = ((u64)lcn << cluster_bits) + off;
+ len = ((u64)clen << cluster_bits) - off;
+ }
+
+ off = lbo & (blocksize - 1);
+ if (nb) {
+ nb->off = off;
+ nb->bytes = bytes;
+ }
+
+ for (;;) {
+ u32 len32 = len >= bytes ? bytes : len;
+ sector_t block = lbo >> sb->s_blocksize_bits;
+
+ do {
+ u32 op = blocksize - off;
+
+ if (op > len32)
+ op = len32;
+
+ bh = ntfs_bread(sb, block);
+ if (!bh) {
+ err = -EIO;
+ goto out;
+ }
+
+ if (buf) {
+ memcpy(buf, bh->b_data + off, op);
+ buf = Add2Ptr(buf, op);
+ }
+
+ if (!nb) {
+ put_bh(bh);
+ } else if (nbh >= ARRAY_SIZE(nb->bh)) {
+ err = -EINVAL;
+ goto out;
+ } else {
+ nb->bh[nbh++] = bh;
+ nb->nbufs = nbh;
+ }
+
+ bytes -= op;
+ if (!bytes)
+ return 0;
+ len32 -= op;
+ block += 1;
+ off = 0;
+
+ } while (len32);
+
+ vcn_next = vcn + clen;
+ if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) ||
+ vcn != vcn_next) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (lcn == SPARSE_LCN) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ lbo = ((u64)lcn << cluster_bits);
+ len = ((u64)clen << cluster_bits);
+ }
+
+out:
+ if (!nbh)
+ return err;
+
+ while (nbh) {
+ put_bh(nb->bh[--nbh]);
+ nb->bh[nbh] = NULL;
+ }
+
+ nb->nbufs = 0;
+ return err;
+}
+
+/*
+ * ntfs_read_bh
+ *
+ * Return: < 0 if error, 0 if ok, -E_NTFS_FIXUP if need to update fixups.
+ */
+int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo,
+ struct NTFS_RECORD_HEADER *rhdr, u32 bytes,
+ struct ntfs_buffers *nb)
+{
+ int err = ntfs_read_run_nb(sbi, run, vbo, rhdr, bytes, nb);
+
+ if (err)
+ return err;
+ return ntfs_fix_post_read(rhdr, nb->bytes, true);
+}
+
+int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo,
+ u32 bytes, struct ntfs_buffers *nb)
+{
+ int err = 0;
+ struct super_block *sb = sbi->sb;
+ u32 blocksize = sb->s_blocksize;
+ u8 cluster_bits = sbi->cluster_bits;
+ CLST vcn_next, vcn = vbo >> cluster_bits;
+ u32 off;
+ u32 nbh = 0;
+ CLST lcn, clen;
+ u64 lbo, len;
+ size_t idx;
+
+ nb->bytes = bytes;
+
+ if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ off = vbo & sbi->cluster_mask;
+ lbo = ((u64)lcn << cluster_bits) + off;
+ len = ((u64)clen << cluster_bits) - off;
+
+ nb->off = off = lbo & (blocksize - 1);
+
+ for (;;) {
+ u32 len32 = min_t(u64, len, bytes);
+ sector_t block = lbo >> sb->s_blocksize_bits;
+
+ do {
+ u32 op;
+ struct buffer_head *bh;
+
+ if (nbh >= ARRAY_SIZE(nb->bh)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ op = blocksize - off;
+ if (op > len32)
+ op = len32;
+
+ if (op == blocksize) {
+ bh = sb_getblk(sb, block);
+ if (!bh) {
+ err = -ENOMEM;
+ goto out;
+ }
+ if (buffer_locked(bh))
+ __wait_on_buffer(bh);
+ set_buffer_uptodate(bh);
+ } else {
+ bh = ntfs_bread(sb, block);
+ if (!bh) {
+ err = -EIO;
+ goto out;
+ }
+ }
+
+ nb->bh[nbh++] = bh;
+ bytes -= op;
+ if (!bytes) {
+ nb->nbufs = nbh;
+ return 0;
+ }
+
+ block += 1;
+ len32 -= op;
+ off = 0;
+ } while (len32);
+
+ vcn_next = vcn + clen;
+ if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) ||
+ vcn != vcn_next) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ lbo = ((u64)lcn << cluster_bits);
+ len = ((u64)clen << cluster_bits);
+ }
+
+out:
+ while (nbh) {
+ put_bh(nb->bh[--nbh]);
+ nb->bh[nbh] = NULL;
+ }
+
+ nb->nbufs = 0;
+
+ return err;
+}
+
+int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
+ struct ntfs_buffers *nb, int sync)
+{
+ int err = 0;
+ struct super_block *sb = sbi->sb;
+ u32 block_size = sb->s_blocksize;
+ u32 bytes = nb->bytes;
+ u32 off = nb->off;
+ u16 fo = le16_to_cpu(rhdr->fix_off);
+ u16 fn = le16_to_cpu(rhdr->fix_num);
+ u32 idx;
+ __le16 *fixup;
+ __le16 sample;
+
+ if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- ||
+ fn * SECTOR_SIZE > bytes) {
+ return -EINVAL;
+ }
+
+ for (idx = 0; bytes && idx < nb->nbufs; idx += 1, off = 0) {
+ u32 op = block_size - off;
+ char *bh_data;
+ struct buffer_head *bh = nb->bh[idx];
+ __le16 *ptr, *end_data;
+
+ if (op > bytes)
+ op = bytes;
+
+ if (buffer_locked(bh))
+ __wait_on_buffer(bh);
+
+ lock_buffer(bh);
+
+ bh_data = bh->b_data + off;
+ end_data = Add2Ptr(bh_data, op);
+ memcpy(bh_data, rhdr, op);
+
+ if (!idx) {
+ u16 t16;
+
+ fixup = Add2Ptr(bh_data, fo);
+ sample = *fixup;
+ t16 = le16_to_cpu(sample);
+ if (t16 >= 0x7FFF) {
+ sample = *fixup = cpu_to_le16(1);
+ } else {
+ sample = cpu_to_le16(t16 + 1);
+ *fixup = sample;
+ }
+
+ *(__le16 *)Add2Ptr(rhdr, fo) = sample;
+ }
+
+ ptr = Add2Ptr(bh_data, SECTOR_SIZE - sizeof(short));
+
+ do {
+ *++fixup = *ptr;
+ *ptr = sample;
+ ptr += SECTOR_SIZE / sizeof(short);
+ } while (ptr < end_data);
+
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+ unlock_buffer(bh);
+
+ if (sync) {
+ int err2 = sync_dirty_buffer(bh);
+
+ if (!err && err2)
+ err = err2;
+ }
+
+ bytes -= op;
+ rhdr = Add2Ptr(rhdr, op);
+ }
+
+ return err;
+}
+
+/*
+ * ntfs_bio_pages - Read/write pages from/to disk.
+ */
+int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run,
+ struct page **pages, u32 nr_pages, u64 vbo, u32 bytes,
+ enum req_op op)
+{
+ int err = 0;
+ struct bio *new, *bio = NULL;
+ struct super_block *sb = sbi->sb;
+ struct block_device *bdev = sb->s_bdev;
+ struct page *page;
+ u8 cluster_bits = sbi->cluster_bits;
+ CLST lcn, clen, vcn, vcn_next;
+ u32 add, off, page_idx;
+ u64 lbo, len;
+ size_t run_idx;
+ struct blk_plug plug;
+
+ if (!bytes)
+ return 0;
+
+ blk_start_plug(&plug);
+
+ /* Align vbo and bytes to be 512 bytes aligned. */
+ lbo = (vbo + bytes + 511) & ~511ull;
+ vbo = vbo & ~511ull;
+ bytes = lbo - vbo;
+
+ vcn = vbo >> cluster_bits;
+ if (!run_lookup_entry(run, vcn, &lcn, &clen, &run_idx)) {
+ err = -ENOENT;
+ goto out;
+ }
+ off = vbo & sbi->cluster_mask;
+ page_idx = 0;
+ page = pages[0];
+
+ for (;;) {
+ lbo = ((u64)lcn << cluster_bits) + off;
+ len = ((u64)clen << cluster_bits) - off;
+new_bio:
+ new = bio_alloc(bdev, nr_pages - page_idx, op, GFP_NOFS);
+ if (bio) {
+ bio_chain(bio, new);
+ submit_bio(bio);
+ }
+ bio = new;
+ bio->bi_iter.bi_sector = lbo >> 9;
+
+ while (len) {
+ off = vbo & (PAGE_SIZE - 1);
+ add = off + len > PAGE_SIZE ? (PAGE_SIZE - off) : len;
+
+ if (bio_add_page(bio, page, add, off) < add)
+ goto new_bio;
+
+ if (bytes <= add)
+ goto out;
+ bytes -= add;
+ vbo += add;
+
+ if (add + off == PAGE_SIZE) {
+ page_idx += 1;
+ if (WARN_ON(page_idx >= nr_pages)) {
+ err = -EINVAL;
+ goto out;
+ }
+ page = pages[page_idx];
+ }
+
+ if (len <= add)
+ break;
+ len -= add;
+ lbo += add;
+ }
+
+ vcn_next = vcn + clen;
+ if (!run_get_entry(run, ++run_idx, &vcn, &lcn, &clen) ||
+ vcn != vcn_next) {
+ err = -ENOENT;
+ goto out;
+ }
+ off = 0;
+ }
+out:
+ if (bio) {
+ if (!err)
+ err = submit_bio_wait(bio);
+ bio_put(bio);
+ }
+ blk_finish_plug(&plug);
+
+ return err;
+}
+
+/*
+ * ntfs_bio_fill_1 - Helper for ntfs_loadlog_and_replay().
+ *
+ * Fill on-disk logfile range by (-1)
+ * this means empty logfile.
+ */
+int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run)
+{
+ int err = 0;
+ struct super_block *sb = sbi->sb;
+ struct block_device *bdev = sb->s_bdev;
+ u8 cluster_bits = sbi->cluster_bits;
+ struct bio *new, *bio = NULL;
+ CLST lcn, clen;
+ u64 lbo, len;
+ size_t run_idx;
+ struct page *fill;
+ void *kaddr;
+ struct blk_plug plug;
+
+ fill = alloc_page(GFP_KERNEL);
+ if (!fill)
+ return -ENOMEM;
+
+ kaddr = kmap_atomic(fill);
+ memset(kaddr, -1, PAGE_SIZE);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(fill);
+ lock_page(fill);
+
+ if (!run_lookup_entry(run, 0, &lcn, &clen, &run_idx)) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ /*
+ * TODO: Try blkdev_issue_write_same.
+ */
+ blk_start_plug(&plug);
+ do {
+ lbo = (u64)lcn << cluster_bits;
+ len = (u64)clen << cluster_bits;
+new_bio:
+ new = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_WRITE, GFP_NOFS);
+ if (bio) {
+ bio_chain(bio, new);
+ submit_bio(bio);
+ }
+ bio = new;
+ bio->bi_iter.bi_sector = lbo >> 9;
+
+ for (;;) {
+ u32 add = len > PAGE_SIZE ? PAGE_SIZE : len;
+
+ if (bio_add_page(bio, fill, add, 0) < add)
+ goto new_bio;
+
+ lbo += add;
+ if (len <= add)
+ break;
+ len -= add;
+ }
+ } while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen));
+
+ if (!err)
+ err = submit_bio_wait(bio);
+ bio_put(bio);
+
+ blk_finish_plug(&plug);
+out:
+ unlock_page(fill);
+ put_page(fill);
+
+ return err;
+}
+
+int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run,
+ u64 vbo, u64 *lbo, u64 *bytes)
+{
+ u32 off;
+ CLST lcn, len;
+ u8 cluster_bits = sbi->cluster_bits;
+
+ if (!run_lookup_entry(run, vbo >> cluster_bits, &lcn, &len, NULL))
+ return -ENOENT;
+
+ off = vbo & sbi->cluster_mask;
+ *lbo = lcn == SPARSE_LCN ? -1 : (((u64)lcn << cluster_bits) + off);
+ *bytes = ((u64)len << cluster_bits) - off;
+
+ return 0;
+}
+
+struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, bool dir)
+{
+ int err = 0;
+ struct super_block *sb = sbi->sb;
+ struct inode *inode = new_inode(sb);
+ struct ntfs_inode *ni;
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ ni = ntfs_i(inode);
+
+ err = mi_format_new(&ni->mi, sbi, rno, dir ? RECORD_FLAG_DIR : 0,
+ false);
+ if (err)
+ goto out;
+
+ inode->i_ino = rno;
+ if (insert_inode_locked(inode) < 0) {
+ err = -EIO;
+ goto out;
+ }
+
+out:
+ if (err) {
+ iput(inode);
+ ni = ERR_PTR(err);
+ }
+ return ni;
+}
+
+/*
+ * O:BAG:BAD:(A;OICI;FA;;;WD)
+ * Owner S-1-5-32-544 (Administrators)
+ * Group S-1-5-32-544 (Administrators)
+ * ACE: allow S-1-1-0 (Everyone) with FILE_ALL_ACCESS
+ */
+const u8 s_default_security[] __aligned(8) = {
+ 0x01, 0x00, 0x04, 0x80, 0x30, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x1C, 0x00,
+ 0x01, 0x00, 0x00, 0x00, 0x00, 0x03, 0x14, 0x00, 0xFF, 0x01, 0x1F, 0x00,
+ 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00,
+ 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x20, 0x00, 0x00, 0x00,
+ 0x20, 0x02, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05,
+ 0x20, 0x00, 0x00, 0x00, 0x20, 0x02, 0x00, 0x00,
+};
+
+static_assert(sizeof(s_default_security) == 0x50);
+
+static inline u32 sid_length(const struct SID *sid)
+{
+ return struct_size(sid, SubAuthority, sid->SubAuthorityCount);
+}
+
+/*
+ * is_acl_valid
+ *
+ * Thanks Mark Harmstone for idea.
+ */
+static bool is_acl_valid(const struct ACL *acl, u32 len)
+{
+ const struct ACE_HEADER *ace;
+ u32 i;
+ u16 ace_count, ace_size;
+
+ if (acl->AclRevision != ACL_REVISION &&
+ acl->AclRevision != ACL_REVISION_DS) {
+ /*
+ * This value should be ACL_REVISION, unless the ACL contains an
+ * object-specific ACE, in which case this value must be ACL_REVISION_DS.
+ * All ACEs in an ACL must be at the same revision level.
+ */
+ return false;
+ }
+
+ if (acl->Sbz1)
+ return false;
+
+ if (le16_to_cpu(acl->AclSize) > len)
+ return false;
+
+ if (acl->Sbz2)
+ return false;
+
+ len -= sizeof(struct ACL);
+ ace = (struct ACE_HEADER *)&acl[1];
+ ace_count = le16_to_cpu(acl->AceCount);
+
+ for (i = 0; i < ace_count; i++) {
+ if (len < sizeof(struct ACE_HEADER))
+ return false;
+
+ ace_size = le16_to_cpu(ace->AceSize);
+ if (len < ace_size)
+ return false;
+
+ len -= ace_size;
+ ace = Add2Ptr(ace, ace_size);
+ }
+
+ return true;
+}
+
+bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len)
+{
+ u32 sd_owner, sd_group, sd_sacl, sd_dacl;
+
+ if (len < sizeof(struct SECURITY_DESCRIPTOR_RELATIVE))
+ return false;
+
+ if (sd->Revision != 1)
+ return false;
+
+ if (sd->Sbz1)
+ return false;
+
+ if (!(sd->Control & SE_SELF_RELATIVE))
+ return false;
+
+ sd_owner = le32_to_cpu(sd->Owner);
+ if (sd_owner) {
+ const struct SID *owner = Add2Ptr(sd, sd_owner);
+
+ if (sd_owner + offsetof(struct SID, SubAuthority) > len)
+ return false;
+
+ if (owner->Revision != 1)
+ return false;
+
+ if (sd_owner + sid_length(owner) > len)
+ return false;
+ }
+
+ sd_group = le32_to_cpu(sd->Group);
+ if (sd_group) {
+ const struct SID *group = Add2Ptr(sd, sd_group);
+
+ if (sd_group + offsetof(struct SID, SubAuthority) > len)
+ return false;
+
+ if (group->Revision != 1)
+ return false;
+
+ if (sd_group + sid_length(group) > len)
+ return false;
+ }
+
+ sd_sacl = le32_to_cpu(sd->Sacl);
+ if (sd_sacl) {
+ const struct ACL *sacl = Add2Ptr(sd, sd_sacl);
+
+ if (sd_sacl + sizeof(struct ACL) > len)
+ return false;
+
+ if (!is_acl_valid(sacl, len - sd_sacl))
+ return false;
+ }
+
+ sd_dacl = le32_to_cpu(sd->Dacl);
+ if (sd_dacl) {
+ const struct ACL *dacl = Add2Ptr(sd, sd_dacl);
+
+ if (sd_dacl + sizeof(struct ACL) > len)
+ return false;
+
+ if (!is_acl_valid(dacl, len - sd_dacl))
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * ntfs_security_init - Load and parse $Secure.
+ */
+int ntfs_security_init(struct ntfs_sb_info *sbi)
+{
+ int err;
+ struct super_block *sb = sbi->sb;
+ struct inode *inode;
+ struct ntfs_inode *ni;
+ struct MFT_REF ref;
+ struct ATTRIB *attr;
+ struct ATTR_LIST_ENTRY *le;
+ u64 sds_size;
+ size_t off;
+ struct NTFS_DE *ne;
+ struct NTFS_DE_SII *sii_e;
+ struct ntfs_fnd *fnd_sii = NULL;
+ const struct INDEX_ROOT *root_sii;
+ const struct INDEX_ROOT *root_sdh;
+ struct ntfs_index *indx_sdh = &sbi->security.index_sdh;
+ struct ntfs_index *indx_sii = &sbi->security.index_sii;
+
+ ref.low = cpu_to_le32(MFT_REC_SECURE);
+ ref.high = 0;
+ ref.seq = cpu_to_le16(MFT_REC_SECURE);
+
+ inode = ntfs_iget5(sb, &ref, &NAME_SECURE);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ ntfs_err(sb, "Failed to load $Secure.");
+ inode = NULL;
+ goto out;
+ }
+
+ ni = ntfs_i(inode);
+
+ le = NULL;
+
+ attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SDH_NAME,
+ ARRAY_SIZE(SDH_NAME), NULL, NULL);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ root_sdh = resident_data_ex(attr, sizeof(struct INDEX_ROOT));
+ if (root_sdh->type != ATTR_ZERO ||
+ root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH ||
+ offsetof(struct INDEX_ROOT, ihdr) + root_sdh->ihdr.used > attr->res.data_size) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = indx_init(indx_sdh, sbi, attr, INDEX_MUTEX_SDH);
+ if (err)
+ goto out;
+
+ attr = ni_find_attr(ni, attr, &le, ATTR_ROOT, SII_NAME,
+ ARRAY_SIZE(SII_NAME), NULL, NULL);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ root_sii = resident_data_ex(attr, sizeof(struct INDEX_ROOT));
+ if (root_sii->type != ATTR_ZERO ||
+ root_sii->rule != NTFS_COLLATION_TYPE_UINT ||
+ offsetof(struct INDEX_ROOT, ihdr) + root_sii->ihdr.used > attr->res.data_size) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = indx_init(indx_sii, sbi, attr, INDEX_MUTEX_SII);
+ if (err)
+ goto out;
+
+ fnd_sii = fnd_get();
+ if (!fnd_sii) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ sds_size = inode->i_size;
+
+ /* Find the last valid Id. */
+ sbi->security.next_id = SECURITY_ID_FIRST;
+ /* Always write new security at the end of bucket. */
+ sbi->security.next_off =
+ ALIGN(sds_size - SecurityDescriptorsBlockSize, 16);
+
+ off = 0;
+ ne = NULL;
+
+ for (;;) {
+ u32 next_id;
+
+ err = indx_find_raw(indx_sii, ni, root_sii, &ne, &off, fnd_sii);
+ if (err || !ne)
+ break;
+
+ sii_e = (struct NTFS_DE_SII *)ne;
+ if (le16_to_cpu(ne->view.data_size) < SIZEOF_SECURITY_HDR)
+ continue;
+
+ next_id = le32_to_cpu(sii_e->sec_id) + 1;
+ if (next_id >= sbi->security.next_id)
+ sbi->security.next_id = next_id;
+ }
+
+ sbi->security.ni = ni;
+ inode = NULL;
+out:
+ iput(inode);
+ fnd_put(fnd_sii);
+
+ return err;
+}
+
+/*
+ * ntfs_get_security_by_id - Read security descriptor by id.
+ */
+int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id,
+ struct SECURITY_DESCRIPTOR_RELATIVE **sd,
+ size_t *size)
+{
+ int err;
+ int diff;
+ struct ntfs_inode *ni = sbi->security.ni;
+ struct ntfs_index *indx = &sbi->security.index_sii;
+ void *p = NULL;
+ struct NTFS_DE_SII *sii_e;
+ struct ntfs_fnd *fnd_sii;
+ struct SECURITY_HDR d_security;
+ const struct INDEX_ROOT *root_sii;
+ u32 t32;
+
+ *sd = NULL;
+
+ mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_SECURITY);
+
+ fnd_sii = fnd_get();
+ if (!fnd_sii) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ root_sii = indx_get_root(indx, ni, NULL, NULL);
+ if (!root_sii) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Try to find this SECURITY descriptor in SII indexes. */
+ err = indx_find(indx, ni, root_sii, &security_id, sizeof(security_id),
+ NULL, &diff, (struct NTFS_DE **)&sii_e, fnd_sii);
+ if (err)
+ goto out;
+
+ if (diff)
+ goto out;
+
+ t32 = le32_to_cpu(sii_e->sec_hdr.size);
+ if (t32 < SIZEOF_SECURITY_HDR) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (t32 > SIZEOF_SECURITY_HDR + 0x10000) {
+ /* Looks like too big security. 0x10000 - is arbitrary big number. */
+ err = -EFBIG;
+ goto out;
+ }
+
+ *size = t32 - SIZEOF_SECURITY_HDR;
+
+ p = kmalloc(*size, GFP_NOFS);
+ if (!p) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ntfs_read_run_nb(sbi, &ni->file.run,
+ le64_to_cpu(sii_e->sec_hdr.off), &d_security,
+ sizeof(d_security), NULL);
+ if (err)
+ goto out;
+
+ if (memcmp(&d_security, &sii_e->sec_hdr, SIZEOF_SECURITY_HDR)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = ntfs_read_run_nb(sbi, &ni->file.run,
+ le64_to_cpu(sii_e->sec_hdr.off) +
+ SIZEOF_SECURITY_HDR,
+ p, *size, NULL);
+ if (err)
+ goto out;
+
+ *sd = p;
+ p = NULL;
+
+out:
+ kfree(p);
+ fnd_put(fnd_sii);
+ ni_unlock(ni);
+
+ return err;
+}
+
+/*
+ * ntfs_insert_security - Insert security descriptor into $Secure::SDS.
+ *
+ * SECURITY Descriptor Stream data is organized into chunks of 256K bytes
+ * and it contains a mirror copy of each security descriptor. When writing
+ * to a security descriptor at location X, another copy will be written at
+ * location (X+256K).
+ * When writing a security descriptor that will cross the 256K boundary,
+ * the pointer will be advanced by 256K to skip
+ * over the mirror portion.
+ */
+int ntfs_insert_security(struct ntfs_sb_info *sbi,
+ const struct SECURITY_DESCRIPTOR_RELATIVE *sd,
+ u32 size_sd, __le32 *security_id, bool *inserted)
+{
+ int err, diff;
+ struct ntfs_inode *ni = sbi->security.ni;
+ struct ntfs_index *indx_sdh = &sbi->security.index_sdh;
+ struct ntfs_index *indx_sii = &sbi->security.index_sii;
+ struct NTFS_DE_SDH *e;
+ struct NTFS_DE_SDH sdh_e;
+ struct NTFS_DE_SII sii_e;
+ struct SECURITY_HDR *d_security;
+ u32 new_sec_size = size_sd + SIZEOF_SECURITY_HDR;
+ u32 aligned_sec_size = ALIGN(new_sec_size, 16);
+ struct SECURITY_KEY hash_key;
+ struct ntfs_fnd *fnd_sdh = NULL;
+ const struct INDEX_ROOT *root_sdh;
+ const struct INDEX_ROOT *root_sii;
+ u64 mirr_off, new_sds_size;
+ u32 next, left;
+
+ static_assert((1 << Log2OfSecurityDescriptorsBlockSize) ==
+ SecurityDescriptorsBlockSize);
+
+ hash_key.hash = security_hash(sd, size_sd);
+ hash_key.sec_id = SECURITY_ID_INVALID;
+
+ if (inserted)
+ *inserted = false;
+ *security_id = SECURITY_ID_INVALID;
+
+ /* Allocate a temporal buffer. */
+ d_security = kzalloc(aligned_sec_size, GFP_NOFS);
+ if (!d_security)
+ return -ENOMEM;
+
+ mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_SECURITY);
+
+ fnd_sdh = fnd_get();
+ if (!fnd_sdh) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ root_sdh = indx_get_root(indx_sdh, ni, NULL, NULL);
+ if (!root_sdh) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ root_sii = indx_get_root(indx_sii, ni, NULL, NULL);
+ if (!root_sii) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Check if such security already exists.
+ * Use "SDH" and hash -> to get the offset in "SDS".
+ */
+ err = indx_find(indx_sdh, ni, root_sdh, &hash_key, sizeof(hash_key),
+ &d_security->key.sec_id, &diff, (struct NTFS_DE **)&e,
+ fnd_sdh);
+ if (err)
+ goto out;
+
+ while (e) {
+ if (le32_to_cpu(e->sec_hdr.size) == new_sec_size) {
+ err = ntfs_read_run_nb(sbi, &ni->file.run,
+ le64_to_cpu(e->sec_hdr.off),
+ d_security, new_sec_size, NULL);
+ if (err)
+ goto out;
+
+ if (le32_to_cpu(d_security->size) == new_sec_size &&
+ d_security->key.hash == hash_key.hash &&
+ !memcmp(d_security + 1, sd, size_sd)) {
+ *security_id = d_security->key.sec_id;
+ /* Such security already exists. */
+ err = 0;
+ goto out;
+ }
+ }
+
+ err = indx_find_sort(indx_sdh, ni, root_sdh,
+ (struct NTFS_DE **)&e, fnd_sdh);
+ if (err)
+ goto out;
+
+ if (!e || e->key.hash != hash_key.hash)
+ break;
+ }
+
+ /* Zero unused space. */
+ next = sbi->security.next_off & (SecurityDescriptorsBlockSize - 1);
+ left = SecurityDescriptorsBlockSize - next;
+
+ /* Zero gap until SecurityDescriptorsBlockSize. */
+ if (left < new_sec_size) {
+ /* Zero "left" bytes from sbi->security.next_off. */
+ sbi->security.next_off += SecurityDescriptorsBlockSize + left;
+ }
+
+ /* Zero tail of previous security. */
+ //used = ni->vfs_inode.i_size & (SecurityDescriptorsBlockSize - 1);
+
+ /*
+ * Example:
+ * 0x40438 == ni->vfs_inode.i_size
+ * 0x00440 == sbi->security.next_off
+ * need to zero [0x438-0x440)
+ * if (next > used) {
+ * u32 tozero = next - used;
+ * zero "tozero" bytes from sbi->security.next_off - tozero
+ */
+
+ /* Format new security descriptor. */
+ d_security->key.hash = hash_key.hash;
+ d_security->key.sec_id = cpu_to_le32(sbi->security.next_id);
+ d_security->off = cpu_to_le64(sbi->security.next_off);
+ d_security->size = cpu_to_le32(new_sec_size);
+ memcpy(d_security + 1, sd, size_sd);
+
+ /* Write main SDS bucket. */
+ err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off,
+ d_security, aligned_sec_size, 0);
+
+ if (err)
+ goto out;
+
+ mirr_off = sbi->security.next_off + SecurityDescriptorsBlockSize;
+ new_sds_size = mirr_off + aligned_sec_size;
+
+ if (new_sds_size > ni->vfs_inode.i_size) {
+ err = attr_set_size(ni, ATTR_DATA, SDS_NAME,
+ ARRAY_SIZE(SDS_NAME), &ni->file.run,
+ new_sds_size, &new_sds_size, false, NULL);
+ if (err)
+ goto out;
+ }
+
+ /* Write copy SDS bucket. */
+ err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security,
+ aligned_sec_size, 0);
+ if (err)
+ goto out;
+
+ /* Fill SII entry. */
+ sii_e.de.view.data_off =
+ cpu_to_le16(offsetof(struct NTFS_DE_SII, sec_hdr));
+ sii_e.de.view.data_size = cpu_to_le16(SIZEOF_SECURITY_HDR);
+ sii_e.de.view.res = 0;
+ sii_e.de.size = cpu_to_le16(SIZEOF_SII_DIRENTRY);
+ sii_e.de.key_size = cpu_to_le16(sizeof(d_security->key.sec_id));
+ sii_e.de.flags = 0;
+ sii_e.de.res = 0;
+ sii_e.sec_id = d_security->key.sec_id;
+ memcpy(&sii_e.sec_hdr, d_security, SIZEOF_SECURITY_HDR);
+
+ err = indx_insert_entry(indx_sii, ni, &sii_e.de, NULL, NULL, 0);
+ if (err)
+ goto out;
+
+ /* Fill SDH entry. */
+ sdh_e.de.view.data_off =
+ cpu_to_le16(offsetof(struct NTFS_DE_SDH, sec_hdr));
+ sdh_e.de.view.data_size = cpu_to_le16(SIZEOF_SECURITY_HDR);
+ sdh_e.de.view.res = 0;
+ sdh_e.de.size = cpu_to_le16(SIZEOF_SDH_DIRENTRY);
+ sdh_e.de.key_size = cpu_to_le16(sizeof(sdh_e.key));
+ sdh_e.de.flags = 0;
+ sdh_e.de.res = 0;
+ sdh_e.key.hash = d_security->key.hash;
+ sdh_e.key.sec_id = d_security->key.sec_id;
+ memcpy(&sdh_e.sec_hdr, d_security, SIZEOF_SECURITY_HDR);
+ sdh_e.magic[0] = cpu_to_le16('I');
+ sdh_e.magic[1] = cpu_to_le16('I');
+
+ fnd_clear(fnd_sdh);
+ err = indx_insert_entry(indx_sdh, ni, &sdh_e.de, (void *)(size_t)1,
+ fnd_sdh, 0);
+ if (err)
+ goto out;
+
+ *security_id = d_security->key.sec_id;
+ if (inserted)
+ *inserted = true;
+
+ /* Update Id and offset for next descriptor. */
+ sbi->security.next_id += 1;
+ sbi->security.next_off += aligned_sec_size;
+
+out:
+ fnd_put(fnd_sdh);
+ mark_inode_dirty(&ni->vfs_inode);
+ ni_unlock(ni);
+ kfree(d_security);
+
+ return err;
+}
+
+/*
+ * ntfs_reparse_init - Load and parse $Extend/$Reparse.
+ */
+int ntfs_reparse_init(struct ntfs_sb_info *sbi)
+{
+ int err;
+ struct ntfs_inode *ni = sbi->reparse.ni;
+ struct ntfs_index *indx = &sbi->reparse.index_r;
+ struct ATTRIB *attr;
+ struct ATTR_LIST_ENTRY *le;
+ const struct INDEX_ROOT *root_r;
+
+ if (!ni)
+ return 0;
+
+ le = NULL;
+ attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SR_NAME,
+ ARRAY_SIZE(SR_NAME), NULL, NULL);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ root_r = resident_data(attr);
+ if (root_r->type != ATTR_ZERO ||
+ root_r->rule != NTFS_COLLATION_TYPE_UINTS) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = indx_init(indx, sbi, attr, INDEX_MUTEX_SR);
+ if (err)
+ goto out;
+
+out:
+ return err;
+}
+
+/*
+ * ntfs_objid_init - Load and parse $Extend/$ObjId.
+ */
+int ntfs_objid_init(struct ntfs_sb_info *sbi)
+{
+ int err;
+ struct ntfs_inode *ni = sbi->objid.ni;
+ struct ntfs_index *indx = &sbi->objid.index_o;
+ struct ATTRIB *attr;
+ struct ATTR_LIST_ENTRY *le;
+ const struct INDEX_ROOT *root;
+
+ if (!ni)
+ return 0;
+
+ le = NULL;
+ attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SO_NAME,
+ ARRAY_SIZE(SO_NAME), NULL, NULL);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ root = resident_data(attr);
+ if (root->type != ATTR_ZERO ||
+ root->rule != NTFS_COLLATION_TYPE_UINTS) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = indx_init(indx, sbi, attr, INDEX_MUTEX_SO);
+ if (err)
+ goto out;
+
+out:
+ return err;
+}
+
+int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid)
+{
+ int err;
+ struct ntfs_inode *ni = sbi->objid.ni;
+ struct ntfs_index *indx = &sbi->objid.index_o;
+
+ if (!ni)
+ return -EINVAL;
+
+ mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_OBJID);
+
+ err = indx_delete_entry(indx, ni, guid, sizeof(*guid), NULL);
+
+ mark_inode_dirty(&ni->vfs_inode);
+ ni_unlock(ni);
+
+ return err;
+}
+
+int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag,
+ const struct MFT_REF *ref)
+{
+ int err;
+ struct ntfs_inode *ni = sbi->reparse.ni;
+ struct ntfs_index *indx = &sbi->reparse.index_r;
+ struct NTFS_DE_R re;
+
+ if (!ni)
+ return -EINVAL;
+
+ memset(&re, 0, sizeof(re));
+
+ re.de.view.data_off = cpu_to_le16(offsetof(struct NTFS_DE_R, zero));
+ re.de.size = cpu_to_le16(sizeof(struct NTFS_DE_R));
+ re.de.key_size = cpu_to_le16(sizeof(re.key));
+
+ re.key.ReparseTag = rtag;
+ memcpy(&re.key.ref, ref, sizeof(*ref));
+
+ mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_REPARSE);
+
+ err = indx_insert_entry(indx, ni, &re.de, NULL, NULL, 0);
+
+ mark_inode_dirty(&ni->vfs_inode);
+ ni_unlock(ni);
+
+ return err;
+}
+
+int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag,
+ const struct MFT_REF *ref)
+{
+ int err, diff;
+ struct ntfs_inode *ni = sbi->reparse.ni;
+ struct ntfs_index *indx = &sbi->reparse.index_r;
+ struct ntfs_fnd *fnd = NULL;
+ struct REPARSE_KEY rkey;
+ struct NTFS_DE_R *re;
+ struct INDEX_ROOT *root_r;
+
+ if (!ni)
+ return -EINVAL;
+
+ rkey.ReparseTag = rtag;
+ rkey.ref = *ref;
+
+ mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_REPARSE);
+
+ if (rtag) {
+ err = indx_delete_entry(indx, ni, &rkey, sizeof(rkey), NULL);
+ goto out1;
+ }
+
+ fnd = fnd_get();
+ if (!fnd) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ root_r = indx_get_root(indx, ni, NULL, NULL);
+ if (!root_r) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* 1 - forces to ignore rkey.ReparseTag when comparing keys. */
+ err = indx_find(indx, ni, root_r, &rkey, sizeof(rkey), (void *)1, &diff,
+ (struct NTFS_DE **)&re, fnd);
+ if (err)
+ goto out;
+
+ if (memcmp(&re->key.ref, ref, sizeof(*ref))) {
+ /* Impossible. Looks like volume corrupt? */
+ goto out;
+ }
+
+ memcpy(&rkey, &re->key, sizeof(rkey));
+
+ fnd_put(fnd);
+ fnd = NULL;
+
+ err = indx_delete_entry(indx, ni, &rkey, sizeof(rkey), NULL);
+ if (err)
+ goto out;
+
+out:
+ fnd_put(fnd);
+
+out1:
+ mark_inode_dirty(&ni->vfs_inode);
+ ni_unlock(ni);
+
+ return err;
+}
+
+static inline void ntfs_unmap_and_discard(struct ntfs_sb_info *sbi, CLST lcn,
+ CLST len)
+{
+ ntfs_unmap_meta(sbi->sb, lcn, len);
+ ntfs_discard(sbi, lcn, len);
+}
+
+void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim)
+{
+ CLST end, i, zone_len, zlen;
+ struct wnd_bitmap *wnd = &sbi->used.bitmap;
+ bool dirty = false;
+
+ down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
+ if (!wnd_is_used(wnd, lcn, len)) {
+ /* mark volume as dirty out of wnd->rw_lock */
+ dirty = true;
+
+ end = lcn + len;
+ len = 0;
+ for (i = lcn; i < end; i++) {
+ if (wnd_is_used(wnd, i, 1)) {
+ if (!len)
+ lcn = i;
+ len += 1;
+ continue;
+ }
+
+ if (!len)
+ continue;
+
+ if (trim)
+ ntfs_unmap_and_discard(sbi, lcn, len);
+
+ wnd_set_free(wnd, lcn, len);
+ len = 0;
+ }
+
+ if (!len)
+ goto out;
+ }
+
+ if (trim)
+ ntfs_unmap_and_discard(sbi, lcn, len);
+ wnd_set_free(wnd, lcn, len);
+
+ /* append to MFT zone, if possible. */
+ zone_len = wnd_zone_len(wnd);
+ zlen = min(zone_len + len, sbi->zone_max);
+
+ if (zlen == zone_len) {
+ /* MFT zone already has maximum size. */
+ } else if (!zone_len) {
+ /* Create MFT zone only if 'zlen' is large enough. */
+ if (zlen == sbi->zone_max)
+ wnd_zone_set(wnd, lcn, zlen);
+ } else {
+ CLST zone_lcn = wnd_zone_bit(wnd);
+
+ if (lcn + len == zone_lcn) {
+ /* Append into head MFT zone. */
+ wnd_zone_set(wnd, lcn, zlen);
+ } else if (zone_lcn + zone_len == lcn) {
+ /* Append into tail MFT zone. */
+ wnd_zone_set(wnd, zone_lcn, zlen);
+ }
+ }
+
+out:
+ up_write(&wnd->rw_lock);
+ if (dirty)
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+}
+
+/*
+ * run_deallocate - Deallocate clusters.
+ */
+int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim)
+{
+ CLST lcn, len;
+ size_t idx = 0;
+
+ while (run_get_entry(run, idx++, NULL, &lcn, &len)) {
+ if (lcn == SPARSE_LCN)
+ continue;
+
+ mark_as_free_ex(sbi, lcn, len, trim);
+ }
+
+ return 0;
+}
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
new file mode 100644
index 000000000..b89a33f57
--- /dev/null
+++ b/fs/ntfs3/index.c
@@ -0,0 +1,2668 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+static const struct INDEX_NAMES {
+ const __le16 *name;
+ u8 name_len;
+} s_index_names[INDEX_MUTEX_TOTAL] = {
+ { I30_NAME, ARRAY_SIZE(I30_NAME) }, { SII_NAME, ARRAY_SIZE(SII_NAME) },
+ { SDH_NAME, ARRAY_SIZE(SDH_NAME) }, { SO_NAME, ARRAY_SIZE(SO_NAME) },
+ { SQ_NAME, ARRAY_SIZE(SQ_NAME) }, { SR_NAME, ARRAY_SIZE(SR_NAME) },
+};
+
+/*
+ * cmp_fnames - Compare two names in index.
+ *
+ * if l1 != 0
+ * Both names are little endian on-disk ATTR_FILE_NAME structs.
+ * else
+ * key1 - cpu_str, key2 - ATTR_FILE_NAME
+ */
+static int cmp_fnames(const void *key1, size_t l1, const void *key2, size_t l2,
+ const void *data)
+{
+ const struct ATTR_FILE_NAME *f2 = key2;
+ const struct ntfs_sb_info *sbi = data;
+ const struct ATTR_FILE_NAME *f1;
+ u16 fsize2;
+ bool both_case;
+
+ if (l2 <= offsetof(struct ATTR_FILE_NAME, name))
+ return -1;
+
+ fsize2 = fname_full_size(f2);
+ if (l2 < fsize2)
+ return -1;
+
+ both_case = f2->type != FILE_NAME_DOS /*&& !sbi->options.nocase*/;
+ if (!l1) {
+ const struct le_str *s2 = (struct le_str *)&f2->name_len;
+
+ /*
+ * If names are equal (case insensitive)
+ * try to compare it case sensitive.
+ */
+ return ntfs_cmp_names_cpu(key1, s2, sbi->upcase, both_case);
+ }
+
+ f1 = key1;
+ return ntfs_cmp_names(f1->name, f1->name_len, f2->name, f2->name_len,
+ sbi->upcase, both_case);
+}
+
+/*
+ * cmp_uint - $SII of $Secure and $Q of Quota
+ */
+static int cmp_uint(const void *key1, size_t l1, const void *key2, size_t l2,
+ const void *data)
+{
+ const u32 *k1 = key1;
+ const u32 *k2 = key2;
+
+ if (l2 < sizeof(u32))
+ return -1;
+
+ if (*k1 < *k2)
+ return -1;
+ if (*k1 > *k2)
+ return 1;
+ return 0;
+}
+
+/*
+ * cmp_sdh - $SDH of $Secure
+ */
+static int cmp_sdh(const void *key1, size_t l1, const void *key2, size_t l2,
+ const void *data)
+{
+ const struct SECURITY_KEY *k1 = key1;
+ const struct SECURITY_KEY *k2 = key2;
+ u32 t1, t2;
+
+ if (l2 < sizeof(struct SECURITY_KEY))
+ return -1;
+
+ t1 = le32_to_cpu(k1->hash);
+ t2 = le32_to_cpu(k2->hash);
+
+ /* First value is a hash value itself. */
+ if (t1 < t2)
+ return -1;
+ if (t1 > t2)
+ return 1;
+
+ /* Second value is security Id. */
+ if (data) {
+ t1 = le32_to_cpu(k1->sec_id);
+ t2 = le32_to_cpu(k2->sec_id);
+ if (t1 < t2)
+ return -1;
+ if (t1 > t2)
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * cmp_uints - $O of ObjId and "$R" for Reparse.
+ */
+static int cmp_uints(const void *key1, size_t l1, const void *key2, size_t l2,
+ const void *data)
+{
+ const __le32 *k1 = key1;
+ const __le32 *k2 = key2;
+ size_t count;
+
+ if ((size_t)data == 1) {
+ /*
+ * ni_delete_all -> ntfs_remove_reparse ->
+ * delete all with this reference.
+ * k1, k2 - pointers to REPARSE_KEY
+ */
+
+ k1 += 1; // Skip REPARSE_KEY.ReparseTag
+ k2 += 1; // Skip REPARSE_KEY.ReparseTag
+ if (l2 <= sizeof(int))
+ return -1;
+ l2 -= sizeof(int);
+ if (l1 <= sizeof(int))
+ return 1;
+ l1 -= sizeof(int);
+ }
+
+ if (l2 < sizeof(int))
+ return -1;
+
+ for (count = min(l1, l2) >> 2; count > 0; --count, ++k1, ++k2) {
+ u32 t1 = le32_to_cpu(*k1);
+ u32 t2 = le32_to_cpu(*k2);
+
+ if (t1 > t2)
+ return 1;
+ if (t1 < t2)
+ return -1;
+ }
+
+ if (l1 > l2)
+ return 1;
+ if (l1 < l2)
+ return -1;
+
+ return 0;
+}
+
+static inline NTFS_CMP_FUNC get_cmp_func(const struct INDEX_ROOT *root)
+{
+ switch (root->type) {
+ case ATTR_NAME:
+ if (root->rule == NTFS_COLLATION_TYPE_FILENAME)
+ return &cmp_fnames;
+ break;
+ case ATTR_ZERO:
+ switch (root->rule) {
+ case NTFS_COLLATION_TYPE_UINT:
+ return &cmp_uint;
+ case NTFS_COLLATION_TYPE_SECURITY_HASH:
+ return &cmp_sdh;
+ case NTFS_COLLATION_TYPE_UINTS:
+ return &cmp_uints;
+ default:
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+
+ return NULL;
+}
+
+struct bmp_buf {
+ struct ATTRIB *b;
+ struct mft_inode *mi;
+ struct buffer_head *bh;
+ ulong *buf;
+ size_t bit;
+ u32 nbits;
+ u64 new_valid;
+};
+
+static int bmp_buf_get(struct ntfs_index *indx, struct ntfs_inode *ni,
+ size_t bit, struct bmp_buf *bbuf)
+{
+ struct ATTRIB *b;
+ size_t data_size, valid_size, vbo, off = bit >> 3;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ CLST vcn = off >> sbi->cluster_bits;
+ struct ATTR_LIST_ENTRY *le = NULL;
+ struct buffer_head *bh;
+ struct super_block *sb;
+ u32 blocksize;
+ const struct INDEX_NAMES *in = &s_index_names[indx->type];
+
+ bbuf->bh = NULL;
+
+ b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len,
+ &vcn, &bbuf->mi);
+ bbuf->b = b;
+ if (!b)
+ return -EINVAL;
+
+ if (!b->non_res) {
+ data_size = le32_to_cpu(b->res.data_size);
+
+ if (off >= data_size)
+ return -EINVAL;
+
+ bbuf->buf = (ulong *)resident_data(b);
+ bbuf->bit = 0;
+ bbuf->nbits = data_size * 8;
+
+ return 0;
+ }
+
+ data_size = le64_to_cpu(b->nres.data_size);
+ if (WARN_ON(off >= data_size)) {
+ /* Looks like filesystem error. */
+ return -EINVAL;
+ }
+
+ valid_size = le64_to_cpu(b->nres.valid_size);
+
+ bh = ntfs_bread_run(sbi, &indx->bitmap_run, off);
+ if (!bh)
+ return -EIO;
+
+ if (IS_ERR(bh))
+ return PTR_ERR(bh);
+
+ bbuf->bh = bh;
+
+ if (buffer_locked(bh))
+ __wait_on_buffer(bh);
+
+ lock_buffer(bh);
+
+ sb = sbi->sb;
+ blocksize = sb->s_blocksize;
+
+ vbo = off & ~(size_t)sbi->block_mask;
+
+ bbuf->new_valid = vbo + blocksize;
+ if (bbuf->new_valid <= valid_size)
+ bbuf->new_valid = 0;
+ else if (bbuf->new_valid > data_size)
+ bbuf->new_valid = data_size;
+
+ if (vbo >= valid_size) {
+ memset(bh->b_data, 0, blocksize);
+ } else if (vbo + blocksize > valid_size) {
+ u32 voff = valid_size & sbi->block_mask;
+
+ memset(bh->b_data + voff, 0, blocksize - voff);
+ }
+
+ bbuf->buf = (ulong *)bh->b_data;
+ bbuf->bit = 8 * (off & ~(size_t)sbi->block_mask);
+ bbuf->nbits = 8 * blocksize;
+
+ return 0;
+}
+
+static void bmp_buf_put(struct bmp_buf *bbuf, bool dirty)
+{
+ struct buffer_head *bh = bbuf->bh;
+ struct ATTRIB *b = bbuf->b;
+
+ if (!bh) {
+ if (b && !b->non_res && dirty)
+ bbuf->mi->dirty = true;
+ return;
+ }
+
+ if (!dirty)
+ goto out;
+
+ if (bbuf->new_valid) {
+ b->nres.valid_size = cpu_to_le64(bbuf->new_valid);
+ bbuf->mi->dirty = true;
+ }
+
+ set_buffer_uptodate(bh);
+ mark_buffer_dirty(bh);
+
+out:
+ unlock_buffer(bh);
+ put_bh(bh);
+}
+
+/*
+ * indx_mark_used - Mark the bit @bit as used.
+ */
+static int indx_mark_used(struct ntfs_index *indx, struct ntfs_inode *ni,
+ size_t bit)
+{
+ int err;
+ struct bmp_buf bbuf;
+
+ err = bmp_buf_get(indx, ni, bit, &bbuf);
+ if (err)
+ return err;
+
+ __set_bit(bit - bbuf.bit, bbuf.buf);
+
+ bmp_buf_put(&bbuf, true);
+
+ return 0;
+}
+
+/*
+ * indx_mark_free - Mark the bit @bit as free.
+ */
+static int indx_mark_free(struct ntfs_index *indx, struct ntfs_inode *ni,
+ size_t bit)
+{
+ int err;
+ struct bmp_buf bbuf;
+
+ err = bmp_buf_get(indx, ni, bit, &bbuf);
+ if (err)
+ return err;
+
+ __clear_bit(bit - bbuf.bit, bbuf.buf);
+
+ bmp_buf_put(&bbuf, true);
+
+ return 0;
+}
+
+/*
+ * scan_nres_bitmap
+ *
+ * If ntfs_readdir calls this function (indx_used_bit -> scan_nres_bitmap),
+ * inode is shared locked and no ni_lock.
+ * Use rw_semaphore for read/write access to bitmap_run.
+ */
+static int scan_nres_bitmap(struct ntfs_inode *ni, struct ATTRIB *bitmap,
+ struct ntfs_index *indx, size_t from,
+ bool (*fn)(const ulong *buf, u32 bit, u32 bits,
+ size_t *ret),
+ size_t *ret)
+{
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct super_block *sb = sbi->sb;
+ struct runs_tree *run = &indx->bitmap_run;
+ struct rw_semaphore *lock = &indx->run_lock;
+ u32 nbits = sb->s_blocksize * 8;
+ u32 blocksize = sb->s_blocksize;
+ u64 valid_size = le64_to_cpu(bitmap->nres.valid_size);
+ u64 data_size = le64_to_cpu(bitmap->nres.data_size);
+ sector_t eblock = bytes_to_block(sb, data_size);
+ size_t vbo = from >> 3;
+ sector_t blk = (vbo & sbi->cluster_mask) >> sb->s_blocksize_bits;
+ sector_t vblock = vbo >> sb->s_blocksize_bits;
+ sector_t blen, block;
+ CLST lcn, clen, vcn, vcn_next;
+ size_t idx;
+ struct buffer_head *bh;
+ bool ok;
+
+ *ret = MINUS_ONE_T;
+
+ if (vblock >= eblock)
+ return 0;
+
+ from &= nbits - 1;
+ vcn = vbo >> sbi->cluster_bits;
+
+ down_read(lock);
+ ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx);
+ up_read(lock);
+
+next_run:
+ if (!ok) {
+ int err;
+ const struct INDEX_NAMES *name = &s_index_names[indx->type];
+
+ down_write(lock);
+ err = attr_load_runs_vcn(ni, ATTR_BITMAP, name->name,
+ name->name_len, run, vcn);
+ up_write(lock);
+ if (err)
+ return err;
+ down_read(lock);
+ ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx);
+ up_read(lock);
+ if (!ok)
+ return -EINVAL;
+ }
+
+ blen = (sector_t)clen * sbi->blocks_per_cluster;
+ block = (sector_t)lcn * sbi->blocks_per_cluster;
+
+ for (; blk < blen; blk++, from = 0) {
+ bh = ntfs_bread(sb, block + blk);
+ if (!bh)
+ return -EIO;
+
+ vbo = (u64)vblock << sb->s_blocksize_bits;
+ if (vbo >= valid_size) {
+ memset(bh->b_data, 0, blocksize);
+ } else if (vbo + blocksize > valid_size) {
+ u32 voff = valid_size & sbi->block_mask;
+
+ memset(bh->b_data + voff, 0, blocksize - voff);
+ }
+
+ if (vbo + blocksize > data_size)
+ nbits = 8 * (data_size - vbo);
+
+ ok = nbits > from ? (*fn)((ulong *)bh->b_data, from, nbits, ret)
+ : false;
+ put_bh(bh);
+
+ if (ok) {
+ *ret += 8 * vbo;
+ return 0;
+ }
+
+ if (++vblock >= eblock) {
+ *ret = MINUS_ONE_T;
+ return 0;
+ }
+ }
+ blk = 0;
+ vcn_next = vcn + clen;
+ down_read(lock);
+ ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) && vcn == vcn_next;
+ if (!ok)
+ vcn = vcn_next;
+ up_read(lock);
+ goto next_run;
+}
+
+static bool scan_for_free(const ulong *buf, u32 bit, u32 bits, size_t *ret)
+{
+ size_t pos = find_next_zero_bit(buf, bits, bit);
+
+ if (pos >= bits)
+ return false;
+ *ret = pos;
+ return true;
+}
+
+/*
+ * indx_find_free - Look for free bit.
+ *
+ * Return: -1 if no free bits.
+ */
+static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni,
+ size_t *bit, struct ATTRIB **bitmap)
+{
+ struct ATTRIB *b;
+ struct ATTR_LIST_ENTRY *le = NULL;
+ const struct INDEX_NAMES *in = &s_index_names[indx->type];
+ int err;
+
+ b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len,
+ NULL, NULL);
+
+ if (!b)
+ return -ENOENT;
+
+ *bitmap = b;
+ *bit = MINUS_ONE_T;
+
+ if (!b->non_res) {
+ u32 nbits = 8 * le32_to_cpu(b->res.data_size);
+ size_t pos = find_next_zero_bit(resident_data(b), nbits, 0);
+
+ if (pos < nbits)
+ *bit = pos;
+ } else {
+ err = scan_nres_bitmap(ni, b, indx, 0, &scan_for_free, bit);
+
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static bool scan_for_used(const ulong *buf, u32 bit, u32 bits, size_t *ret)
+{
+ size_t pos = find_next_bit(buf, bits, bit);
+
+ if (pos >= bits)
+ return false;
+ *ret = pos;
+ return true;
+}
+
+/*
+ * indx_used_bit - Look for used bit.
+ *
+ * Return: MINUS_ONE_T if no used bits.
+ */
+int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit)
+{
+ struct ATTRIB *b;
+ struct ATTR_LIST_ENTRY *le = NULL;
+ size_t from = *bit;
+ const struct INDEX_NAMES *in = &s_index_names[indx->type];
+ int err;
+
+ b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len,
+ NULL, NULL);
+
+ if (!b)
+ return -ENOENT;
+
+ *bit = MINUS_ONE_T;
+
+ if (!b->non_res) {
+ u32 nbits = le32_to_cpu(b->res.data_size) * 8;
+ size_t pos = find_next_bit(resident_data(b), nbits, from);
+
+ if (pos < nbits)
+ *bit = pos;
+ } else {
+ err = scan_nres_bitmap(ni, b, indx, from, &scan_for_used, bit);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/*
+ * hdr_find_split
+ *
+ * Find a point at which the index allocation buffer would like to be split.
+ * NOTE: This function should never return 'END' entry NULL returns on error.
+ */
+static const struct NTFS_DE *hdr_find_split(const struct INDEX_HDR *hdr)
+{
+ size_t o;
+ const struct NTFS_DE *e = hdr_first_de(hdr);
+ u32 used_2 = le32_to_cpu(hdr->used) >> 1;
+ u16 esize;
+
+ if (!e || de_is_last(e))
+ return NULL;
+
+ esize = le16_to_cpu(e->size);
+ for (o = le32_to_cpu(hdr->de_off) + esize; o < used_2; o += esize) {
+ const struct NTFS_DE *p = e;
+
+ e = Add2Ptr(hdr, o);
+
+ /* We must not return END entry. */
+ if (de_is_last(e))
+ return p;
+
+ esize = le16_to_cpu(e->size);
+ }
+
+ return e;
+}
+
+/*
+ * hdr_insert_head - Insert some entries at the beginning of the buffer.
+ *
+ * It is used to insert entries into a newly-created buffer.
+ */
+static const struct NTFS_DE *hdr_insert_head(struct INDEX_HDR *hdr,
+ const void *ins, u32 ins_bytes)
+{
+ u32 to_move;
+ struct NTFS_DE *e = hdr_first_de(hdr);
+ u32 used = le32_to_cpu(hdr->used);
+
+ if (!e)
+ return NULL;
+
+ /* Now we just make room for the inserted entries and jam it in. */
+ to_move = used - le32_to_cpu(hdr->de_off);
+ memmove(Add2Ptr(e, ins_bytes), e, to_move);
+ memcpy(e, ins, ins_bytes);
+ hdr->used = cpu_to_le32(used + ins_bytes);
+
+ return e;
+}
+
+/*
+ * index_hdr_check
+ *
+ * return true if INDEX_HDR is valid
+ */
+static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes)
+{
+ u32 end = le32_to_cpu(hdr->used);
+ u32 tot = le32_to_cpu(hdr->total);
+ u32 off = le32_to_cpu(hdr->de_off);
+
+ if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot ||
+ off + sizeof(struct NTFS_DE) > end) {
+ /* incorrect index buffer. */
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * index_buf_check
+ *
+ * return true if INDEX_BUFFER seems is valid
+ */
+static bool index_buf_check(const struct INDEX_BUFFER *ib, u32 bytes,
+ const CLST *vbn)
+{
+ const struct NTFS_RECORD_HEADER *rhdr = &ib->rhdr;
+ u16 fo = le16_to_cpu(rhdr->fix_off);
+ u16 fn = le16_to_cpu(rhdr->fix_num);
+
+ if (bytes <= offsetof(struct INDEX_BUFFER, ihdr) ||
+ rhdr->sign != NTFS_INDX_SIGNATURE ||
+ fo < sizeof(struct INDEX_BUFFER)
+ /* Check index buffer vbn. */
+ || (vbn && *vbn != le64_to_cpu(ib->vbn)) || (fo % sizeof(short)) ||
+ fo + fn * sizeof(short) >= bytes ||
+ fn != ((bytes >> SECTOR_SHIFT) + 1)) {
+ /* incorrect index buffer. */
+ return false;
+ }
+
+ return index_hdr_check(&ib->ihdr,
+ bytes - offsetof(struct INDEX_BUFFER, ihdr));
+}
+
+void fnd_clear(struct ntfs_fnd *fnd)
+{
+ int i;
+
+ for (i = fnd->level - 1; i >= 0; i--) {
+ struct indx_node *n = fnd->nodes[i];
+
+ if (!n)
+ continue;
+
+ put_indx_node(n);
+ fnd->nodes[i] = NULL;
+ }
+ fnd->level = 0;
+ fnd->root_de = NULL;
+}
+
+static int fnd_push(struct ntfs_fnd *fnd, struct indx_node *n,
+ struct NTFS_DE *e)
+{
+ int i;
+
+ i = fnd->level;
+ if (i < 0 || i >= ARRAY_SIZE(fnd->nodes))
+ return -EINVAL;
+ fnd->nodes[i] = n;
+ fnd->de[i] = e;
+ fnd->level += 1;
+ return 0;
+}
+
+static struct indx_node *fnd_pop(struct ntfs_fnd *fnd)
+{
+ struct indx_node *n;
+ int i = fnd->level;
+
+ i -= 1;
+ n = fnd->nodes[i];
+ fnd->nodes[i] = NULL;
+ fnd->level = i;
+
+ return n;
+}
+
+static bool fnd_is_empty(struct ntfs_fnd *fnd)
+{
+ if (!fnd->level)
+ return !fnd->root_de;
+
+ return !fnd->de[fnd->level - 1];
+}
+
+/*
+ * hdr_find_e - Locate an entry the index buffer.
+ *
+ * If no matching entry is found, it returns the first entry which is greater
+ * than the desired entry If the search key is greater than all the entries the
+ * buffer, it returns the 'end' entry. This function does a binary search of the
+ * current index buffer, for the first entry that is <= to the search value.
+ *
+ * Return: NULL if error.
+ */
+static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx,
+ const struct INDEX_HDR *hdr, const void *key,
+ size_t key_len, const void *ctx, int *diff)
+{
+ struct NTFS_DE *e, *found = NULL;
+ NTFS_CMP_FUNC cmp = indx->cmp;
+ int min_idx = 0, mid_idx, max_idx = 0;
+ int diff2;
+ int table_size = 8;
+ u32 e_size, e_key_len;
+ u32 end = le32_to_cpu(hdr->used);
+ u32 off = le32_to_cpu(hdr->de_off);
+ u32 total = le32_to_cpu(hdr->total);
+ u16 offs[128];
+
+ if (unlikely(!cmp))
+ return NULL;
+
+fill_table:
+ if (end > total)
+ return NULL;
+
+ if (off + sizeof(struct NTFS_DE) > end)
+ return NULL;
+
+ e = Add2Ptr(hdr, off);
+ e_size = le16_to_cpu(e->size);
+
+ if (e_size < sizeof(struct NTFS_DE) || off + e_size > end)
+ return NULL;
+
+ if (!de_is_last(e)) {
+ offs[max_idx] = off;
+ off += e_size;
+
+ max_idx++;
+ if (max_idx < table_size)
+ goto fill_table;
+
+ max_idx--;
+ }
+
+binary_search:
+ e_key_len = le16_to_cpu(e->key_size);
+
+ diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx);
+ if (diff2 > 0) {
+ if (found) {
+ min_idx = mid_idx + 1;
+ } else {
+ if (de_is_last(e))
+ return NULL;
+
+ max_idx = 0;
+ table_size = min(table_size * 2,
+ (int)ARRAY_SIZE(offs));
+ goto fill_table;
+ }
+ } else if (diff2 < 0) {
+ if (found)
+ max_idx = mid_idx - 1;
+ else
+ max_idx--;
+
+ found = e;
+ } else {
+ *diff = 0;
+ return e;
+ }
+
+ if (min_idx > max_idx) {
+ *diff = -1;
+ return found;
+ }
+
+ mid_idx = (min_idx + max_idx) >> 1;
+ e = Add2Ptr(hdr, offs[mid_idx]);
+
+ goto binary_search;
+}
+
+/*
+ * hdr_insert_de - Insert an index entry into the buffer.
+ *
+ * 'before' should be a pointer previously returned from hdr_find_e.
+ */
+static struct NTFS_DE *hdr_insert_de(const struct ntfs_index *indx,
+ struct INDEX_HDR *hdr,
+ const struct NTFS_DE *de,
+ struct NTFS_DE *before, const void *ctx)
+{
+ int diff;
+ size_t off = PtrOffset(hdr, before);
+ u32 used = le32_to_cpu(hdr->used);
+ u32 total = le32_to_cpu(hdr->total);
+ u16 de_size = le16_to_cpu(de->size);
+
+ /* First, check to see if there's enough room. */
+ if (used + de_size > total)
+ return NULL;
+
+ /* We know there's enough space, so we know we'll succeed. */
+ if (before) {
+ /* Check that before is inside Index. */
+ if (off >= used || off < le32_to_cpu(hdr->de_off) ||
+ off + le16_to_cpu(before->size) > total) {
+ return NULL;
+ }
+ goto ok;
+ }
+ /* No insert point is applied. Get it manually. */
+ before = hdr_find_e(indx, hdr, de + 1, le16_to_cpu(de->key_size), ctx,
+ &diff);
+ if (!before)
+ return NULL;
+ off = PtrOffset(hdr, before);
+
+ok:
+ /* Now we just make room for the entry and jam it in. */
+ memmove(Add2Ptr(before, de_size), before, used - off);
+
+ hdr->used = cpu_to_le32(used + de_size);
+ memcpy(before, de, de_size);
+
+ return before;
+}
+
+/*
+ * hdr_delete_de - Remove an entry from the index buffer.
+ */
+static inline struct NTFS_DE *hdr_delete_de(struct INDEX_HDR *hdr,
+ struct NTFS_DE *re)
+{
+ u32 used = le32_to_cpu(hdr->used);
+ u16 esize = le16_to_cpu(re->size);
+ u32 off = PtrOffset(hdr, re);
+ int bytes = used - (off + esize);
+
+ /* check INDEX_HDR valid before using INDEX_HDR */
+ if (!check_index_header(hdr, le32_to_cpu(hdr->total)))
+ return NULL;
+
+ if (off >= used || esize < sizeof(struct NTFS_DE) ||
+ bytes < sizeof(struct NTFS_DE))
+ return NULL;
+
+ hdr->used = cpu_to_le32(used - esize);
+ memmove(re, Add2Ptr(re, esize), bytes);
+
+ return re;
+}
+
+void indx_clear(struct ntfs_index *indx)
+{
+ run_close(&indx->alloc_run);
+ run_close(&indx->bitmap_run);
+}
+
+int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi,
+ const struct ATTRIB *attr, enum index_mutex_classed type)
+{
+ u32 t32;
+ const struct INDEX_ROOT *root = resident_data(attr);
+
+ t32 = le32_to_cpu(attr->res.data_size);
+ if (t32 <= offsetof(struct INDEX_ROOT, ihdr) ||
+ !index_hdr_check(&root->ihdr,
+ t32 - offsetof(struct INDEX_ROOT, ihdr))) {
+ goto out;
+ }
+
+ /* Check root fields. */
+ if (!root->index_block_clst)
+ goto out;
+
+ indx->type = type;
+ indx->idx2vbn_bits = __ffs(root->index_block_clst);
+
+ t32 = le32_to_cpu(root->index_block_size);
+ indx->index_bits = blksize_bits(t32);
+
+ /* Check index record size. */
+ if (t32 < sbi->cluster_size) {
+ /* Index record is smaller than a cluster, use 512 blocks. */
+ if (t32 != root->index_block_clst * SECTOR_SIZE)
+ goto out;
+
+ /* Check alignment to a cluster. */
+ if ((sbi->cluster_size >> SECTOR_SHIFT) &
+ (root->index_block_clst - 1)) {
+ goto out;
+ }
+
+ indx->vbn2vbo_bits = SECTOR_SHIFT;
+ } else {
+ /* Index record must be a multiple of cluster size. */
+ if (t32 != root->index_block_clst << sbi->cluster_bits)
+ goto out;
+
+ indx->vbn2vbo_bits = sbi->cluster_bits;
+ }
+
+ init_rwsem(&indx->run_lock);
+
+ indx->cmp = get_cmp_func(root);
+ if (!indx->cmp)
+ goto out;
+
+ return 0;
+
+out:
+ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+ return -EINVAL;
+}
+
+static struct indx_node *indx_new(struct ntfs_index *indx,
+ struct ntfs_inode *ni, CLST vbn,
+ const __le64 *sub_vbn)
+{
+ int err;
+ struct NTFS_DE *e;
+ struct indx_node *r;
+ struct INDEX_HDR *hdr;
+ struct INDEX_BUFFER *index;
+ u64 vbo = (u64)vbn << indx->vbn2vbo_bits;
+ u32 bytes = 1u << indx->index_bits;
+ u16 fn;
+ u32 eo;
+
+ r = kzalloc(sizeof(struct indx_node), GFP_NOFS);
+ if (!r)
+ return ERR_PTR(-ENOMEM);
+
+ index = kzalloc(bytes, GFP_NOFS);
+ if (!index) {
+ kfree(r);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = ntfs_get_bh(ni->mi.sbi, &indx->alloc_run, vbo, bytes, &r->nb);
+
+ if (err) {
+ kfree(index);
+ kfree(r);
+ return ERR_PTR(err);
+ }
+
+ /* Create header. */
+ index->rhdr.sign = NTFS_INDX_SIGNATURE;
+ index->rhdr.fix_off = cpu_to_le16(sizeof(struct INDEX_BUFFER)); // 0x28
+ fn = (bytes >> SECTOR_SHIFT) + 1; // 9
+ index->rhdr.fix_num = cpu_to_le16(fn);
+ index->vbn = cpu_to_le64(vbn);
+ hdr = &index->ihdr;
+ eo = ALIGN(sizeof(struct INDEX_BUFFER) + fn * sizeof(short), 8);
+ hdr->de_off = cpu_to_le32(eo);
+
+ e = Add2Ptr(hdr, eo);
+
+ if (sub_vbn) {
+ e->flags = NTFS_IE_LAST | NTFS_IE_HAS_SUBNODES;
+ e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64));
+ hdr->used =
+ cpu_to_le32(eo + sizeof(struct NTFS_DE) + sizeof(u64));
+ de_set_vbn_le(e, *sub_vbn);
+ hdr->flags = 1;
+ } else {
+ e->size = cpu_to_le16(sizeof(struct NTFS_DE));
+ hdr->used = cpu_to_le32(eo + sizeof(struct NTFS_DE));
+ e->flags = NTFS_IE_LAST;
+ }
+
+ hdr->total = cpu_to_le32(bytes - offsetof(struct INDEX_BUFFER, ihdr));
+
+ r->index = index;
+ return r;
+}
+
+struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni,
+ struct ATTRIB **attr, struct mft_inode **mi)
+{
+ struct ATTR_LIST_ENTRY *le = NULL;
+ struct ATTRIB *a;
+ const struct INDEX_NAMES *in = &s_index_names[indx->type];
+
+ a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL,
+ mi);
+ if (!a)
+ return NULL;
+
+ if (attr)
+ *attr = a;
+
+ return resident_data_ex(a, sizeof(struct INDEX_ROOT));
+}
+
+static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni,
+ struct indx_node *node, int sync)
+{
+ struct INDEX_BUFFER *ib = node->index;
+
+ return ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &node->nb, sync);
+}
+
+/*
+ * indx_read
+ *
+ * If ntfs_readdir calls this function
+ * inode is shared locked and no ni_lock.
+ * Use rw_semaphore for read/write access to alloc_run.
+ */
+int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn,
+ struct indx_node **node)
+{
+ int err;
+ struct INDEX_BUFFER *ib;
+ struct runs_tree *run = &indx->alloc_run;
+ struct rw_semaphore *lock = &indx->run_lock;
+ u64 vbo = (u64)vbn << indx->vbn2vbo_bits;
+ u32 bytes = 1u << indx->index_bits;
+ struct indx_node *in = *node;
+ const struct INDEX_NAMES *name;
+
+ if (!in) {
+ in = kzalloc(sizeof(struct indx_node), GFP_NOFS);
+ if (!in)
+ return -ENOMEM;
+ } else {
+ nb_put(&in->nb);
+ }
+
+ ib = in->index;
+ if (!ib) {
+ ib = kmalloc(bytes, GFP_NOFS);
+ if (!ib) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ down_read(lock);
+ err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb);
+ up_read(lock);
+ if (!err)
+ goto ok;
+
+ if (err == -E_NTFS_FIXUP)
+ goto ok;
+
+ if (err != -ENOENT)
+ goto out;
+
+ name = &s_index_names[indx->type];
+ down_write(lock);
+ err = attr_load_runs_range(ni, ATTR_ALLOC, name->name, name->name_len,
+ run, vbo, vbo + bytes);
+ up_write(lock);
+ if (err)
+ goto out;
+
+ down_read(lock);
+ err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb);
+ up_read(lock);
+ if (err == -E_NTFS_FIXUP)
+ goto ok;
+
+ if (err)
+ goto out;
+
+ok:
+ if (!index_buf_check(ib, bytes, &vbn)) {
+ ntfs_inode_err(&ni->vfs_inode, "directory corrupted");
+ ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (err == -E_NTFS_FIXUP) {
+ ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &in->nb, 0);
+ err = 0;
+ }
+
+ /* check for index header length */
+ if (offsetof(struct INDEX_BUFFER, ihdr) + ib->ihdr.used > bytes) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ in->index = ib;
+ *node = in;
+
+out:
+ if (err == -E_NTFS_CORRUPT) {
+ ntfs_inode_err(&ni->vfs_inode, "directory corrupted");
+ ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
+ err = -EINVAL;
+ }
+
+ if (ib != in->index)
+ kfree(ib);
+
+ if (*node != in) {
+ nb_put(&in->nb);
+ kfree(in);
+ }
+
+ return err;
+}
+
+/*
+ * indx_find - Scan NTFS directory for given entry.
+ */
+int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const struct INDEX_ROOT *root, const void *key, size_t key_len,
+ const void *ctx, int *diff, struct NTFS_DE **entry,
+ struct ntfs_fnd *fnd)
+{
+ int err;
+ struct NTFS_DE *e;
+ struct indx_node *node;
+
+ if (!root)
+ root = indx_get_root(&ni->dir, ni, NULL, NULL);
+
+ if (!root) {
+ /* Should not happen. */
+ return -EINVAL;
+ }
+
+ /* Check cache. */
+ e = fnd->level ? fnd->de[fnd->level - 1] : fnd->root_de;
+ if (e && !de_is_last(e) &&
+ !(*indx->cmp)(key, key_len, e + 1, le16_to_cpu(e->key_size), ctx)) {
+ *entry = e;
+ *diff = 0;
+ return 0;
+ }
+
+ /* Soft finder reset. */
+ fnd_clear(fnd);
+
+ /* Lookup entry that is <= to the search value. */
+ e = hdr_find_e(indx, &root->ihdr, key, key_len, ctx, diff);
+ if (!e)
+ return -EINVAL;
+
+ fnd->root_de = e;
+
+ for (;;) {
+ node = NULL;
+ if (*diff >= 0 || !de_has_vcn_ex(e))
+ break;
+
+ /* Read next level. */
+ err = indx_read(indx, ni, de_get_vbn(e), &node);
+ if (err)
+ return err;
+
+ /* Lookup entry that is <= to the search value. */
+ e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx,
+ diff);
+ if (!e) {
+ put_indx_node(node);
+ return -EINVAL;
+ }
+
+ fnd_push(fnd, node, e);
+ }
+
+ *entry = e;
+ return 0;
+}
+
+int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const struct INDEX_ROOT *root, struct NTFS_DE **entry,
+ struct ntfs_fnd *fnd)
+{
+ int err;
+ struct indx_node *n = NULL;
+ struct NTFS_DE *e;
+ size_t iter = 0;
+ int level = fnd->level;
+
+ if (!*entry) {
+ /* Start find. */
+ e = hdr_first_de(&root->ihdr);
+ if (!e)
+ return 0;
+ fnd_clear(fnd);
+ fnd->root_de = e;
+ } else if (!level) {
+ if (de_is_last(fnd->root_de)) {
+ *entry = NULL;
+ return 0;
+ }
+
+ e = hdr_next_de(&root->ihdr, fnd->root_de);
+ if (!e)
+ return -EINVAL;
+ fnd->root_de = e;
+ } else {
+ n = fnd->nodes[level - 1];
+ e = fnd->de[level - 1];
+
+ if (de_is_last(e))
+ goto pop_level;
+
+ e = hdr_next_de(&n->index->ihdr, e);
+ if (!e)
+ return -EINVAL;
+
+ fnd->de[level - 1] = e;
+ }
+
+ /* Just to avoid tree cycle. */
+next_iter:
+ if (iter++ >= 1000)
+ return -EINVAL;
+
+ while (de_has_vcn_ex(e)) {
+ if (le16_to_cpu(e->size) <
+ sizeof(struct NTFS_DE) + sizeof(u64)) {
+ if (n) {
+ fnd_pop(fnd);
+ kfree(n);
+ }
+ return -EINVAL;
+ }
+
+ /* Read next level. */
+ err = indx_read(indx, ni, de_get_vbn(e), &n);
+ if (err)
+ return err;
+
+ /* Try next level. */
+ e = hdr_first_de(&n->index->ihdr);
+ if (!e) {
+ kfree(n);
+ return -EINVAL;
+ }
+
+ fnd_push(fnd, n, e);
+ }
+
+ if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) {
+ *entry = e;
+ return 0;
+ }
+
+pop_level:
+ for (;;) {
+ if (!de_is_last(e))
+ goto next_iter;
+
+ /* Pop one level. */
+ if (n) {
+ fnd_pop(fnd);
+ kfree(n);
+ }
+
+ level = fnd->level;
+
+ if (level) {
+ n = fnd->nodes[level - 1];
+ e = fnd->de[level - 1];
+ } else if (fnd->root_de) {
+ n = NULL;
+ e = fnd->root_de;
+ fnd->root_de = NULL;
+ } else {
+ *entry = NULL;
+ return 0;
+ }
+
+ if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) {
+ *entry = e;
+ if (!fnd->root_de)
+ fnd->root_de = e;
+ return 0;
+ }
+ }
+}
+
+int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const struct INDEX_ROOT *root, struct NTFS_DE **entry,
+ size_t *off, struct ntfs_fnd *fnd)
+{
+ int err;
+ struct indx_node *n = NULL;
+ struct NTFS_DE *e = NULL;
+ struct NTFS_DE *e2;
+ size_t bit;
+ CLST next_used_vbn;
+ CLST next_vbn;
+ u32 record_size = ni->mi.sbi->record_size;
+
+ /* Use non sorted algorithm. */
+ if (!*entry) {
+ /* This is the first call. */
+ e = hdr_first_de(&root->ihdr);
+ if (!e)
+ return 0;
+ fnd_clear(fnd);
+ fnd->root_de = e;
+
+ /* The first call with setup of initial element. */
+ if (*off >= record_size) {
+ next_vbn = (((*off - record_size) >> indx->index_bits))
+ << indx->idx2vbn_bits;
+ /* Jump inside cycle 'for'. */
+ goto next;
+ }
+
+ /* Start enumeration from root. */
+ *off = 0;
+ } else if (!fnd->root_de)
+ return -EINVAL;
+
+ for (;;) {
+ /* Check if current entry can be used. */
+ if (e && le16_to_cpu(e->size) > sizeof(struct NTFS_DE))
+ goto ok;
+
+ if (!fnd->level) {
+ /* Continue to enumerate root. */
+ if (!de_is_last(fnd->root_de)) {
+ e = hdr_next_de(&root->ihdr, fnd->root_de);
+ if (!e)
+ return -EINVAL;
+ fnd->root_de = e;
+ continue;
+ }
+
+ /* Start to enumerate indexes from 0. */
+ next_vbn = 0;
+ } else {
+ /* Continue to enumerate indexes. */
+ e2 = fnd->de[fnd->level - 1];
+
+ n = fnd->nodes[fnd->level - 1];
+
+ if (!de_is_last(e2)) {
+ e = hdr_next_de(&n->index->ihdr, e2);
+ if (!e)
+ return -EINVAL;
+ fnd->de[fnd->level - 1] = e;
+ continue;
+ }
+
+ /* Continue with next index. */
+ next_vbn = le64_to_cpu(n->index->vbn) +
+ root->index_block_clst;
+ }
+
+next:
+ /* Release current index. */
+ if (n) {
+ fnd_pop(fnd);
+ put_indx_node(n);
+ n = NULL;
+ }
+
+ /* Skip all free indexes. */
+ bit = next_vbn >> indx->idx2vbn_bits;
+ err = indx_used_bit(indx, ni, &bit);
+ if (err == -ENOENT || bit == MINUS_ONE_T) {
+ /* No used indexes. */
+ *entry = NULL;
+ return 0;
+ }
+
+ next_used_vbn = bit << indx->idx2vbn_bits;
+
+ /* Read buffer into memory. */
+ err = indx_read(indx, ni, next_used_vbn, &n);
+ if (err)
+ return err;
+
+ e = hdr_first_de(&n->index->ihdr);
+ fnd_push(fnd, n, e);
+ if (!e)
+ return -EINVAL;
+ }
+
+ok:
+ /* Return offset to restore enumerator if necessary. */
+ if (!n) {
+ /* 'e' points in root, */
+ *off = PtrOffset(&root->ihdr, e);
+ } else {
+ /* 'e' points in index, */
+ *off = (le64_to_cpu(n->index->vbn) << indx->vbn2vbo_bits) +
+ record_size + PtrOffset(&n->index->ihdr, e);
+ }
+
+ *entry = e;
+ return 0;
+}
+
+/*
+ * indx_create_allocate - Create "Allocation + Bitmap" attributes.
+ */
+static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
+ CLST *vbn)
+{
+ int err;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTRIB *bitmap;
+ struct ATTRIB *alloc;
+ u32 data_size = 1u << indx->index_bits;
+ u32 alloc_size = ntfs_up_cluster(sbi, data_size);
+ CLST len = alloc_size >> sbi->cluster_bits;
+ const struct INDEX_NAMES *in = &s_index_names[indx->type];
+ CLST alen;
+ struct runs_tree run;
+
+ run_init(&run);
+
+ err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, 0, &alen, 0,
+ NULL);
+ if (err)
+ goto out;
+
+ err = ni_insert_nonresident(ni, ATTR_ALLOC, in->name, in->name_len,
+ &run, 0, len, 0, &alloc, NULL, NULL);
+ if (err)
+ goto out1;
+
+ alloc->nres.valid_size = alloc->nres.data_size = cpu_to_le64(data_size);
+
+ err = ni_insert_resident(ni, bitmap_size(1), ATTR_BITMAP, in->name,
+ in->name_len, &bitmap, NULL, NULL);
+ if (err)
+ goto out2;
+
+ if (in->name == I30_NAME) {
+ ni->vfs_inode.i_size = data_size;
+ inode_set_bytes(&ni->vfs_inode, alloc_size);
+ }
+
+ memcpy(&indx->alloc_run, &run, sizeof(run));
+
+ *vbn = 0;
+
+ return 0;
+
+out2:
+ mi_remove_attr(NULL, &ni->mi, alloc);
+
+out1:
+ run_deallocate(sbi, &run, false);
+
+out:
+ return err;
+}
+
+/*
+ * indx_add_allocate - Add clusters to index.
+ */
+static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
+ CLST *vbn)
+{
+ int err;
+ size_t bit;
+ u64 data_size;
+ u64 bmp_size, bmp_size_v;
+ struct ATTRIB *bmp, *alloc;
+ struct mft_inode *mi;
+ const struct INDEX_NAMES *in = &s_index_names[indx->type];
+
+ err = indx_find_free(indx, ni, &bit, &bmp);
+ if (err)
+ goto out1;
+
+ if (bit != MINUS_ONE_T) {
+ bmp = NULL;
+ } else {
+ if (bmp->non_res) {
+ bmp_size = le64_to_cpu(bmp->nres.data_size);
+ bmp_size_v = le64_to_cpu(bmp->nres.valid_size);
+ } else {
+ bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size);
+ }
+
+ bit = bmp_size << 3;
+ }
+
+ data_size = (u64)(bit + 1) << indx->index_bits;
+
+ if (bmp) {
+ /* Increase bitmap. */
+ err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len,
+ &indx->bitmap_run, bitmap_size(bit + 1),
+ NULL, true, NULL);
+ if (err)
+ goto out1;
+ }
+
+ alloc = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, in->name, in->name_len,
+ NULL, &mi);
+ if (!alloc) {
+ err = -EINVAL;
+ if (bmp)
+ goto out2;
+ goto out1;
+ }
+
+ /* Increase allocation. */
+ err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
+ &indx->alloc_run, data_size, &data_size, true,
+ NULL);
+ if (err) {
+ if (bmp)
+ goto out2;
+ goto out1;
+ }
+
+ *vbn = bit << indx->idx2vbn_bits;
+
+ return 0;
+
+out2:
+ /* Ops. No space? */
+ attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len,
+ &indx->bitmap_run, bmp_size, &bmp_size_v, false, NULL);
+
+out1:
+ return err;
+}
+
+/*
+ * indx_insert_into_root - Attempt to insert an entry into the index root.
+ *
+ * @undo - True if we undoing previous remove.
+ * If necessary, it will twiddle the index b-tree.
+ */
+static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const struct NTFS_DE *new_de,
+ struct NTFS_DE *root_de, const void *ctx,
+ struct ntfs_fnd *fnd, bool undo)
+{
+ int err = 0;
+ struct NTFS_DE *e, *e0, *re;
+ struct mft_inode *mi;
+ struct ATTRIB *attr;
+ struct INDEX_HDR *hdr;
+ struct indx_node *n;
+ CLST new_vbn;
+ __le64 *sub_vbn, t_vbn;
+ u16 new_de_size;
+ u32 hdr_used, hdr_total, asize, to_move;
+ u32 root_size, new_root_size;
+ struct ntfs_sb_info *sbi;
+ int ds_root;
+ struct INDEX_ROOT *root, *a_root;
+
+ /* Get the record this root placed in. */
+ root = indx_get_root(indx, ni, &attr, &mi);
+ if (!root)
+ return -EINVAL;
+
+ /*
+ * Try easy case:
+ * hdr_insert_de will succeed if there's
+ * room the root for the new entry.
+ */
+ hdr = &root->ihdr;
+ sbi = ni->mi.sbi;
+ new_de_size = le16_to_cpu(new_de->size);
+ hdr_used = le32_to_cpu(hdr->used);
+ hdr_total = le32_to_cpu(hdr->total);
+ asize = le32_to_cpu(attr->size);
+ root_size = le32_to_cpu(attr->res.data_size);
+
+ ds_root = new_de_size + hdr_used - hdr_total;
+
+ /* If 'undo' is set then reduce requirements. */
+ if ((undo || asize + ds_root < sbi->max_bytes_per_attr) &&
+ mi_resize_attr(mi, attr, ds_root)) {
+ hdr->total = cpu_to_le32(hdr_total + ds_root);
+ e = hdr_insert_de(indx, hdr, new_de, root_de, ctx);
+ WARN_ON(!e);
+ fnd_clear(fnd);
+ fnd->root_de = e;
+
+ return 0;
+ }
+
+ /* Make a copy of root attribute to restore if error. */
+ a_root = kmemdup(attr, asize, GFP_NOFS);
+ if (!a_root)
+ return -ENOMEM;
+
+ /*
+ * Copy all the non-end entries from
+ * the index root to the new buffer.
+ */
+ to_move = 0;
+ e0 = hdr_first_de(hdr);
+
+ /* Calculate the size to copy. */
+ for (e = e0;; e = hdr_next_de(hdr, e)) {
+ if (!e) {
+ err = -EINVAL;
+ goto out_free_root;
+ }
+
+ if (de_is_last(e))
+ break;
+ to_move += le16_to_cpu(e->size);
+ }
+
+ if (!to_move) {
+ re = NULL;
+ } else {
+ re = kmemdup(e0, to_move, GFP_NOFS);
+ if (!re) {
+ err = -ENOMEM;
+ goto out_free_root;
+ }
+ }
+
+ sub_vbn = NULL;
+ if (de_has_vcn(e)) {
+ t_vbn = de_get_vbn_le(e);
+ sub_vbn = &t_vbn;
+ }
+
+ new_root_size = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE) +
+ sizeof(u64);
+ ds_root = new_root_size - root_size;
+
+ if (ds_root > 0 && asize + ds_root > sbi->max_bytes_per_attr) {
+ /* Make root external. */
+ err = -EOPNOTSUPP;
+ goto out_free_re;
+ }
+
+ if (ds_root)
+ mi_resize_attr(mi, attr, ds_root);
+
+ /* Fill first entry (vcn will be set later). */
+ e = (struct NTFS_DE *)(root + 1);
+ memset(e, 0, sizeof(struct NTFS_DE));
+ e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64));
+ e->flags = NTFS_IE_HAS_SUBNODES | NTFS_IE_LAST;
+
+ hdr->flags = 1;
+ hdr->used = hdr->total =
+ cpu_to_le32(new_root_size - offsetof(struct INDEX_ROOT, ihdr));
+
+ fnd->root_de = hdr_first_de(hdr);
+ mi->dirty = true;
+
+ /* Create alloc and bitmap attributes (if not). */
+ err = run_is_empty(&indx->alloc_run)
+ ? indx_create_allocate(indx, ni, &new_vbn)
+ : indx_add_allocate(indx, ni, &new_vbn);
+
+ /* Layout of record may be changed, so rescan root. */
+ root = indx_get_root(indx, ni, &attr, &mi);
+ if (!root) {
+ /* Bug? */
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ err = -EINVAL;
+ goto out_free_re;
+ }
+
+ if (err) {
+ /* Restore root. */
+ if (mi_resize_attr(mi, attr, -ds_root)) {
+ memcpy(attr, a_root, asize);
+ } else {
+ /* Bug? */
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ }
+ goto out_free_re;
+ }
+
+ e = (struct NTFS_DE *)(root + 1);
+ *(__le64 *)(e + 1) = cpu_to_le64(new_vbn);
+ mi->dirty = true;
+
+ /* Now we can create/format the new buffer and copy the entries into. */
+ n = indx_new(indx, ni, new_vbn, sub_vbn);
+ if (IS_ERR(n)) {
+ err = PTR_ERR(n);
+ goto out_free_re;
+ }
+
+ hdr = &n->index->ihdr;
+ hdr_used = le32_to_cpu(hdr->used);
+ hdr_total = le32_to_cpu(hdr->total);
+
+ /* Copy root entries into new buffer. */
+ hdr_insert_head(hdr, re, to_move);
+
+ /* Update bitmap attribute. */
+ indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits);
+
+ /* Check if we can insert new entry new index buffer. */
+ if (hdr_used + new_de_size > hdr_total) {
+ /*
+ * This occurs if MFT record is the same or bigger than index
+ * buffer. Move all root new index and have no space to add
+ * new entry classic case when MFT record is 1K and index
+ * buffer 4K the problem should not occurs.
+ */
+ kfree(re);
+ indx_write(indx, ni, n, 0);
+
+ put_indx_node(n);
+ fnd_clear(fnd);
+ err = indx_insert_entry(indx, ni, new_de, ctx, fnd, undo);
+ goto out_free_root;
+ }
+
+ /*
+ * Now root is a parent for new index buffer.
+ * Insert NewEntry a new buffer.
+ */
+ e = hdr_insert_de(indx, hdr, new_de, NULL, ctx);
+ if (!e) {
+ err = -EINVAL;
+ goto out_put_n;
+ }
+ fnd_push(fnd, n, e);
+
+ /* Just write updates index into disk. */
+ indx_write(indx, ni, n, 0);
+
+ n = NULL;
+
+out_put_n:
+ put_indx_node(n);
+out_free_re:
+ kfree(re);
+out_free_root:
+ kfree(a_root);
+ return err;
+}
+
+/*
+ * indx_insert_into_buffer
+ *
+ * Attempt to insert an entry into an Index Allocation Buffer.
+ * If necessary, it will split the buffer.
+ */
+static int
+indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni,
+ struct INDEX_ROOT *root, const struct NTFS_DE *new_de,
+ const void *ctx, int level, struct ntfs_fnd *fnd)
+{
+ int err;
+ const struct NTFS_DE *sp;
+ struct NTFS_DE *e, *de_t, *up_e;
+ struct indx_node *n2;
+ struct indx_node *n1 = fnd->nodes[level];
+ struct INDEX_HDR *hdr1 = &n1->index->ihdr;
+ struct INDEX_HDR *hdr2;
+ u32 to_copy, used;
+ CLST new_vbn;
+ __le64 t_vbn, *sub_vbn;
+ u16 sp_size;
+
+ /* Try the most easy case. */
+ e = fnd->level - 1 == level ? fnd->de[level] : NULL;
+ e = hdr_insert_de(indx, hdr1, new_de, e, ctx);
+ fnd->de[level] = e;
+ if (e) {
+ /* Just write updated index into disk. */
+ indx_write(indx, ni, n1, 0);
+ return 0;
+ }
+
+ /*
+ * No space to insert into buffer. Split it.
+ * To split we:
+ * - Save split point ('cause index buffers will be changed)
+ * - Allocate NewBuffer and copy all entries <= sp into new buffer
+ * - Remove all entries (sp including) from TargetBuffer
+ * - Insert NewEntry into left or right buffer (depending on sp <=>
+ * NewEntry)
+ * - Insert sp into parent buffer (or root)
+ * - Make sp a parent for new buffer
+ */
+ sp = hdr_find_split(hdr1);
+ if (!sp)
+ return -EINVAL;
+
+ sp_size = le16_to_cpu(sp->size);
+ up_e = kmalloc(sp_size + sizeof(u64), GFP_NOFS);
+ if (!up_e)
+ return -ENOMEM;
+ memcpy(up_e, sp, sp_size);
+
+ if (!hdr1->flags) {
+ up_e->flags |= NTFS_IE_HAS_SUBNODES;
+ up_e->size = cpu_to_le16(sp_size + sizeof(u64));
+ sub_vbn = NULL;
+ } else {
+ t_vbn = de_get_vbn_le(up_e);
+ sub_vbn = &t_vbn;
+ }
+
+ /* Allocate on disk a new index allocation buffer. */
+ err = indx_add_allocate(indx, ni, &new_vbn);
+ if (err)
+ goto out;
+
+ /* Allocate and format memory a new index buffer. */
+ n2 = indx_new(indx, ni, new_vbn, sub_vbn);
+ if (IS_ERR(n2)) {
+ err = PTR_ERR(n2);
+ goto out;
+ }
+
+ hdr2 = &n2->index->ihdr;
+
+ /* Make sp a parent for new buffer. */
+ de_set_vbn(up_e, new_vbn);
+
+ /* Copy all the entries <= sp into the new buffer. */
+ de_t = hdr_first_de(hdr1);
+ to_copy = PtrOffset(de_t, sp);
+ hdr_insert_head(hdr2, de_t, to_copy);
+
+ /* Remove all entries (sp including) from hdr1. */
+ used = le32_to_cpu(hdr1->used) - to_copy - sp_size;
+ memmove(de_t, Add2Ptr(sp, sp_size), used - le32_to_cpu(hdr1->de_off));
+ hdr1->used = cpu_to_le32(used);
+
+ /*
+ * Insert new entry into left or right buffer
+ * (depending on sp <=> new_de).
+ */
+ hdr_insert_de(indx,
+ (*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size),
+ up_e + 1, le16_to_cpu(up_e->key_size),
+ ctx) < 0
+ ? hdr2
+ : hdr1,
+ new_de, NULL, ctx);
+
+ indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits);
+
+ indx_write(indx, ni, n1, 0);
+ indx_write(indx, ni, n2, 0);
+
+ put_indx_node(n2);
+
+ /*
+ * We've finished splitting everybody, so we are ready to
+ * insert the promoted entry into the parent.
+ */
+ if (!level) {
+ /* Insert in root. */
+ err = indx_insert_into_root(indx, ni, up_e, NULL, ctx, fnd, 0);
+ if (err)
+ goto out;
+ } else {
+ /*
+ * The target buffer's parent is another index buffer.
+ * TODO: Remove recursion.
+ */
+ err = indx_insert_into_buffer(indx, ni, root, up_e, ctx,
+ level - 1, fnd);
+ if (err)
+ goto out;
+ }
+
+out:
+ kfree(up_e);
+
+ return err;
+}
+
+/*
+ * indx_insert_entry - Insert new entry into index.
+ *
+ * @undo - True if we undoing previous remove.
+ */
+int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const struct NTFS_DE *new_de, const void *ctx,
+ struct ntfs_fnd *fnd, bool undo)
+{
+ int err;
+ int diff;
+ struct NTFS_DE *e;
+ struct ntfs_fnd *fnd_a = NULL;
+ struct INDEX_ROOT *root;
+
+ if (!fnd) {
+ fnd_a = fnd_get();
+ if (!fnd_a) {
+ err = -ENOMEM;
+ goto out1;
+ }
+ fnd = fnd_a;
+ }
+
+ root = indx_get_root(indx, ni, NULL, NULL);
+ if (!root) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (fnd_is_empty(fnd)) {
+ /*
+ * Find the spot the tree where we want to
+ * insert the new entry.
+ */
+ err = indx_find(indx, ni, root, new_de + 1,
+ le16_to_cpu(new_de->key_size), ctx, &diff, &e,
+ fnd);
+ if (err)
+ goto out;
+
+ if (!diff) {
+ err = -EEXIST;
+ goto out;
+ }
+ }
+
+ if (!fnd->level) {
+ /*
+ * The root is also a leaf, so we'll insert the
+ * new entry into it.
+ */
+ err = indx_insert_into_root(indx, ni, new_de, fnd->root_de, ctx,
+ fnd, undo);
+ if (err)
+ goto out;
+ } else {
+ /*
+ * Found a leaf buffer, so we'll insert the new entry into it.
+ */
+ err = indx_insert_into_buffer(indx, ni, root, new_de, ctx,
+ fnd->level - 1, fnd);
+ if (err)
+ goto out;
+ }
+
+out:
+ fnd_put(fnd_a);
+out1:
+ return err;
+}
+
+/*
+ * indx_find_buffer - Locate a buffer from the tree.
+ */
+static struct indx_node *indx_find_buffer(struct ntfs_index *indx,
+ struct ntfs_inode *ni,
+ const struct INDEX_ROOT *root,
+ __le64 vbn, struct indx_node *n)
+{
+ int err;
+ const struct NTFS_DE *e;
+ struct indx_node *r;
+ const struct INDEX_HDR *hdr = n ? &n->index->ihdr : &root->ihdr;
+
+ /* Step 1: Scan one level. */
+ for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) {
+ if (!e)
+ return ERR_PTR(-EINVAL);
+
+ if (de_has_vcn(e) && vbn == de_get_vbn_le(e))
+ return n;
+
+ if (de_is_last(e))
+ break;
+ }
+
+ /* Step2: Do recursion. */
+ e = Add2Ptr(hdr, le32_to_cpu(hdr->de_off));
+ for (;;) {
+ if (de_has_vcn_ex(e)) {
+ err = indx_read(indx, ni, de_get_vbn(e), &n);
+ if (err)
+ return ERR_PTR(err);
+
+ r = indx_find_buffer(indx, ni, root, vbn, n);
+ if (r)
+ return r;
+ }
+
+ if (de_is_last(e))
+ break;
+
+ e = Add2Ptr(e, le16_to_cpu(e->size));
+ }
+
+ return NULL;
+}
+
+/*
+ * indx_shrink - Deallocate unused tail indexes.
+ */
+static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni,
+ size_t bit)
+{
+ int err = 0;
+ u64 bpb, new_data;
+ size_t nbits;
+ struct ATTRIB *b;
+ struct ATTR_LIST_ENTRY *le = NULL;
+ const struct INDEX_NAMES *in = &s_index_names[indx->type];
+
+ b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len,
+ NULL, NULL);
+
+ if (!b)
+ return -ENOENT;
+
+ if (!b->non_res) {
+ unsigned long pos;
+ const unsigned long *bm = resident_data(b);
+
+ nbits = (size_t)le32_to_cpu(b->res.data_size) * 8;
+
+ if (bit >= nbits)
+ return 0;
+
+ pos = find_next_bit(bm, nbits, bit);
+ if (pos < nbits)
+ return 0;
+ } else {
+ size_t used = MINUS_ONE_T;
+
+ nbits = le64_to_cpu(b->nres.data_size) * 8;
+
+ if (bit >= nbits)
+ return 0;
+
+ err = scan_nres_bitmap(ni, b, indx, bit, &scan_for_used, &used);
+ if (err)
+ return err;
+
+ if (used != MINUS_ONE_T)
+ return 0;
+ }
+
+ new_data = (u64)bit << indx->index_bits;
+
+ err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
+ &indx->alloc_run, new_data, &new_data, false, NULL);
+ if (err)
+ return err;
+
+ bpb = bitmap_size(bit);
+ if (bpb * 8 == nbits)
+ return 0;
+
+ err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len,
+ &indx->bitmap_run, bpb, &bpb, false, NULL);
+
+ return err;
+}
+
+static int indx_free_children(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const struct NTFS_DE *e, bool trim)
+{
+ int err;
+ struct indx_node *n = NULL;
+ struct INDEX_HDR *hdr;
+ CLST vbn = de_get_vbn(e);
+ size_t i;
+
+ err = indx_read(indx, ni, vbn, &n);
+ if (err)
+ return err;
+
+ hdr = &n->index->ihdr;
+ /* First, recurse into the children, if any. */
+ if (hdr_has_subnode(hdr)) {
+ for (e = hdr_first_de(hdr); e; e = hdr_next_de(hdr, e)) {
+ indx_free_children(indx, ni, e, false);
+ if (de_is_last(e))
+ break;
+ }
+ }
+
+ put_indx_node(n);
+
+ i = vbn >> indx->idx2vbn_bits;
+ /*
+ * We've gotten rid of the children; add this buffer to the free list.
+ */
+ indx_mark_free(indx, ni, i);
+
+ if (!trim)
+ return 0;
+
+ /*
+ * If there are no used indexes after current free index
+ * then we can truncate allocation and bitmap.
+ * Use bitmap to estimate the case.
+ */
+ indx_shrink(indx, ni, i + 1);
+ return 0;
+}
+
+/*
+ * indx_get_entry_to_replace
+ *
+ * Find a replacement entry for a deleted entry.
+ * Always returns a node entry:
+ * NTFS_IE_HAS_SUBNODES is set the flags and the size includes the sub_vcn.
+ */
+static int indx_get_entry_to_replace(struct ntfs_index *indx,
+ struct ntfs_inode *ni,
+ const struct NTFS_DE *de_next,
+ struct NTFS_DE **de_to_replace,
+ struct ntfs_fnd *fnd)
+{
+ int err;
+ int level = -1;
+ CLST vbn;
+ struct NTFS_DE *e, *te, *re;
+ struct indx_node *n;
+ struct INDEX_BUFFER *ib;
+
+ *de_to_replace = NULL;
+
+ /* Find first leaf entry down from de_next. */
+ vbn = de_get_vbn(de_next);
+ for (;;) {
+ n = NULL;
+ err = indx_read(indx, ni, vbn, &n);
+ if (err)
+ goto out;
+
+ e = hdr_first_de(&n->index->ihdr);
+ fnd_push(fnd, n, e);
+
+ if (!de_is_last(e)) {
+ /*
+ * This buffer is non-empty, so its first entry
+ * could be used as the replacement entry.
+ */
+ level = fnd->level - 1;
+ }
+
+ if (!de_has_vcn(e))
+ break;
+
+ /* This buffer is a node. Continue to go down. */
+ vbn = de_get_vbn(e);
+ }
+
+ if (level == -1)
+ goto out;
+
+ n = fnd->nodes[level];
+ te = hdr_first_de(&n->index->ihdr);
+ /* Copy the candidate entry into the replacement entry buffer. */
+ re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS);
+ if (!re) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ *de_to_replace = re;
+ memcpy(re, te, le16_to_cpu(te->size));
+
+ if (!de_has_vcn(re)) {
+ /*
+ * The replacement entry we found doesn't have a sub_vcn.
+ * increase its size to hold one.
+ */
+ le16_add_cpu(&re->size, sizeof(u64));
+ re->flags |= NTFS_IE_HAS_SUBNODES;
+ } else {
+ /*
+ * The replacement entry we found was a node entry, which
+ * means that all its child buffers are empty. Return them
+ * to the free pool.
+ */
+ indx_free_children(indx, ni, te, true);
+ }
+
+ /*
+ * Expunge the replacement entry from its former location,
+ * and then write that buffer.
+ */
+ ib = n->index;
+ e = hdr_delete_de(&ib->ihdr, te);
+
+ fnd->de[level] = e;
+ indx_write(indx, ni, n, 0);
+
+ /* Check to see if this action created an empty leaf. */
+ if (ib_is_leaf(ib) && ib_is_empty(ib))
+ return 0;
+
+out:
+ fnd_clear(fnd);
+ return err;
+}
+
+/*
+ * indx_delete_entry - Delete an entry from the index.
+ */
+int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const void *key, u32 key_len, const void *ctx)
+{
+ int err, diff;
+ struct INDEX_ROOT *root;
+ struct INDEX_HDR *hdr;
+ struct ntfs_fnd *fnd, *fnd2;
+ struct INDEX_BUFFER *ib;
+ struct NTFS_DE *e, *re, *next, *prev, *me;
+ struct indx_node *n, *n2d = NULL;
+ __le64 sub_vbn;
+ int level, level2;
+ struct ATTRIB *attr;
+ struct mft_inode *mi;
+ u32 e_size, root_size, new_root_size;
+ size_t trim_bit;
+ const struct INDEX_NAMES *in;
+
+ fnd = fnd_get();
+ if (!fnd) {
+ err = -ENOMEM;
+ goto out2;
+ }
+
+ fnd2 = fnd_get();
+ if (!fnd2) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ root = indx_get_root(indx, ni, &attr, &mi);
+ if (!root) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Locate the entry to remove. */
+ err = indx_find(indx, ni, root, key, key_len, ctx, &diff, &e, fnd);
+ if (err)
+ goto out;
+
+ if (!e || diff) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ level = fnd->level;
+
+ if (level) {
+ n = fnd->nodes[level - 1];
+ e = fnd->de[level - 1];
+ ib = n->index;
+ hdr = &ib->ihdr;
+ } else {
+ hdr = &root->ihdr;
+ e = fnd->root_de;
+ n = NULL;
+ }
+
+ e_size = le16_to_cpu(e->size);
+
+ if (!de_has_vcn_ex(e)) {
+ /* The entry to delete is a leaf, so we can just rip it out. */
+ hdr_delete_de(hdr, e);
+
+ if (!level) {
+ hdr->total = hdr->used;
+
+ /* Shrink resident root attribute. */
+ mi_resize_attr(mi, attr, 0 - e_size);
+ goto out;
+ }
+
+ indx_write(indx, ni, n, 0);
+
+ /*
+ * Check to see if removing that entry made
+ * the leaf empty.
+ */
+ if (ib_is_leaf(ib) && ib_is_empty(ib)) {
+ fnd_pop(fnd);
+ fnd_push(fnd2, n, e);
+ }
+ } else {
+ /*
+ * The entry we wish to delete is a node buffer, so we
+ * have to find a replacement for it.
+ */
+ next = de_get_next(e);
+
+ err = indx_get_entry_to_replace(indx, ni, next, &re, fnd2);
+ if (err)
+ goto out;
+
+ if (re) {
+ de_set_vbn_le(re, de_get_vbn_le(e));
+ hdr_delete_de(hdr, e);
+
+ err = level ? indx_insert_into_buffer(indx, ni, root,
+ re, ctx,
+ fnd->level - 1,
+ fnd)
+ : indx_insert_into_root(indx, ni, re, e,
+ ctx, fnd, 0);
+ kfree(re);
+
+ if (err)
+ goto out;
+ } else {
+ /*
+ * There is no replacement for the current entry.
+ * This means that the subtree rooted at its node
+ * is empty, and can be deleted, which turn means
+ * that the node can just inherit the deleted
+ * entry sub_vcn.
+ */
+ indx_free_children(indx, ni, next, true);
+
+ de_set_vbn_le(next, de_get_vbn_le(e));
+ hdr_delete_de(hdr, e);
+ if (level) {
+ indx_write(indx, ni, n, 0);
+ } else {
+ hdr->total = hdr->used;
+
+ /* Shrink resident root attribute. */
+ mi_resize_attr(mi, attr, 0 - e_size);
+ }
+ }
+ }
+
+ /* Delete a branch of tree. */
+ if (!fnd2 || !fnd2->level)
+ goto out;
+
+ /* Reinit root 'cause it can be changed. */
+ root = indx_get_root(indx, ni, &attr, &mi);
+ if (!root) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ n2d = NULL;
+ sub_vbn = fnd2->nodes[0]->index->vbn;
+ level2 = 0;
+ level = fnd->level;
+
+ hdr = level ? &fnd->nodes[level - 1]->index->ihdr : &root->ihdr;
+
+ /* Scan current level. */
+ for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) {
+ if (!e) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e))
+ break;
+
+ if (de_is_last(e)) {
+ e = NULL;
+ break;
+ }
+ }
+
+ if (!e) {
+ /* Do slow search from root. */
+ struct indx_node *in;
+
+ fnd_clear(fnd);
+
+ in = indx_find_buffer(indx, ni, root, sub_vbn, NULL);
+ if (IS_ERR(in)) {
+ err = PTR_ERR(in);
+ goto out;
+ }
+
+ if (in)
+ fnd_push(fnd, in, NULL);
+ }
+
+ /* Merge fnd2 -> fnd. */
+ for (level = 0; level < fnd2->level; level++) {
+ fnd_push(fnd, fnd2->nodes[level], fnd2->de[level]);
+ fnd2->nodes[level] = NULL;
+ }
+ fnd2->level = 0;
+
+ hdr = NULL;
+ for (level = fnd->level; level; level--) {
+ struct indx_node *in = fnd->nodes[level - 1];
+
+ ib = in->index;
+ if (ib_is_empty(ib)) {
+ sub_vbn = ib->vbn;
+ } else {
+ hdr = &ib->ihdr;
+ n2d = in;
+ level2 = level;
+ break;
+ }
+ }
+
+ if (!hdr)
+ hdr = &root->ihdr;
+
+ e = hdr_first_de(hdr);
+ if (!e) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (hdr != &root->ihdr || !de_is_last(e)) {
+ prev = NULL;
+ while (!de_is_last(e)) {
+ if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e))
+ break;
+ prev = e;
+ e = hdr_next_de(hdr, e);
+ if (!e) {
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (sub_vbn != de_get_vbn_le(e)) {
+ /*
+ * Didn't find the parent entry, although this buffer
+ * is the parent trail. Something is corrupt.
+ */
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (de_is_last(e)) {
+ /*
+ * Since we can't remove the end entry, we'll remove
+ * its predecessor instead. This means we have to
+ * transfer the predecessor's sub_vcn to the end entry.
+ * Note: This index block is not empty, so the
+ * predecessor must exist.
+ */
+ if (!prev) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (de_has_vcn(prev)) {
+ de_set_vbn_le(e, de_get_vbn_le(prev));
+ } else if (de_has_vcn(e)) {
+ le16_sub_cpu(&e->size, sizeof(u64));
+ e->flags &= ~NTFS_IE_HAS_SUBNODES;
+ le32_sub_cpu(&hdr->used, sizeof(u64));
+ }
+ e = prev;
+ }
+
+ /*
+ * Copy the current entry into a temporary buffer (stripping
+ * off its down-pointer, if any) and delete it from the current
+ * buffer or root, as appropriate.
+ */
+ e_size = le16_to_cpu(e->size);
+ me = kmemdup(e, e_size, GFP_NOFS);
+ if (!me) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ if (de_has_vcn(me)) {
+ me->flags &= ~NTFS_IE_HAS_SUBNODES;
+ le16_sub_cpu(&me->size, sizeof(u64));
+ }
+
+ hdr_delete_de(hdr, e);
+
+ if (hdr == &root->ihdr) {
+ level = 0;
+ hdr->total = hdr->used;
+
+ /* Shrink resident root attribute. */
+ mi_resize_attr(mi, attr, 0 - e_size);
+ } else {
+ indx_write(indx, ni, n2d, 0);
+ level = level2;
+ }
+
+ /* Mark unused buffers as free. */
+ trim_bit = -1;
+ for (; level < fnd->level; level++) {
+ ib = fnd->nodes[level]->index;
+ if (ib_is_empty(ib)) {
+ size_t k = le64_to_cpu(ib->vbn) >>
+ indx->idx2vbn_bits;
+
+ indx_mark_free(indx, ni, k);
+ if (k < trim_bit)
+ trim_bit = k;
+ }
+ }
+
+ fnd_clear(fnd);
+ /*fnd->root_de = NULL;*/
+
+ /*
+ * Re-insert the entry into the tree.
+ * Find the spot the tree where we want to insert the new entry.
+ */
+ err = indx_insert_entry(indx, ni, me, ctx, fnd, 0);
+ kfree(me);
+ if (err)
+ goto out;
+
+ if (trim_bit != -1)
+ indx_shrink(indx, ni, trim_bit);
+ } else {
+ /*
+ * This tree needs to be collapsed down to an empty root.
+ * Recreate the index root as an empty leaf and free all
+ * the bits the index allocation bitmap.
+ */
+ fnd_clear(fnd);
+ fnd_clear(fnd2);
+
+ in = &s_index_names[indx->type];
+
+ err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
+ &indx->alloc_run, 0, NULL, false, NULL);
+ err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len,
+ false, NULL);
+ run_close(&indx->alloc_run);
+
+ err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len,
+ &indx->bitmap_run, 0, NULL, false, NULL);
+ err = ni_remove_attr(ni, ATTR_BITMAP, in->name, in->name_len,
+ false, NULL);
+ run_close(&indx->bitmap_run);
+
+ root = indx_get_root(indx, ni, &attr, &mi);
+ if (!root) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ root_size = le32_to_cpu(attr->res.data_size);
+ new_root_size =
+ sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE);
+
+ if (new_root_size != root_size &&
+ !mi_resize_attr(mi, attr, new_root_size - root_size)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Fill first entry. */
+ e = (struct NTFS_DE *)(root + 1);
+ e->ref.low = 0;
+ e->ref.high = 0;
+ e->ref.seq = 0;
+ e->size = cpu_to_le16(sizeof(struct NTFS_DE));
+ e->flags = NTFS_IE_LAST; // 0x02
+ e->key_size = 0;
+ e->res = 0;
+
+ hdr = &root->ihdr;
+ hdr->flags = 0;
+ hdr->used = hdr->total = cpu_to_le32(
+ new_root_size - offsetof(struct INDEX_ROOT, ihdr));
+ mi->dirty = true;
+ }
+
+out:
+ fnd_put(fnd2);
+out1:
+ fnd_put(fnd);
+out2:
+ return err;
+}
+
+/*
+ * Update duplicated information in directory entry
+ * 'dup' - info from MFT record
+ */
+int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi,
+ const struct ATTR_FILE_NAME *fname,
+ const struct NTFS_DUP_INFO *dup, int sync)
+{
+ int err, diff;
+ struct NTFS_DE *e = NULL;
+ struct ATTR_FILE_NAME *e_fname;
+ struct ntfs_fnd *fnd;
+ struct INDEX_ROOT *root;
+ struct mft_inode *mi;
+ struct ntfs_index *indx = &ni->dir;
+
+ fnd = fnd_get();
+ if (!fnd)
+ return -ENOMEM;
+
+ root = indx_get_root(indx, ni, NULL, &mi);
+ if (!root) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Find entry in directory. */
+ err = indx_find(indx, ni, root, fname, fname_full_size(fname), sbi,
+ &diff, &e, fnd);
+ if (err)
+ goto out;
+
+ if (!e) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (diff) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ e_fname = (struct ATTR_FILE_NAME *)(e + 1);
+
+ if (!memcmp(&e_fname->dup, dup, sizeof(*dup))) {
+ /*
+ * Nothing to update in index! Try to avoid this call.
+ */
+ goto out;
+ }
+
+ memcpy(&e_fname->dup, dup, sizeof(*dup));
+
+ if (fnd->level) {
+ /* Directory entry in index. */
+ err = indx_write(indx, ni, fnd->nodes[fnd->level - 1], sync);
+ } else {
+ /* Directory entry in directory MFT record. */
+ mi->dirty = true;
+ if (sync)
+ err = mi_write(mi, 1);
+ else
+ mark_inode_dirty(&ni->vfs_inode);
+ }
+
+out:
+ fnd_put(fnd);
+ return err;
+}
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
new file mode 100644
index 000000000..dc937089a
--- /dev/null
+++ b/fs/ntfs3/inode.c
@@ -0,0 +1,1970 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/mpage.h>
+#include <linux/namei.h>
+#include <linux/nls.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+/*
+ * ntfs_read_mft - Read record and parses MFT.
+ */
+static struct inode *ntfs_read_mft(struct inode *inode,
+ const struct cpu_str *name,
+ const struct MFT_REF *ref)
+{
+ int err = 0;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct super_block *sb = inode->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ mode_t mode = 0;
+ struct ATTR_STD_INFO5 *std5 = NULL;
+ struct ATTR_LIST_ENTRY *le;
+ struct ATTRIB *attr;
+ bool is_match = false;
+ bool is_root = false;
+ bool is_dir;
+ unsigned long ino = inode->i_ino;
+ u32 rp_fa = 0, asize, t32;
+ u16 roff, rsize, names = 0;
+ const struct ATTR_FILE_NAME *fname = NULL;
+ const struct INDEX_ROOT *root;
+ struct REPARSE_DATA_BUFFER rp; // 0x18 bytes
+ u64 t64;
+ struct MFT_REC *rec;
+ struct runs_tree *run;
+
+ inode->i_op = NULL;
+ /* Setup 'uid' and 'gid' */
+ inode->i_uid = sbi->options->fs_uid;
+ inode->i_gid = sbi->options->fs_gid;
+
+ err = mi_init(&ni->mi, sbi, ino);
+ if (err)
+ goto out;
+
+ if (!sbi->mft.ni && ino == MFT_REC_MFT && !sb->s_root) {
+ t64 = sbi->mft.lbo >> sbi->cluster_bits;
+ t32 = bytes_to_cluster(sbi, MFT_REC_VOL * sbi->record_size);
+ sbi->mft.ni = ni;
+ init_rwsem(&ni->file.run_lock);
+
+ if (!run_add_entry(&ni->file.run, 0, t64, t32, true)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+
+ err = mi_read(&ni->mi, ino == MFT_REC_MFT);
+
+ if (err)
+ goto out;
+
+ rec = ni->mi.mrec;
+
+ if (sbi->flags & NTFS_FLAGS_LOG_REPLAYING) {
+ ;
+ } else if (ref->seq != rec->seq) {
+ err = -EINVAL;
+ ntfs_err(sb, "MFT: r=%lx, expect seq=%x instead of %x!", ino,
+ le16_to_cpu(ref->seq), le16_to_cpu(rec->seq));
+ goto out;
+ } else if (!is_rec_inuse(rec)) {
+ err = -ESTALE;
+ ntfs_err(sb, "Inode r=%x is not in use!", (u32)ino);
+ goto out;
+ }
+
+ if (le32_to_cpu(rec->total) != sbi->record_size) {
+ /* Bad inode? */
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!is_rec_base(rec)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* Record should contain $I30 root. */
+ is_dir = rec->flags & RECORD_FLAG_DIR;
+
+ /* MFT_REC_MFT is not a dir */
+ if (is_dir && ino == MFT_REC_MFT) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ inode->i_generation = le16_to_cpu(rec->seq);
+
+ /* Enumerate all struct Attributes MFT. */
+ le = NULL;
+ attr = NULL;
+
+ /*
+ * To reduce tab pressure use goto instead of
+ * while( (attr = ni_enum_attr_ex(ni, attr, &le, NULL) ))
+ */
+next_attr:
+ run = NULL;
+ err = -EINVAL;
+ attr = ni_enum_attr_ex(ni, attr, &le, NULL);
+ if (!attr)
+ goto end_enum;
+
+ if (le && le->vcn) {
+ /* This is non primary attribute segment. Ignore if not MFT. */
+ if (ino != MFT_REC_MFT || attr->type != ATTR_DATA)
+ goto next_attr;
+
+ run = &ni->file.run;
+ asize = le32_to_cpu(attr->size);
+ goto attr_unpack_run;
+ }
+
+ roff = attr->non_res ? 0 : le16_to_cpu(attr->res.data_off);
+ rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size);
+ asize = le32_to_cpu(attr->size);
+
+ if (le16_to_cpu(attr->name_off) + attr->name_len > asize)
+ goto out;
+
+ if (attr->non_res) {
+ t64 = le64_to_cpu(attr->nres.alloc_size);
+ if (le64_to_cpu(attr->nres.data_size) > t64 ||
+ le64_to_cpu(attr->nres.valid_size) > t64)
+ goto out;
+ }
+
+ switch (attr->type) {
+ case ATTR_STD:
+ if (attr->non_res ||
+ asize < sizeof(struct ATTR_STD_INFO) + roff ||
+ rsize < sizeof(struct ATTR_STD_INFO))
+ goto out;
+
+ if (std5)
+ goto next_attr;
+
+ std5 = Add2Ptr(attr, roff);
+
+#ifdef STATX_BTIME
+ nt2kernel(std5->cr_time, &ni->i_crtime);
+#endif
+ nt2kernel(std5->a_time, &inode->i_atime);
+ nt2kernel(std5->c_time, &inode->i_ctime);
+ nt2kernel(std5->m_time, &inode->i_mtime);
+
+ ni->std_fa = std5->fa;
+
+ if (asize >= sizeof(struct ATTR_STD_INFO5) + roff &&
+ rsize >= sizeof(struct ATTR_STD_INFO5))
+ ni->std_security_id = std5->security_id;
+ goto next_attr;
+
+ case ATTR_LIST:
+ if (attr->name_len || le || ino == MFT_REC_LOG)
+ goto out;
+
+ err = ntfs_load_attr_list(ni, attr);
+ if (err)
+ goto out;
+
+ le = NULL;
+ attr = NULL;
+ goto next_attr;
+
+ case ATTR_NAME:
+ if (attr->non_res || asize < SIZEOF_ATTRIBUTE_FILENAME + roff ||
+ rsize < SIZEOF_ATTRIBUTE_FILENAME)
+ goto out;
+
+ fname = Add2Ptr(attr, roff);
+ if (fname->type == FILE_NAME_DOS)
+ goto next_attr;
+
+ names += 1;
+ if (name && name->len == fname->name_len &&
+ !ntfs_cmp_names_cpu(name, (struct le_str *)&fname->name_len,
+ NULL, false))
+ is_match = true;
+
+ goto next_attr;
+
+ case ATTR_DATA:
+ if (is_dir) {
+ /* Ignore data attribute in dir record. */
+ goto next_attr;
+ }
+
+ if (ino == MFT_REC_BADCLUST && !attr->non_res)
+ goto next_attr;
+
+ if (attr->name_len &&
+ ((ino != MFT_REC_BADCLUST || !attr->non_res ||
+ attr->name_len != ARRAY_SIZE(BAD_NAME) ||
+ memcmp(attr_name(attr), BAD_NAME, sizeof(BAD_NAME))) &&
+ (ino != MFT_REC_SECURE || !attr->non_res ||
+ attr->name_len != ARRAY_SIZE(SDS_NAME) ||
+ memcmp(attr_name(attr), SDS_NAME, sizeof(SDS_NAME))))) {
+ /* File contains stream attribute. Ignore it. */
+ goto next_attr;
+ }
+
+ if (is_attr_sparsed(attr))
+ ni->std_fa |= FILE_ATTRIBUTE_SPARSE_FILE;
+ else
+ ni->std_fa &= ~FILE_ATTRIBUTE_SPARSE_FILE;
+
+ if (is_attr_compressed(attr))
+ ni->std_fa |= FILE_ATTRIBUTE_COMPRESSED;
+ else
+ ni->std_fa &= ~FILE_ATTRIBUTE_COMPRESSED;
+
+ if (is_attr_encrypted(attr))
+ ni->std_fa |= FILE_ATTRIBUTE_ENCRYPTED;
+ else
+ ni->std_fa &= ~FILE_ATTRIBUTE_ENCRYPTED;
+
+ if (!attr->non_res) {
+ ni->i_valid = inode->i_size = rsize;
+ inode_set_bytes(inode, rsize);
+ }
+
+ mode = S_IFREG | (0777 & sbi->options->fs_fmask_inv);
+
+ if (!attr->non_res) {
+ ni->ni_flags |= NI_FLAG_RESIDENT;
+ goto next_attr;
+ }
+
+ inode_set_bytes(inode, attr_ondisk_size(attr));
+
+ ni->i_valid = le64_to_cpu(attr->nres.valid_size);
+ inode->i_size = le64_to_cpu(attr->nres.data_size);
+ if (!attr->nres.alloc_size)
+ goto next_attr;
+
+ run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run
+ : &ni->file.run;
+ break;
+
+ case ATTR_ROOT:
+ if (attr->non_res)
+ goto out;
+
+ root = Add2Ptr(attr, roff);
+
+ if (attr->name_len != ARRAY_SIZE(I30_NAME) ||
+ memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME)))
+ goto next_attr;
+
+ if (root->type != ATTR_NAME ||
+ root->rule != NTFS_COLLATION_TYPE_FILENAME)
+ goto out;
+
+ if (!is_dir)
+ goto next_attr;
+
+ is_root = true;
+ ni->ni_flags |= NI_FLAG_DIR;
+
+ err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30);
+ if (err)
+ goto out;
+
+ mode = sb->s_root
+ ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv))
+ : (S_IFDIR | 0777);
+ goto next_attr;
+
+ case ATTR_ALLOC:
+ if (!is_root || attr->name_len != ARRAY_SIZE(I30_NAME) ||
+ memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME)))
+ goto next_attr;
+
+ inode->i_size = le64_to_cpu(attr->nres.data_size);
+ ni->i_valid = le64_to_cpu(attr->nres.valid_size);
+ inode_set_bytes(inode, le64_to_cpu(attr->nres.alloc_size));
+
+ run = &ni->dir.alloc_run;
+ break;
+
+ case ATTR_BITMAP:
+ if (ino == MFT_REC_MFT) {
+ if (!attr->non_res)
+ goto out;
+#ifndef CONFIG_NTFS3_64BIT_CLUSTER
+ /* 0x20000000 = 2^32 / 8 */
+ if (le64_to_cpu(attr->nres.alloc_size) >= 0x20000000)
+ goto out;
+#endif
+ run = &sbi->mft.bitmap.run;
+ break;
+ } else if (is_dir && attr->name_len == ARRAY_SIZE(I30_NAME) &&
+ !memcmp(attr_name(attr), I30_NAME,
+ sizeof(I30_NAME)) &&
+ attr->non_res) {
+ run = &ni->dir.bitmap_run;
+ break;
+ }
+ goto next_attr;
+
+ case ATTR_REPARSE:
+ if (attr->name_len)
+ goto next_attr;
+
+ rp_fa = ni_parse_reparse(ni, attr, &rp);
+ switch (rp_fa) {
+ case REPARSE_LINK:
+ /*
+ * Normal symlink.
+ * Assume one unicode symbol == one utf8.
+ */
+ inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer
+ .PrintNameLength) /
+ sizeof(u16);
+
+ ni->i_valid = inode->i_size;
+
+ /* Clear directory bit. */
+ if (ni->ni_flags & NI_FLAG_DIR) {
+ indx_clear(&ni->dir);
+ memset(&ni->dir, 0, sizeof(ni->dir));
+ ni->ni_flags &= ~NI_FLAG_DIR;
+ } else {
+ run_close(&ni->file.run);
+ }
+ mode = S_IFLNK | 0777;
+ is_dir = false;
+ if (attr->non_res) {
+ run = &ni->file.run;
+ goto attr_unpack_run; // Double break.
+ }
+ break;
+
+ case REPARSE_COMPRESSED:
+ break;
+
+ case REPARSE_DEDUPLICATED:
+ break;
+ }
+ goto next_attr;
+
+ case ATTR_EA_INFO:
+ if (!attr->name_len &&
+ resident_data_ex(attr, sizeof(struct EA_INFO))) {
+ ni->ni_flags |= NI_FLAG_EA;
+ /*
+ * ntfs_get_wsl_perm updates inode->i_uid, inode->i_gid, inode->i_mode
+ */
+ inode->i_mode = mode;
+ ntfs_get_wsl_perm(inode);
+ mode = inode->i_mode;
+ }
+ goto next_attr;
+
+ default:
+ goto next_attr;
+ }
+
+attr_unpack_run:
+ roff = le16_to_cpu(attr->nres.run_off);
+
+ if (roff > asize) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ t64 = le64_to_cpu(attr->nres.svcn);
+
+ err = run_unpack_ex(run, sbi, ino, t64, le64_to_cpu(attr->nres.evcn),
+ t64, Add2Ptr(attr, roff), asize - roff);
+ if (err < 0)
+ goto out;
+ err = 0;
+ goto next_attr;
+
+end_enum:
+
+ if (!std5)
+ goto out;
+
+ if (!is_match && name) {
+ /* Reuse rec as buffer for ascii name. */
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (std5->fa & FILE_ATTRIBUTE_READONLY)
+ mode &= ~0222;
+
+ if (!names) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (names != le16_to_cpu(rec->hard_links)) {
+ /* Correct minor error on the fly. Do not mark inode as dirty. */
+ rec->hard_links = cpu_to_le16(names);
+ ni->mi.dirty = true;
+ }
+
+ set_nlink(inode, names);
+
+ if (S_ISDIR(mode)) {
+ ni->std_fa |= FILE_ATTRIBUTE_DIRECTORY;
+
+ /*
+ * Dot and dot-dot should be included in count but was not
+ * included in enumeration.
+ * Usually a hard links to directories are disabled.
+ */
+ inode->i_op = &ntfs_dir_inode_operations;
+ inode->i_fop = &ntfs_dir_operations;
+ ni->i_valid = 0;
+ } else if (S_ISLNK(mode)) {
+ ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
+ inode->i_op = &ntfs_link_inode_operations;
+ inode->i_fop = NULL;
+ inode_nohighmem(inode);
+ } else if (S_ISREG(mode)) {
+ ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
+ inode->i_op = &ntfs_file_inode_operations;
+ inode->i_fop = &ntfs_file_operations;
+ inode->i_mapping->a_ops =
+ is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops;
+ if (ino != MFT_REC_MFT)
+ init_rwsem(&ni->file.run_lock);
+ } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) ||
+ S_ISSOCK(mode)) {
+ inode->i_op = &ntfs_special_inode_operations;
+ init_special_inode(inode, mode, inode->i_rdev);
+ } else if (fname && fname->home.low == cpu_to_le32(MFT_REC_EXTEND) &&
+ fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) {
+ /* Records in $Extend are not a files or general directories. */
+ inode->i_op = &ntfs_file_inode_operations;
+ } else {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if ((sbi->options->sys_immutable &&
+ (std5->fa & FILE_ATTRIBUTE_SYSTEM)) &&
+ !S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) {
+ inode->i_flags |= S_IMMUTABLE;
+ } else {
+ inode->i_flags &= ~S_IMMUTABLE;
+ }
+
+ inode->i_mode = mode;
+ if (!(ni->ni_flags & NI_FLAG_EA)) {
+ /* If no xattr then no security (stored in xattr). */
+ inode->i_flags |= S_NOSEC;
+ }
+
+ if (ino == MFT_REC_MFT && !sb->s_root)
+ sbi->mft.ni = NULL;
+
+ unlock_new_inode(inode);
+
+ return inode;
+
+out:
+ if (ino == MFT_REC_MFT && !sb->s_root)
+ sbi->mft.ni = NULL;
+
+ iget_failed(inode);
+ return ERR_PTR(err);
+}
+
+/*
+ * ntfs_test_inode
+ *
+ * Return: 1 if match.
+ */
+static int ntfs_test_inode(struct inode *inode, void *data)
+{
+ struct MFT_REF *ref = data;
+
+ return ino_get(ref) == inode->i_ino;
+}
+
+static int ntfs_set_inode(struct inode *inode, void *data)
+{
+ const struct MFT_REF *ref = data;
+
+ inode->i_ino = ino_get(ref);
+ return 0;
+}
+
+struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
+ const struct cpu_str *name)
+{
+ struct inode *inode;
+
+ inode = iget5_locked(sb, ino_get(ref), ntfs_test_inode, ntfs_set_inode,
+ (void *)ref);
+ if (unlikely(!inode))
+ return ERR_PTR(-ENOMEM);
+
+ /* If this is a freshly allocated inode, need to read it now. */
+ if (inode->i_state & I_NEW)
+ inode = ntfs_read_mft(inode, name, ref);
+ else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) {
+ /* Inode overlaps? */
+ _ntfs_bad_inode(inode);
+ }
+
+ if (IS_ERR(inode) && name)
+ ntfs_set_state(sb->s_fs_info, NTFS_DIRTY_ERROR);
+
+ return inode;
+}
+
+enum get_block_ctx {
+ GET_BLOCK_GENERAL = 0,
+ GET_BLOCK_WRITE_BEGIN = 1,
+ GET_BLOCK_DIRECT_IO_R = 2,
+ GET_BLOCK_DIRECT_IO_W = 3,
+ GET_BLOCK_BMAP = 4,
+};
+
+static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
+ struct buffer_head *bh, int create,
+ enum get_block_ctx ctx)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct page *page = bh->b_page;
+ u8 cluster_bits = sbi->cluster_bits;
+ u32 block_size = sb->s_blocksize;
+ u64 bytes, lbo, valid;
+ u32 off;
+ int err;
+ CLST vcn, lcn, len;
+ bool new;
+
+ /* Clear previous state. */
+ clear_buffer_new(bh);
+ clear_buffer_uptodate(bh);
+
+ /* Direct write uses 'create=0'. */
+ if (!create && vbo >= ni->i_valid) {
+ /* Out of valid. */
+ return 0;
+ }
+
+ if (vbo >= inode->i_size) {
+ /* Out of size. */
+ return 0;
+ }
+
+ if (is_resident(ni)) {
+ ni_lock(ni);
+ err = attr_data_read_resident(ni, page);
+ ni_unlock(ni);
+
+ if (!err)
+ set_buffer_uptodate(bh);
+ bh->b_size = block_size;
+ return err;
+ }
+
+ vcn = vbo >> cluster_bits;
+ off = vbo & sbi->cluster_mask;
+ new = false;
+
+ err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL);
+ if (err)
+ goto out;
+
+ if (!len)
+ return 0;
+
+ bytes = ((u64)len << cluster_bits) - off;
+
+ if (lcn == SPARSE_LCN) {
+ if (!create) {
+ if (bh->b_size > bytes)
+ bh->b_size = bytes;
+ return 0;
+ }
+ WARN_ON(1);
+ }
+
+ if (new) {
+ set_buffer_new(bh);
+ if ((len << cluster_bits) > block_size)
+ ntfs_sparse_cluster(inode, page, vcn, len);
+ }
+
+ lbo = ((u64)lcn << cluster_bits) + off;
+
+ set_buffer_mapped(bh);
+ bh->b_bdev = sb->s_bdev;
+ bh->b_blocknr = lbo >> sb->s_blocksize_bits;
+
+ valid = ni->i_valid;
+
+ if (ctx == GET_BLOCK_DIRECT_IO_W) {
+ /* ntfs_direct_IO will update ni->i_valid. */
+ if (vbo >= valid)
+ set_buffer_new(bh);
+ } else if (create) {
+ /* Normal write. */
+ if (bytes > bh->b_size)
+ bytes = bh->b_size;
+
+ if (vbo >= valid)
+ set_buffer_new(bh);
+
+ if (vbo + bytes > valid) {
+ ni->i_valid = vbo + bytes;
+ mark_inode_dirty(inode);
+ }
+ } else if (vbo >= valid) {
+ /* Read out of valid data. */
+ /* Should never be here 'cause already checked. */
+ clear_buffer_mapped(bh);
+ } else if (vbo + bytes <= valid) {
+ /* Normal read. */
+ } else if (vbo + block_size <= valid) {
+ /* Normal short read. */
+ bytes = block_size;
+ } else {
+ /*
+ * Read across valid size: vbo < valid && valid < vbo + block_size
+ */
+ bytes = block_size;
+
+ if (page) {
+ u32 voff = valid - vbo;
+
+ bh->b_size = block_size;
+ off = vbo & (PAGE_SIZE - 1);
+ set_bh_page(bh, page, off);
+ err = bh_read(bh, 0);
+ if (err < 0)
+ goto out;
+ zero_user_segment(page, off + voff, off + block_size);
+ }
+ }
+
+ if (bh->b_size > bytes)
+ bh->b_size = bytes;
+
+#ifndef __LP64__
+ if (ctx == GET_BLOCK_DIRECT_IO_W || ctx == GET_BLOCK_DIRECT_IO_R) {
+ static_assert(sizeof(size_t) < sizeof(loff_t));
+ if (bytes > 0x40000000u)
+ bh->b_size = 0x40000000u;
+ }
+#endif
+
+ return 0;
+
+out:
+ return err;
+}
+
+int ntfs_get_block(struct inode *inode, sector_t vbn,
+ struct buffer_head *bh_result, int create)
+{
+ return ntfs_get_block_vbo(inode, (u64)vbn << inode->i_blkbits,
+ bh_result, create, GET_BLOCK_GENERAL);
+}
+
+static int ntfs_get_block_bmap(struct inode *inode, sector_t vsn,
+ struct buffer_head *bh_result, int create)
+{
+ return ntfs_get_block_vbo(inode,
+ (u64)vsn << inode->i_sb->s_blocksize_bits,
+ bh_result, create, GET_BLOCK_BMAP);
+}
+
+static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
+{
+ return generic_block_bmap(mapping, block, ntfs_get_block_bmap);
+}
+
+static int ntfs_read_folio(struct file *file, struct folio *folio)
+{
+ struct page *page = &folio->page;
+ int err;
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ if (is_resident(ni)) {
+ ni_lock(ni);
+ err = attr_data_read_resident(ni, page);
+ ni_unlock(ni);
+ if (err != E_NTFS_NONRESIDENT) {
+ unlock_page(page);
+ return err;
+ }
+ }
+
+ if (is_compressed(ni)) {
+ ni_lock(ni);
+ err = ni_readpage_cmpr(ni, page);
+ ni_unlock(ni);
+ return err;
+ }
+
+ /* Normal + sparse files. */
+ return mpage_read_folio(folio, ntfs_get_block);
+}
+
+static void ntfs_readahead(struct readahead_control *rac)
+{
+ struct address_space *mapping = rac->mapping;
+ struct inode *inode = mapping->host;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ u64 valid;
+ loff_t pos;
+
+ if (is_resident(ni)) {
+ /* No readahead for resident. */
+ return;
+ }
+
+ if (is_compressed(ni)) {
+ /* No readahead for compressed. */
+ return;
+ }
+
+ valid = ni->i_valid;
+ pos = readahead_pos(rac);
+
+ if (valid < i_size_read(inode) && pos <= valid &&
+ valid < pos + readahead_length(rac)) {
+ /* Range cross 'valid'. Read it page by page. */
+ return;
+ }
+
+ mpage_readahead(rac, ntfs_get_block);
+}
+
+static int ntfs_get_block_direct_IO_R(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ return ntfs_get_block_vbo(inode, (u64)iblock << inode->i_blkbits,
+ bh_result, create, GET_BLOCK_DIRECT_IO_R);
+}
+
+static int ntfs_get_block_direct_IO_W(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ return ntfs_get_block_vbo(inode, (u64)iblock << inode->i_blkbits,
+ bh_result, create, GET_BLOCK_DIRECT_IO_W);
+}
+
+static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ loff_t vbo = iocb->ki_pos;
+ loff_t end;
+ int wr = iov_iter_rw(iter) & WRITE;
+ size_t iter_count = iov_iter_count(iter);
+ loff_t valid;
+ ssize_t ret;
+
+ if (is_resident(ni)) {
+ /* Switch to buffered write. */
+ ret = 0;
+ goto out;
+ }
+
+ ret = blockdev_direct_IO(iocb, inode, iter,
+ wr ? ntfs_get_block_direct_IO_W
+ : ntfs_get_block_direct_IO_R);
+
+ if (ret > 0)
+ end = vbo + ret;
+ else if (wr && ret == -EIOCBQUEUED)
+ end = vbo + iter_count;
+ else
+ goto out;
+
+ valid = ni->i_valid;
+ if (wr) {
+ if (end > valid && !S_ISBLK(inode->i_mode)) {
+ ni->i_valid = end;
+ mark_inode_dirty(inode);
+ }
+ } else if (vbo < valid && valid < end) {
+ /* Fix page. */
+ iov_iter_revert(iter, end - valid);
+ iov_iter_zero(end - valid, iter);
+ }
+
+out:
+ return ret;
+}
+
+int ntfs_set_size(struct inode *inode, u64 new_size)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ int err;
+
+ /* Check for maximum file size. */
+ if (is_sparsed(ni) || is_compressed(ni)) {
+ if (new_size > sbi->maxbytes_sparse) {
+ err = -EFBIG;
+ goto out;
+ }
+ } else if (new_size > sbi->maxbytes) {
+ err = -EFBIG;
+ goto out;
+ }
+
+ ni_lock(ni);
+ down_write(&ni->file.run_lock);
+
+ err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size,
+ &ni->i_valid, true, NULL);
+
+ up_write(&ni->file.run_lock);
+ ni_unlock(ni);
+
+ mark_inode_dirty(inode);
+
+out:
+ return err;
+}
+
+static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ int err;
+
+ if (is_resident(ni)) {
+ ni_lock(ni);
+ err = attr_data_write_resident(ni, page);
+ ni_unlock(ni);
+ if (err != E_NTFS_NONRESIDENT) {
+ unlock_page(page);
+ return err;
+ }
+ }
+
+ return block_write_full_page(page, ntfs_get_block, wbc);
+}
+
+static int ntfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ /* Redirect call to 'ntfs_writepage' for resident files. */
+ if (is_resident(ntfs_i(mapping->host)))
+ return generic_writepages(mapping, wbc);
+ return mpage_writepages(mapping, wbc, ntfs_get_block);
+}
+
+static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn,
+ struct buffer_head *bh_result, int create)
+{
+ return ntfs_get_block_vbo(inode, (u64)vbn << inode->i_blkbits,
+ bh_result, create, GET_BLOCK_WRITE_BEGIN);
+}
+
+int ntfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, struct page **pagep, void **fsdata)
+{
+ int err;
+ struct inode *inode = mapping->host;
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ *pagep = NULL;
+ if (is_resident(ni)) {
+ struct page *page = grab_cache_page_write_begin(
+ mapping, pos >> PAGE_SHIFT);
+
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ni_lock(ni);
+ err = attr_data_read_resident(ni, page);
+ ni_unlock(ni);
+
+ if (!err) {
+ *pagep = page;
+ goto out;
+ }
+ unlock_page(page);
+ put_page(page);
+
+ if (err != E_NTFS_NONRESIDENT)
+ goto out;
+ }
+
+ err = block_write_begin(mapping, pos, len, pagep,
+ ntfs_get_block_write_begin);
+
+out:
+ return err;
+}
+
+/*
+ * ntfs_write_end - Address_space_operations::write_end.
+ */
+int ntfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, u32 copied, struct page *page,
+ void *fsdata)
+{
+ struct inode *inode = mapping->host;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ u64 valid = ni->i_valid;
+ bool dirty = false;
+ int err;
+
+ if (is_resident(ni)) {
+ ni_lock(ni);
+ err = attr_data_write_resident(ni, page);
+ ni_unlock(ni);
+ if (!err) {
+ dirty = true;
+ /* Clear any buffers in page. */
+ if (page_has_buffers(page)) {
+ struct buffer_head *head, *bh;
+
+ bh = head = page_buffers(page);
+ do {
+ clear_buffer_dirty(bh);
+ clear_buffer_mapped(bh);
+ set_buffer_uptodate(bh);
+ } while (head != (bh = bh->b_this_page));
+ }
+ SetPageUptodate(page);
+ err = copied;
+ }
+ unlock_page(page);
+ put_page(page);
+ } else {
+ err = generic_write_end(file, mapping, pos, len, copied, page,
+ fsdata);
+ }
+
+ if (err >= 0) {
+ if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) {
+ inode->i_ctime = inode->i_mtime = current_time(inode);
+ ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE;
+ dirty = true;
+ }
+
+ if (valid != ni->i_valid) {
+ /* ni->i_valid is changed in ntfs_get_block_vbo. */
+ dirty = true;
+ }
+
+ if (dirty)
+ mark_inode_dirty(inode);
+ }
+
+ return err;
+}
+
+int reset_log_file(struct inode *inode)
+{
+ int err;
+ loff_t pos = 0;
+ u32 log_size = inode->i_size;
+ struct address_space *mapping = inode->i_mapping;
+
+ for (;;) {
+ u32 len;
+ void *kaddr;
+ struct page *page;
+
+ len = pos + PAGE_SIZE > log_size ? (log_size - pos) : PAGE_SIZE;
+
+ err = block_write_begin(mapping, pos, len, &page,
+ ntfs_get_block_write_begin);
+ if (err)
+ goto out;
+
+ kaddr = kmap_atomic(page);
+ memset(kaddr, -1, len);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(page);
+
+ err = block_write_end(NULL, mapping, pos, len, len, page, NULL);
+ if (err < 0)
+ goto out;
+ pos += len;
+
+ if (pos >= log_size)
+ break;
+ balance_dirty_pages_ratelimited(mapping);
+ }
+out:
+ mark_inode_dirty_sync(inode);
+
+ return err;
+}
+
+int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ return _ni_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+}
+
+int ntfs_sync_inode(struct inode *inode)
+{
+ return _ni_write_inode(inode, 1);
+}
+
+/*
+ * writeback_inode - Helper function for ntfs_flush_inodes().
+ *
+ * This writes both the inode and the file data blocks, waiting
+ * for in flight data blocks before the start of the call. It
+ * does not wait for any io started during the call.
+ */
+static int writeback_inode(struct inode *inode)
+{
+ int ret = sync_inode_metadata(inode, 0);
+
+ if (!ret)
+ ret = filemap_fdatawrite(inode->i_mapping);
+ return ret;
+}
+
+/*
+ * ntfs_flush_inodes
+ *
+ * Write data and metadata corresponding to i1 and i2. The io is
+ * started but we do not wait for any of it to finish.
+ *
+ * filemap_flush() is used for the block device, so if there is a dirty
+ * page for a block already in flight, we will not wait and start the
+ * io over again.
+ */
+int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
+ struct inode *i2)
+{
+ int ret = 0;
+
+ if (i1)
+ ret = writeback_inode(i1);
+ if (!ret && i2)
+ ret = writeback_inode(i2);
+ if (!ret)
+ ret = sync_blockdev_nowait(sb->s_bdev);
+ return ret;
+}
+
+int inode_write_data(struct inode *inode, const void *data, size_t bytes)
+{
+ pgoff_t idx;
+
+ /* Write non resident data. */
+ for (idx = 0; bytes; idx++) {
+ size_t op = bytes > PAGE_SIZE ? PAGE_SIZE : bytes;
+ struct page *page = ntfs_map_page(inode->i_mapping, idx);
+
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ lock_page(page);
+ WARN_ON(!PageUptodate(page));
+ ClearPageUptodate(page);
+
+ memcpy(page_address(page), data, op);
+
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+
+ ntfs_unmap_page(page);
+
+ bytes -= op;
+ data = Add2Ptr(data, PAGE_SIZE);
+ }
+ return 0;
+}
+
+/*
+ * ntfs_reparse_bytes
+ *
+ * Number of bytes for REPARSE_DATA_BUFFER(IO_REPARSE_TAG_SYMLINK)
+ * for unicode string of @uni_len length.
+ */
+static inline u32 ntfs_reparse_bytes(u32 uni_len)
+{
+ /* Header + unicode string + decorated unicode string. */
+ return sizeof(short) * (2 * uni_len + 4) +
+ offsetof(struct REPARSE_DATA_BUFFER,
+ SymbolicLinkReparseBuffer.PathBuffer);
+}
+
+static struct REPARSE_DATA_BUFFER *
+ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname,
+ u32 size, u16 *nsize)
+{
+ int i, err;
+ struct REPARSE_DATA_BUFFER *rp;
+ __le16 *rp_name;
+ typeof(rp->SymbolicLinkReparseBuffer) *rs;
+
+ rp = kzalloc(ntfs_reparse_bytes(2 * size + 2), GFP_NOFS);
+ if (!rp)
+ return ERR_PTR(-ENOMEM);
+
+ rs = &rp->SymbolicLinkReparseBuffer;
+ rp_name = rs->PathBuffer;
+
+ /* Convert link name to UTF-16. */
+ err = ntfs_nls_to_utf16(sbi, symname, size,
+ (struct cpu_str *)(rp_name - 1), 2 * size,
+ UTF16_LITTLE_ENDIAN);
+ if (err < 0)
+ goto out;
+
+ /* err = the length of unicode name of symlink. */
+ *nsize = ntfs_reparse_bytes(err);
+
+ if (*nsize > sbi->reparse.max_size) {
+ err = -EFBIG;
+ goto out;
+ }
+
+ /* Translate Linux '/' into Windows '\'. */
+ for (i = 0; i < err; i++) {
+ if (rp_name[i] == cpu_to_le16('/'))
+ rp_name[i] = cpu_to_le16('\\');
+ }
+
+ rp->ReparseTag = IO_REPARSE_TAG_SYMLINK;
+ rp->ReparseDataLength =
+ cpu_to_le16(*nsize - offsetof(struct REPARSE_DATA_BUFFER,
+ SymbolicLinkReparseBuffer));
+
+ /* PrintName + SubstituteName. */
+ rs->SubstituteNameOffset = cpu_to_le16(sizeof(short) * err);
+ rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + 8);
+ rs->PrintNameLength = rs->SubstituteNameOffset;
+
+ /*
+ * TODO: Use relative path if possible to allow Windows to
+ * parse this path.
+ * 0-absolute path 1- relative path (SYMLINK_FLAG_RELATIVE).
+ */
+ rs->Flags = 0;
+
+ memmove(rp_name + err + 4, rp_name, sizeof(short) * err);
+
+ /* Decorate SubstituteName. */
+ rp_name += err;
+ rp_name[0] = cpu_to_le16('\\');
+ rp_name[1] = cpu_to_le16('?');
+ rp_name[2] = cpu_to_le16('?');
+ rp_name[3] = cpu_to_le16('\\');
+
+ return rp;
+out:
+ kfree(rp);
+ return ERR_PTR(err);
+}
+
+struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
+ const struct cpu_str *uni, umode_t mode,
+ dev_t dev, const char *symname, u32 size,
+ struct ntfs_fnd *fnd)
+{
+ int err;
+ struct super_block *sb = dir->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ const struct qstr *name = &dentry->d_name;
+ CLST ino = 0;
+ struct ntfs_inode *dir_ni = ntfs_i(dir);
+ struct ntfs_inode *ni = NULL;
+ struct inode *inode = NULL;
+ struct ATTRIB *attr;
+ struct ATTR_STD_INFO5 *std5;
+ struct ATTR_FILE_NAME *fname;
+ struct MFT_REC *rec;
+ u32 asize, dsize, sd_size;
+ enum FILE_ATTRIBUTE fa;
+ __le32 security_id = SECURITY_ID_INVALID;
+ CLST vcn;
+ const void *sd;
+ u16 t16, nsize = 0, aid = 0;
+ struct INDEX_ROOT *root, *dir_root;
+ struct NTFS_DE *e, *new_de = NULL;
+ struct REPARSE_DATA_BUFFER *rp = NULL;
+ bool rp_inserted = false;
+
+ ni_lock_dir(dir_ni);
+
+ dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL);
+ if (!dir_root) {
+ err = -EINVAL;
+ goto out1;
+ }
+
+ if (S_ISDIR(mode)) {
+ /* Use parent's directory attributes. */
+ fa = dir_ni->std_fa | FILE_ATTRIBUTE_DIRECTORY |
+ FILE_ATTRIBUTE_ARCHIVE;
+ /*
+ * By default child directory inherits parent attributes.
+ * Root directory is hidden + system.
+ * Make an exception for children in root.
+ */
+ if (dir->i_ino == MFT_REC_ROOT)
+ fa &= ~(FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_SYSTEM);
+ } else if (S_ISLNK(mode)) {
+ /* It is good idea that link should be the same type (file/dir) as target */
+ fa = FILE_ATTRIBUTE_REPARSE_POINT;
+
+ /*
+ * Linux: there are dir/file/symlink and so on.
+ * NTFS: symlinks are "dir + reparse" or "file + reparse"
+ * It is good idea to create:
+ * dir + reparse if 'symname' points to directory
+ * or
+ * file + reparse if 'symname' points to file
+ * Unfortunately kern_path hangs if symname contains 'dir'.
+ */
+
+ /*
+ * struct path path;
+ *
+ * if (!kern_path(symname, LOOKUP_FOLLOW, &path)){
+ * struct inode *target = d_inode(path.dentry);
+ *
+ * if (S_ISDIR(target->i_mode))
+ * fa |= FILE_ATTRIBUTE_DIRECTORY;
+ * // if ( target->i_sb == sb ){
+ * // use relative path?
+ * // }
+ * path_put(&path);
+ * }
+ */
+ } else if (S_ISREG(mode)) {
+ if (sbi->options->sparse) {
+ /* Sparsed regular file, cause option 'sparse'. */
+ fa = FILE_ATTRIBUTE_SPARSE_FILE |
+ FILE_ATTRIBUTE_ARCHIVE;
+ } else if (dir_ni->std_fa & FILE_ATTRIBUTE_COMPRESSED) {
+ /* Compressed regular file, if parent is compressed. */
+ fa = FILE_ATTRIBUTE_COMPRESSED | FILE_ATTRIBUTE_ARCHIVE;
+ } else {
+ /* Regular file, default attributes. */
+ fa = FILE_ATTRIBUTE_ARCHIVE;
+ }
+ } else {
+ fa = FILE_ATTRIBUTE_ARCHIVE;
+ }
+
+ if (!(mode & 0222))
+ fa |= FILE_ATTRIBUTE_READONLY;
+
+ /* Allocate PATH_MAX bytes. */
+ new_de = __getname();
+ if (!new_de) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ /* Mark rw ntfs as dirty. it will be cleared at umount. */
+ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+
+ /* Step 1: allocate and fill new mft record. */
+ err = ntfs_look_free_mft(sbi, &ino, false, NULL, NULL);
+ if (err)
+ goto out2;
+
+ ni = ntfs_new_inode(sbi, ino, fa & FILE_ATTRIBUTE_DIRECTORY);
+ if (IS_ERR(ni)) {
+ err = PTR_ERR(ni);
+ ni = NULL;
+ goto out3;
+ }
+ inode = &ni->vfs_inode;
+ inode_init_owner(mnt_userns, inode, dir, mode);
+ mode = inode->i_mode;
+
+ inode->i_atime = inode->i_mtime = inode->i_ctime = ni->i_crtime =
+ current_time(inode);
+
+ rec = ni->mi.mrec;
+ rec->hard_links = cpu_to_le16(1);
+ attr = Add2Ptr(rec, le16_to_cpu(rec->attr_off));
+
+ /* Get default security id. */
+ sd = s_default_security;
+ sd_size = sizeof(s_default_security);
+
+ if (is_ntfs3(sbi)) {
+ security_id = dir_ni->std_security_id;
+ if (le32_to_cpu(security_id) < SECURITY_ID_FIRST) {
+ security_id = sbi->security.def_security_id;
+
+ if (security_id == SECURITY_ID_INVALID &&
+ !ntfs_insert_security(sbi, sd, sd_size,
+ &security_id, NULL))
+ sbi->security.def_security_id = security_id;
+ }
+ }
+
+ /* Insert standard info. */
+ std5 = Add2Ptr(attr, SIZEOF_RESIDENT);
+
+ if (security_id == SECURITY_ID_INVALID) {
+ dsize = sizeof(struct ATTR_STD_INFO);
+ } else {
+ dsize = sizeof(struct ATTR_STD_INFO5);
+ std5->security_id = security_id;
+ ni->std_security_id = security_id;
+ }
+ asize = SIZEOF_RESIDENT + dsize;
+
+ attr->type = ATTR_STD;
+ attr->size = cpu_to_le32(asize);
+ attr->id = cpu_to_le16(aid++);
+ attr->res.data_off = SIZEOF_RESIDENT_LE;
+ attr->res.data_size = cpu_to_le32(dsize);
+
+ std5->cr_time = std5->m_time = std5->c_time = std5->a_time =
+ kernel2nt(&inode->i_atime);
+
+ ni->std_fa = fa;
+ std5->fa = fa;
+
+ attr = Add2Ptr(attr, asize);
+
+ /* Insert file name. */
+ err = fill_name_de(sbi, new_de, name, uni);
+ if (err)
+ goto out4;
+
+ mi_get_ref(&ni->mi, &new_de->ref);
+
+ fname = (struct ATTR_FILE_NAME *)(new_de + 1);
+ mi_get_ref(&dir_ni->mi, &fname->home);
+ fname->dup.cr_time = fname->dup.m_time = fname->dup.c_time =
+ fname->dup.a_time = std5->cr_time;
+ fname->dup.alloc_size = fname->dup.data_size = 0;
+ fname->dup.fa = std5->fa;
+ fname->dup.ea_size = fname->dup.reparse = 0;
+
+ dsize = le16_to_cpu(new_de->key_size);
+ asize = ALIGN(SIZEOF_RESIDENT + dsize, 8);
+
+ attr->type = ATTR_NAME;
+ attr->size = cpu_to_le32(asize);
+ attr->res.data_off = SIZEOF_RESIDENT_LE;
+ attr->res.flags = RESIDENT_FLAG_INDEXED;
+ attr->id = cpu_to_le16(aid++);
+ attr->res.data_size = cpu_to_le32(dsize);
+ memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), fname, dsize);
+
+ attr = Add2Ptr(attr, asize);
+
+ if (security_id == SECURITY_ID_INVALID) {
+ /* Insert security attribute. */
+ asize = SIZEOF_RESIDENT + ALIGN(sd_size, 8);
+
+ attr->type = ATTR_SECURE;
+ attr->size = cpu_to_le32(asize);
+ attr->id = cpu_to_le16(aid++);
+ attr->res.data_off = SIZEOF_RESIDENT_LE;
+ attr->res.data_size = cpu_to_le32(sd_size);
+ memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), sd, sd_size);
+
+ attr = Add2Ptr(attr, asize);
+ }
+
+ attr->id = cpu_to_le16(aid++);
+ if (fa & FILE_ATTRIBUTE_DIRECTORY) {
+ /*
+ * Regular directory or symlink to directory.
+ * Create root attribute.
+ */
+ dsize = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE);
+ asize = sizeof(I30_NAME) + SIZEOF_RESIDENT + dsize;
+
+ attr->type = ATTR_ROOT;
+ attr->size = cpu_to_le32(asize);
+
+ attr->name_len = ARRAY_SIZE(I30_NAME);
+ attr->name_off = SIZEOF_RESIDENT_LE;
+ attr->res.data_off =
+ cpu_to_le16(sizeof(I30_NAME) + SIZEOF_RESIDENT);
+ attr->res.data_size = cpu_to_le32(dsize);
+ memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), I30_NAME,
+ sizeof(I30_NAME));
+
+ root = Add2Ptr(attr, sizeof(I30_NAME) + SIZEOF_RESIDENT);
+ memcpy(root, dir_root, offsetof(struct INDEX_ROOT, ihdr));
+ root->ihdr.de_off =
+ cpu_to_le32(sizeof(struct INDEX_HDR)); // 0x10
+ root->ihdr.used = cpu_to_le32(sizeof(struct INDEX_HDR) +
+ sizeof(struct NTFS_DE));
+ root->ihdr.total = root->ihdr.used;
+
+ e = Add2Ptr(root, sizeof(struct INDEX_ROOT));
+ e->size = cpu_to_le16(sizeof(struct NTFS_DE));
+ e->flags = NTFS_IE_LAST;
+ } else if (S_ISLNK(mode)) {
+ /*
+ * Symlink to file.
+ * Create empty resident data attribute.
+ */
+ asize = SIZEOF_RESIDENT;
+
+ /* Insert empty ATTR_DATA */
+ attr->type = ATTR_DATA;
+ attr->size = cpu_to_le32(SIZEOF_RESIDENT);
+ attr->name_off = SIZEOF_RESIDENT_LE;
+ attr->res.data_off = SIZEOF_RESIDENT_LE;
+ } else if (S_ISREG(mode)) {
+ /*
+ * Regular file. Create empty non resident data attribute.
+ */
+ attr->type = ATTR_DATA;
+ attr->non_res = 1;
+ attr->nres.evcn = cpu_to_le64(-1ll);
+ if (fa & FILE_ATTRIBUTE_SPARSE_FILE) {
+ attr->size = cpu_to_le32(SIZEOF_NONRESIDENT_EX + 8);
+ attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
+ attr->flags = ATTR_FLAG_SPARSED;
+ asize = SIZEOF_NONRESIDENT_EX + 8;
+ } else if (fa & FILE_ATTRIBUTE_COMPRESSED) {
+ attr->size = cpu_to_le32(SIZEOF_NONRESIDENT_EX + 8);
+ attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
+ attr->flags = ATTR_FLAG_COMPRESSED;
+ attr->nres.c_unit = COMPRESSION_UNIT;
+ asize = SIZEOF_NONRESIDENT_EX + 8;
+ } else {
+ attr->size = cpu_to_le32(SIZEOF_NONRESIDENT + 8);
+ attr->name_off = SIZEOF_NONRESIDENT_LE;
+ asize = SIZEOF_NONRESIDENT + 8;
+ }
+ attr->nres.run_off = attr->name_off;
+ } else {
+ /*
+ * Node. Create empty resident data attribute.
+ */
+ attr->type = ATTR_DATA;
+ attr->size = cpu_to_le32(SIZEOF_RESIDENT);
+ attr->name_off = SIZEOF_RESIDENT_LE;
+ if (fa & FILE_ATTRIBUTE_SPARSE_FILE)
+ attr->flags = ATTR_FLAG_SPARSED;
+ else if (fa & FILE_ATTRIBUTE_COMPRESSED)
+ attr->flags = ATTR_FLAG_COMPRESSED;
+ attr->res.data_off = SIZEOF_RESIDENT_LE;
+ asize = SIZEOF_RESIDENT;
+ ni->ni_flags |= NI_FLAG_RESIDENT;
+ }
+
+ if (S_ISDIR(mode)) {
+ ni->ni_flags |= NI_FLAG_DIR;
+ err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30);
+ if (err)
+ goto out4;
+ } else if (S_ISLNK(mode)) {
+ rp = ntfs_create_reparse_buffer(sbi, symname, size, &nsize);
+
+ if (IS_ERR(rp)) {
+ err = PTR_ERR(rp);
+ rp = NULL;
+ goto out4;
+ }
+
+ /*
+ * Insert ATTR_REPARSE.
+ */
+ attr = Add2Ptr(attr, asize);
+ attr->type = ATTR_REPARSE;
+ attr->id = cpu_to_le16(aid++);
+
+ /* Resident or non resident? */
+ asize = ALIGN(SIZEOF_RESIDENT + nsize, 8);
+ t16 = PtrOffset(rec, attr);
+
+ /*
+ * Below function 'ntfs_save_wsl_perm' requires 0x78 bytes.
+ * It is good idea to keep extened attributes resident.
+ */
+ if (asize + t16 + 0x78 + 8 > sbi->record_size) {
+ CLST alen;
+ CLST clst = bytes_to_cluster(sbi, nsize);
+
+ /* Bytes per runs. */
+ t16 = sbi->record_size - t16 - SIZEOF_NONRESIDENT;
+
+ attr->non_res = 1;
+ attr->nres.evcn = cpu_to_le64(clst - 1);
+ attr->name_off = SIZEOF_NONRESIDENT_LE;
+ attr->nres.run_off = attr->name_off;
+ attr->nres.data_size = cpu_to_le64(nsize);
+ attr->nres.valid_size = attr->nres.data_size;
+ attr->nres.alloc_size =
+ cpu_to_le64(ntfs_up_cluster(sbi, nsize));
+
+ err = attr_allocate_clusters(sbi, &ni->file.run, 0, 0,
+ clst, NULL, 0, &alen, 0,
+ NULL);
+ if (err)
+ goto out5;
+
+ err = run_pack(&ni->file.run, 0, clst,
+ Add2Ptr(attr, SIZEOF_NONRESIDENT), t16,
+ &vcn);
+ if (err < 0)
+ goto out5;
+
+ if (vcn != clst) {
+ err = -EINVAL;
+ goto out5;
+ }
+
+ asize = SIZEOF_NONRESIDENT + ALIGN(err, 8);
+ } else {
+ attr->res.data_off = SIZEOF_RESIDENT_LE;
+ attr->res.data_size = cpu_to_le32(nsize);
+ memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize);
+ nsize = 0;
+ }
+ /* Size of symlink equals the length of input string. */
+ inode->i_size = size;
+
+ attr->size = cpu_to_le32(asize);
+
+ err = ntfs_insert_reparse(sbi, IO_REPARSE_TAG_SYMLINK,
+ &new_de->ref);
+ if (err)
+ goto out5;
+
+ rp_inserted = true;
+ }
+
+ attr = Add2Ptr(attr, asize);
+ attr->type = ATTR_END;
+
+ rec->used = cpu_to_le32(PtrOffset(rec, attr) + 8);
+ rec->next_attr_id = cpu_to_le16(aid);
+
+ /* Step 2: Add new name in index. */
+ err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0);
+ if (err)
+ goto out6;
+
+ /* Unlock parent directory before ntfs_init_acl. */
+ ni_unlock(dir_ni);
+
+ inode->i_generation = le16_to_cpu(rec->seq);
+
+ dir->i_mtime = dir->i_ctime = inode->i_atime;
+
+ if (S_ISDIR(mode)) {
+ inode->i_op = &ntfs_dir_inode_operations;
+ inode->i_fop = &ntfs_dir_operations;
+ } else if (S_ISLNK(mode)) {
+ inode->i_op = &ntfs_link_inode_operations;
+ inode->i_fop = NULL;
+ inode->i_mapping->a_ops = &ntfs_aops;
+ inode->i_size = size;
+ inode_nohighmem(inode);
+ } else if (S_ISREG(mode)) {
+ inode->i_op = &ntfs_file_inode_operations;
+ inode->i_fop = &ntfs_file_operations;
+ inode->i_mapping->a_ops =
+ is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops;
+ init_rwsem(&ni->file.run_lock);
+ } else {
+ inode->i_op = &ntfs_special_inode_operations;
+ init_special_inode(inode, mode, dev);
+ }
+
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+ if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) {
+ err = ntfs_init_acl(mnt_userns, inode, dir);
+ if (err)
+ goto out7;
+ } else
+#endif
+ {
+ inode->i_flags |= S_NOSEC;
+ }
+
+ /* Write non resident data. */
+ if (nsize) {
+ err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0);
+ if (err)
+ goto out7;
+ }
+
+ /*
+ * Call 'd_instantiate' after inode->i_op is set
+ * but before finish_open.
+ */
+ d_instantiate(dentry, inode);
+
+ ntfs_save_wsl_perm(inode);
+ mark_inode_dirty(dir);
+ mark_inode_dirty(inode);
+
+ /* Normal exit. */
+ goto out2;
+
+out7:
+
+ /* Undo 'indx_insert_entry'. */
+ ni_lock_dir(dir_ni);
+ indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1,
+ le16_to_cpu(new_de->key_size), sbi);
+ /* ni_unlock(dir_ni); will be called later. */
+out6:
+ if (rp_inserted)
+ ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref);
+
+out5:
+ if (!S_ISDIR(mode))
+ run_deallocate(sbi, &ni->file.run, false);
+
+out4:
+ clear_rec_inuse(rec);
+ clear_nlink(inode);
+ ni->mi.dirty = false;
+ discard_new_inode(inode);
+out3:
+ ntfs_mark_rec_free(sbi, ino, false);
+
+out2:
+ __putname(new_de);
+ kfree(rp);
+
+out1:
+ if (err) {
+ ni_unlock(dir_ni);
+ return ERR_PTR(err);
+ }
+
+ unlock_new_inode(inode);
+
+ return inode;
+}
+
+int ntfs_link_inode(struct inode *inode, struct dentry *dentry)
+{
+ int err;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+ struct NTFS_DE *de;
+
+ /* Allocate PATH_MAX bytes. */
+ de = __getname();
+ if (!de)
+ return -ENOMEM;
+
+ /* Mark rw ntfs as dirty. It will be cleared at umount. */
+ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+
+ /* Construct 'de'. */
+ err = fill_name_de(sbi, de, &dentry->d_name, NULL);
+ if (err)
+ goto out;
+
+ err = ni_add_name(ntfs_i(d_inode(dentry->d_parent)), ni, de);
+out:
+ __putname(de);
+ return err;
+}
+
+/*
+ * ntfs_unlink_inode
+ *
+ * inode_operations::unlink
+ * inode_operations::rmdir
+ */
+int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
+{
+ int err;
+ struct ntfs_sb_info *sbi = dir->i_sb->s_fs_info;
+ struct inode *inode = d_inode(dentry);
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct ntfs_inode *dir_ni = ntfs_i(dir);
+ struct NTFS_DE *de, *de2 = NULL;
+ int undo_remove;
+
+ if (ntfs_is_meta_file(sbi, ni->mi.rno))
+ return -EINVAL;
+
+ /* Allocate PATH_MAX bytes. */
+ de = __getname();
+ if (!de)
+ return -ENOMEM;
+
+ ni_lock(ni);
+
+ if (S_ISDIR(inode->i_mode) && !dir_is_empty(inode)) {
+ err = -ENOTEMPTY;
+ goto out;
+ }
+
+ err = fill_name_de(sbi, de, &dentry->d_name, NULL);
+ if (err < 0)
+ goto out;
+
+ undo_remove = 0;
+ err = ni_remove_name(dir_ni, ni, de, &de2, &undo_remove);
+
+ if (!err) {
+ drop_nlink(inode);
+ dir->i_mtime = dir->i_ctime = current_time(dir);
+ mark_inode_dirty(dir);
+ inode->i_ctime = dir->i_ctime;
+ if (inode->i_nlink)
+ mark_inode_dirty(inode);
+ } else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) {
+ _ntfs_bad_inode(inode);
+ } else {
+ if (ni_is_dirty(dir))
+ mark_inode_dirty(dir);
+ if (ni_is_dirty(inode))
+ mark_inode_dirty(inode);
+ }
+
+out:
+ ni_unlock(ni);
+ __putname(de);
+ return err;
+}
+
+void ntfs_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages_final(&inode->i_data);
+
+ if (inode->i_nlink)
+ _ni_write_inode(inode, inode_needs_sync(inode));
+
+ invalidate_inode_buffers(inode);
+ clear_inode(inode);
+
+ ni_clear(ntfs_i(inode));
+}
+
+static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
+ int buflen)
+{
+ int i, err = -EINVAL;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct super_block *sb = inode->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ u64 size;
+ u16 ulen = 0;
+ void *to_free = NULL;
+ struct REPARSE_DATA_BUFFER *rp;
+ const __le16 *uname;
+ struct ATTRIB *attr;
+
+ /* Reparse data present. Try to parse it. */
+ static_assert(!offsetof(struct REPARSE_DATA_BUFFER, ReparseTag));
+ static_assert(sizeof(u32) == sizeof(rp->ReparseTag));
+
+ *buffer = 0;
+
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL);
+ if (!attr)
+ goto out;
+
+ if (!attr->non_res) {
+ rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER));
+ if (!rp)
+ goto out;
+ size = le32_to_cpu(attr->res.data_size);
+ } else {
+ size = le64_to_cpu(attr->nres.data_size);
+ rp = NULL;
+ }
+
+ if (size > sbi->reparse.max_size || size <= sizeof(u32))
+ goto out;
+
+ if (!rp) {
+ rp = kmalloc(size, GFP_NOFS);
+ if (!rp) {
+ err = -ENOMEM;
+ goto out;
+ }
+ to_free = rp;
+ /* Read into temporal buffer. */
+ err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, size, NULL);
+ if (err)
+ goto out;
+ }
+
+ /* Microsoft Tag. */
+ switch (rp->ReparseTag) {
+ case IO_REPARSE_TAG_MOUNT_POINT:
+ /* Mount points and junctions. */
+ /* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? */
+ if (size <= offsetof(struct REPARSE_DATA_BUFFER,
+ MountPointReparseBuffer.PathBuffer))
+ goto out;
+ uname = Add2Ptr(rp,
+ offsetof(struct REPARSE_DATA_BUFFER,
+ MountPointReparseBuffer.PathBuffer) +
+ le16_to_cpu(rp->MountPointReparseBuffer
+ .PrintNameOffset));
+ ulen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength);
+ break;
+
+ case IO_REPARSE_TAG_SYMLINK:
+ /* FolderSymbolicLink */
+ /* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? */
+ if (size <= offsetof(struct REPARSE_DATA_BUFFER,
+ SymbolicLinkReparseBuffer.PathBuffer))
+ goto out;
+ uname = Add2Ptr(
+ rp, offsetof(struct REPARSE_DATA_BUFFER,
+ SymbolicLinkReparseBuffer.PathBuffer) +
+ le16_to_cpu(rp->SymbolicLinkReparseBuffer
+ .PrintNameOffset));
+ ulen = le16_to_cpu(
+ rp->SymbolicLinkReparseBuffer.PrintNameLength);
+ break;
+
+ case IO_REPARSE_TAG_CLOUD:
+ case IO_REPARSE_TAG_CLOUD_1:
+ case IO_REPARSE_TAG_CLOUD_2:
+ case IO_REPARSE_TAG_CLOUD_3:
+ case IO_REPARSE_TAG_CLOUD_4:
+ case IO_REPARSE_TAG_CLOUD_5:
+ case IO_REPARSE_TAG_CLOUD_6:
+ case IO_REPARSE_TAG_CLOUD_7:
+ case IO_REPARSE_TAG_CLOUD_8:
+ case IO_REPARSE_TAG_CLOUD_9:
+ case IO_REPARSE_TAG_CLOUD_A:
+ case IO_REPARSE_TAG_CLOUD_B:
+ case IO_REPARSE_TAG_CLOUD_C:
+ case IO_REPARSE_TAG_CLOUD_D:
+ case IO_REPARSE_TAG_CLOUD_E:
+ case IO_REPARSE_TAG_CLOUD_F:
+ err = sizeof("OneDrive") - 1;
+ if (err > buflen)
+ err = buflen;
+ memcpy(buffer, "OneDrive", err);
+ goto out;
+
+ default:
+ if (IsReparseTagMicrosoft(rp->ReparseTag)) {
+ /* Unknown Microsoft Tag. */
+ goto out;
+ }
+ if (!IsReparseTagNameSurrogate(rp->ReparseTag) ||
+ size <= sizeof(struct REPARSE_POINT)) {
+ goto out;
+ }
+
+ /* Users tag. */
+ uname = Add2Ptr(rp, sizeof(struct REPARSE_POINT));
+ ulen = le16_to_cpu(rp->ReparseDataLength) -
+ sizeof(struct REPARSE_POINT);
+ }
+
+ /* Convert nlen from bytes to UNICODE chars. */
+ ulen >>= 1;
+
+ /* Check that name is available. */
+ if (!ulen || uname + ulen > (__le16 *)Add2Ptr(rp, size))
+ goto out;
+
+ /* If name is already zero terminated then truncate it now. */
+ if (!uname[ulen - 1])
+ ulen -= 1;
+
+ err = ntfs_utf16_to_nls(sbi, uname, ulen, buffer, buflen);
+
+ if (err < 0)
+ goto out;
+
+ /* Translate Windows '\' into Linux '/'. */
+ for (i = 0; i < err; i++) {
+ if (buffer[i] == '\\')
+ buffer[i] = '/';
+ }
+
+ /* Always set last zero. */
+ buffer[err] = 0;
+out:
+ kfree(to_free);
+ return err;
+}
+
+static const char *ntfs_get_link(struct dentry *de, struct inode *inode,
+ struct delayed_call *done)
+{
+ int err;
+ char *ret;
+
+ if (!de)
+ return ERR_PTR(-ECHILD);
+
+ ret = kmalloc(PAGE_SIZE, GFP_NOFS);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ err = ntfs_readlink_hlp(inode, ret, PAGE_SIZE);
+ if (err < 0) {
+ kfree(ret);
+ return ERR_PTR(err);
+ }
+
+ set_delayed_call(done, kfree_link, ret);
+
+ return ret;
+}
+
+// clang-format off
+const struct inode_operations ntfs_link_inode_operations = {
+ .get_link = ntfs_get_link,
+ .setattr = ntfs3_setattr,
+ .listxattr = ntfs_listxattr,
+ .permission = ntfs_permission,
+};
+
+const struct address_space_operations ntfs_aops = {
+ .read_folio = ntfs_read_folio,
+ .readahead = ntfs_readahead,
+ .writepage = ntfs_writepage,
+ .writepages = ntfs_writepages,
+ .write_begin = ntfs_write_begin,
+ .write_end = ntfs_write_end,
+ .direct_IO = ntfs_direct_IO,
+ .bmap = ntfs_bmap,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
+};
+
+const struct address_space_operations ntfs_aops_cmpr = {
+ .read_folio = ntfs_read_folio,
+ .readahead = ntfs_readahead,
+};
+// clang-format on
diff --git a/fs/ntfs3/lib/decompress_common.c b/fs/ntfs3/lib/decompress_common.c
new file mode 100644
index 000000000..e96652240
--- /dev/null
+++ b/fs/ntfs3/lib/decompress_common.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * decompress_common.c - Code shared by the XPRESS and LZX decompressors
+ *
+ * Copyright (C) 2015 Eric Biggers
+ */
+
+#include "decompress_common.h"
+
+/*
+ * make_huffman_decode_table() -
+ *
+ * Build a decoding table for a canonical prefix code, or "Huffman code".
+ *
+ * This is an internal function, not part of the library API!
+ *
+ * This takes as input the length of the codeword for each symbol in the
+ * alphabet and produces as output a table that can be used for fast
+ * decoding of prefix-encoded symbols using read_huffsym().
+ *
+ * Strictly speaking, a canonical prefix code might not be a Huffman
+ * code. But this algorithm will work either way; and in fact, since
+ * Huffman codes are defined in terms of symbol frequencies, there is no
+ * way for the decompressor to know whether the code is a true Huffman
+ * code or not until all symbols have been decoded.
+ *
+ * Because the prefix code is assumed to be "canonical", it can be
+ * reconstructed directly from the codeword lengths. A prefix code is
+ * canonical if and only if a longer codeword never lexicographically
+ * precedes a shorter codeword, and the lexicographic ordering of
+ * codewords of the same length is the same as the lexicographic ordering
+ * of the corresponding symbols. Consequently, we can sort the symbols
+ * primarily by codeword length and secondarily by symbol value, then
+ * reconstruct the prefix code by generating codewords lexicographically
+ * in that order.
+ *
+ * This function does not, however, generate the prefix code explicitly.
+ * Instead, it directly builds a table for decoding symbols using the
+ * code. The basic idea is this: given the next 'max_codeword_len' bits
+ * in the input, we can look up the decoded symbol by indexing a table
+ * containing 2**max_codeword_len entries. A codeword with length
+ * 'max_codeword_len' will have exactly one entry in this table, whereas
+ * a codeword shorter than 'max_codeword_len' will have multiple entries
+ * in this table. Precisely, a codeword of length n will be represented
+ * by 2**(max_codeword_len - n) entries in this table. The 0-based index
+ * of each such entry will contain the corresponding codeword as a prefix
+ * when zero-padded on the left to 'max_codeword_len' binary digits.
+ *
+ * That's the basic idea, but we implement two optimizations regarding
+ * the format of the decode table itself:
+ *
+ * - For many compression formats, the maximum codeword length is too
+ * long for it to be efficient to build the full decoding table
+ * whenever a new prefix code is used. Instead, we can build the table
+ * using only 2**table_bits entries, where 'table_bits' is some number
+ * less than or equal to 'max_codeword_len'. Then, only codewords of
+ * length 'table_bits' and shorter can be directly looked up. For
+ * longer codewords, the direct lookup instead produces the root of a
+ * binary tree. Using this tree, the decoder can do traditional
+ * bit-by-bit decoding of the remainder of the codeword. Child nodes
+ * are allocated in extra entries at the end of the table; leaf nodes
+ * contain symbols. Note that the long-codeword case is, in general,
+ * not performance critical, since in Huffman codes the most frequently
+ * used symbols are assigned the shortest codeword lengths.
+ *
+ * - When we decode a symbol using a direct lookup of the table, we still
+ * need to know its length so that the bitstream can be advanced by the
+ * appropriate number of bits. The simple solution is to simply retain
+ * the 'lens' array and use the decoded symbol as an index into it.
+ * However, this requires two separate array accesses in the fast path.
+ * The optimization is to store the length directly in the decode
+ * table. We use the bottom 11 bits for the symbol and the top 5 bits
+ * for the length. In addition, to combine this optimization with the
+ * previous one, we introduce a special case where the top 2 bits of
+ * the length are both set if the entry is actually the root of a
+ * binary tree.
+ *
+ * @decode_table:
+ * The array in which to create the decoding table. This must have
+ * a length of at least ((2**table_bits) + 2 * num_syms) entries.
+ *
+ * @num_syms:
+ * The number of symbols in the alphabet; also, the length of the
+ * 'lens' array. Must be less than or equal to 2048.
+ *
+ * @table_bits:
+ * The order of the decode table size, as explained above. Must be
+ * less than or equal to 13.
+ *
+ * @lens:
+ * An array of length @num_syms, indexable by symbol, that gives the
+ * length of the codeword, in bits, for that symbol. The length can
+ * be 0, which means that the symbol does not have a codeword
+ * assigned.
+ *
+ * @max_codeword_len:
+ * The longest codeword length allowed in the compression format.
+ * All entries in 'lens' must be less than or equal to this value.
+ * This must be less than or equal to 23.
+ *
+ * @working_space
+ * A temporary array of length '2 * (max_codeword_len + 1) +
+ * num_syms'.
+ *
+ * Returns 0 on success, or -1 if the lengths do not form a valid prefix
+ * code.
+ */
+int make_huffman_decode_table(u16 decode_table[], const u32 num_syms,
+ const u32 table_bits, const u8 lens[],
+ const u32 max_codeword_len,
+ u16 working_space[])
+{
+ const u32 table_num_entries = 1 << table_bits;
+ u16 * const len_counts = &working_space[0];
+ u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
+ u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
+ int left;
+ void *decode_table_ptr;
+ u32 sym_idx;
+ u32 codeword_len;
+ u32 stores_per_loop;
+ u32 decode_table_pos;
+ u32 len;
+ u32 sym;
+
+ /* Count how many symbols have each possible codeword length.
+ * Note that a length of 0 indicates the corresponding symbol is not
+ * used in the code and therefore does not have a codeword.
+ */
+ for (len = 0; len <= max_codeword_len; len++)
+ len_counts[len] = 0;
+ for (sym = 0; sym < num_syms; sym++)
+ len_counts[lens[sym]]++;
+
+ /* We can assume all lengths are <= max_codeword_len, but we
+ * cannot assume they form a valid prefix code. A codeword of
+ * length n should require a proportion of the codespace equaling
+ * (1/2)^n. The code is valid if and only if the codespace is
+ * exactly filled by the lengths, by this measure.
+ */
+ left = 1;
+ for (len = 1; len <= max_codeword_len; len++) {
+ left <<= 1;
+ left -= len_counts[len];
+ if (left < 0) {
+ /* The lengths overflow the codespace; that is, the code
+ * is over-subscribed.
+ */
+ return -1;
+ }
+ }
+
+ if (left) {
+ /* The lengths do not fill the codespace; that is, they form an
+ * incomplete set.
+ */
+ if (left == (1 << max_codeword_len)) {
+ /* The code is completely empty. This is arguably
+ * invalid, but in fact it is valid in LZX and XPRESS,
+ * so we must allow it. By definition, no symbols can
+ * be decoded with an empty code. Consequently, we
+ * technically don't even need to fill in the decode
+ * table. However, to avoid accessing uninitialized
+ * memory if the algorithm nevertheless attempts to
+ * decode symbols using such a code, we zero out the
+ * decode table.
+ */
+ memset(decode_table, 0,
+ table_num_entries * sizeof(decode_table[0]));
+ return 0;
+ }
+ return -1;
+ }
+
+ /* Sort the symbols primarily by length and secondarily by symbol order.
+ */
+
+ /* Initialize 'offsets' so that offsets[len] for 1 <= len <=
+ * max_codeword_len is the number of codewords shorter than 'len' bits.
+ */
+ offsets[1] = 0;
+ for (len = 1; len < max_codeword_len; len++)
+ offsets[len + 1] = offsets[len] + len_counts[len];
+
+ /* Use the 'offsets' array to sort the symbols. Note that we do not
+ * include symbols that are not used in the code. Consequently, fewer
+ * than 'num_syms' entries in 'sorted_syms' may be filled.
+ */
+ for (sym = 0; sym < num_syms; sym++)
+ if (lens[sym])
+ sorted_syms[offsets[lens[sym]]++] = sym;
+
+ /* Fill entries for codewords with length <= table_bits
+ * --- that is, those short enough for a direct mapping.
+ *
+ * The table will start with entries for the shortest codeword(s), which
+ * have the most entries. From there, the number of entries per
+ * codeword will decrease.
+ */
+ decode_table_ptr = decode_table;
+ sym_idx = 0;
+ codeword_len = 1;
+ stores_per_loop = (1 << (table_bits - codeword_len));
+ for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) {
+ u32 end_sym_idx = sym_idx + len_counts[codeword_len];
+
+ for (; sym_idx < end_sym_idx; sym_idx++) {
+ u16 entry;
+ u16 *p;
+ u32 n;
+
+ entry = ((u32)codeword_len << 11) | sorted_syms[sym_idx];
+ p = (u16 *)decode_table_ptr;
+ n = stores_per_loop;
+
+ do {
+ *p++ = entry;
+ } while (--n);
+
+ decode_table_ptr = p;
+ }
+ }
+
+ /* If we've filled in the entire table, we are done. Otherwise,
+ * there are codewords longer than table_bits for which we must
+ * generate binary trees.
+ */
+ decode_table_pos = (u16 *)decode_table_ptr - decode_table;
+ if (decode_table_pos != table_num_entries) {
+ u32 j;
+ u32 next_free_tree_slot;
+ u32 cur_codeword;
+
+ /* First, zero out the remaining entries. This is
+ * necessary so that these entries appear as
+ * "unallocated" in the next part. Each of these entries
+ * will eventually be filled with the representation of
+ * the root node of a binary tree.
+ */
+ j = decode_table_pos;
+ do {
+ decode_table[j] = 0;
+ } while (++j != table_num_entries);
+
+ /* We allocate child nodes starting at the end of the
+ * direct lookup table. Note that there should be
+ * 2*num_syms extra entries for this purpose, although
+ * fewer than this may actually be needed.
+ */
+ next_free_tree_slot = table_num_entries;
+
+ /* Iterate through each codeword with length greater than
+ * 'table_bits', primarily in order of codeword length
+ * and secondarily in order of symbol.
+ */
+ for (cur_codeword = decode_table_pos << 1;
+ codeword_len <= max_codeword_len;
+ codeword_len++, cur_codeword <<= 1) {
+ u32 end_sym_idx = sym_idx + len_counts[codeword_len];
+
+ for (; sym_idx < end_sym_idx; sym_idx++, cur_codeword++) {
+ /* 'sorted_sym' is the symbol represented by the
+ * codeword.
+ */
+ u32 sorted_sym = sorted_syms[sym_idx];
+ u32 extra_bits = codeword_len - table_bits;
+ u32 node_idx = cur_codeword >> extra_bits;
+
+ /* Go through each bit of the current codeword
+ * beyond the prefix of length @table_bits and
+ * walk the appropriate binary tree, allocating
+ * any slots that have not yet been allocated.
+ *
+ * Note that the 'pointer' entry to the binary
+ * tree, which is stored in the direct lookup
+ * portion of the table, is represented
+ * identically to other internal (non-leaf)
+ * nodes of the binary tree; it can be thought
+ * of as simply the root of the tree. The
+ * representation of these internal nodes is
+ * simply the index of the left child combined
+ * with the special bits 0xC000 to distinguish
+ * the entry from direct mapping and leaf node
+ * entries.
+ */
+ do {
+ /* At least one bit remains in the
+ * codeword, but the current node is an
+ * unallocated leaf. Change it to an
+ * internal node.
+ */
+ if (decode_table[node_idx] == 0) {
+ decode_table[node_idx] =
+ next_free_tree_slot | 0xC000;
+ decode_table[next_free_tree_slot++] = 0;
+ decode_table[next_free_tree_slot++] = 0;
+ }
+
+ /* Go to the left child if the next bit
+ * in the codeword is 0; otherwise go to
+ * the right child.
+ */
+ node_idx = decode_table[node_idx] & 0x3FFF;
+ --extra_bits;
+ node_idx += (cur_codeword >> extra_bits) & 1;
+ } while (extra_bits != 0);
+
+ /* We've traversed the tree using the entire
+ * codeword, and we're now at the entry where
+ * the actual symbol will be stored. This is
+ * distinguished from internal nodes by not
+ * having its high two bits set.
+ */
+ decode_table[node_idx] = sorted_sym;
+ }
+ }
+ }
+ return 0;
+}
diff --git a/fs/ntfs3/lib/decompress_common.h b/fs/ntfs3/lib/decompress_common.h
new file mode 100644
index 000000000..dd7ced000
--- /dev/null
+++ b/fs/ntfs3/lib/decompress_common.h
@@ -0,0 +1,343 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * decompress_common.h - Code shared by the XPRESS and LZX decompressors
+ *
+ * Copyright (C) 2015 Eric Biggers
+ */
+
+#ifndef _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H
+#define _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H
+
+#include <linux/string.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+
+/* "Force inline" macro (not required, but helpful for performance) */
+#define forceinline __always_inline
+
+/* Enable whole-word match copying on selected architectures */
+#if defined(__i386__) || defined(__x86_64__) || defined(__ARM_FEATURE_UNALIGNED)
+# define FAST_UNALIGNED_ACCESS
+#endif
+
+/* Size of a machine word */
+#define WORDBYTES (sizeof(size_t))
+
+static forceinline void
+copy_unaligned_word(const void *src, void *dst)
+{
+ put_unaligned(get_unaligned((const size_t *)src), (size_t *)dst);
+}
+
+
+/* Generate a "word" with platform-dependent size whose bytes all contain the
+ * value 'b'.
+ */
+static forceinline size_t repeat_byte(u8 b)
+{
+ size_t v;
+
+ v = b;
+ v |= v << 8;
+ v |= v << 16;
+ v |= v << ((WORDBYTES == 8) ? 32 : 0);
+ return v;
+}
+
+/* Structure that encapsulates a block of in-memory data being interpreted as a
+ * stream of bits, optionally with interwoven literal bytes. Bits are assumed
+ * to be stored in little endian 16-bit coding units, with the bits ordered high
+ * to low.
+ */
+struct input_bitstream {
+
+ /* Bits that have been read from the input buffer. The bits are
+ * left-justified; the next bit is always bit 31.
+ */
+ u32 bitbuf;
+
+ /* Number of bits currently held in @bitbuf. */
+ u32 bitsleft;
+
+ /* Pointer to the next byte to be retrieved from the input buffer. */
+ const u8 *next;
+
+ /* Pointer to just past the end of the input buffer. */
+ const u8 *end;
+};
+
+/* Initialize a bitstream to read from the specified input buffer. */
+static forceinline void init_input_bitstream(struct input_bitstream *is,
+ const void *buffer, u32 size)
+{
+ is->bitbuf = 0;
+ is->bitsleft = 0;
+ is->next = buffer;
+ is->end = is->next + size;
+}
+
+/* Ensure the bit buffer variable for the bitstream contains at least @num_bits
+ * bits. Following this, bitstream_peek_bits() and/or bitstream_remove_bits()
+ * may be called on the bitstream to peek or remove up to @num_bits bits. Note
+ * that @num_bits must be <= 16.
+ */
+static forceinline void bitstream_ensure_bits(struct input_bitstream *is,
+ u32 num_bits)
+{
+ if (is->bitsleft < num_bits) {
+ if (is->end - is->next >= 2) {
+ is->bitbuf |= (u32)get_unaligned_le16(is->next)
+ << (16 - is->bitsleft);
+ is->next += 2;
+ }
+ is->bitsleft += 16;
+ }
+}
+
+/* Return the next @num_bits bits from the bitstream, without removing them.
+ * There must be at least @num_bits remaining in the buffer variable, from a
+ * previous call to bitstream_ensure_bits().
+ */
+static forceinline u32
+bitstream_peek_bits(const struct input_bitstream *is, const u32 num_bits)
+{
+ return (is->bitbuf >> 1) >> (sizeof(is->bitbuf) * 8 - num_bits - 1);
+}
+
+/* Remove @num_bits from the bitstream. There must be at least @num_bits
+ * remaining in the buffer variable, from a previous call to
+ * bitstream_ensure_bits().
+ */
+static forceinline void
+bitstream_remove_bits(struct input_bitstream *is, u32 num_bits)
+{
+ is->bitbuf <<= num_bits;
+ is->bitsleft -= num_bits;
+}
+
+/* Remove and return @num_bits bits from the bitstream. There must be at least
+ * @num_bits remaining in the buffer variable, from a previous call to
+ * bitstream_ensure_bits().
+ */
+static forceinline u32
+bitstream_pop_bits(struct input_bitstream *is, u32 num_bits)
+{
+ u32 bits = bitstream_peek_bits(is, num_bits);
+
+ bitstream_remove_bits(is, num_bits);
+ return bits;
+}
+
+/* Read and return the next @num_bits bits from the bitstream. */
+static forceinline u32
+bitstream_read_bits(struct input_bitstream *is, u32 num_bits)
+{
+ bitstream_ensure_bits(is, num_bits);
+ return bitstream_pop_bits(is, num_bits);
+}
+
+/* Read and return the next literal byte embedded in the bitstream. */
+static forceinline u8
+bitstream_read_byte(struct input_bitstream *is)
+{
+ if (unlikely(is->end == is->next))
+ return 0;
+ return *is->next++;
+}
+
+/* Read and return the next 16-bit integer embedded in the bitstream. */
+static forceinline u16
+bitstream_read_u16(struct input_bitstream *is)
+{
+ u16 v;
+
+ if (unlikely(is->end - is->next < 2))
+ return 0;
+ v = get_unaligned_le16(is->next);
+ is->next += 2;
+ return v;
+}
+
+/* Read and return the next 32-bit integer embedded in the bitstream. */
+static forceinline u32
+bitstream_read_u32(struct input_bitstream *is)
+{
+ u32 v;
+
+ if (unlikely(is->end - is->next < 4))
+ return 0;
+ v = get_unaligned_le32(is->next);
+ is->next += 4;
+ return v;
+}
+
+/* Read into @dst_buffer an array of literal bytes embedded in the bitstream.
+ * Return either a pointer to the byte past the last written, or NULL if the
+ * read overflows the input buffer.
+ */
+static forceinline void *bitstream_read_bytes(struct input_bitstream *is,
+ void *dst_buffer, size_t count)
+{
+ if ((size_t)(is->end - is->next) < count)
+ return NULL;
+ memcpy(dst_buffer, is->next, count);
+ is->next += count;
+ return (u8 *)dst_buffer + count;
+}
+
+/* Align the input bitstream on a coding-unit boundary. */
+static forceinline void bitstream_align(struct input_bitstream *is)
+{
+ is->bitsleft = 0;
+ is->bitbuf = 0;
+}
+
+extern int make_huffman_decode_table(u16 decode_table[], const u32 num_syms,
+ const u32 num_bits, const u8 lens[],
+ const u32 max_codeword_len,
+ u16 working_space[]);
+
+
+/* Reads and returns the next Huffman-encoded symbol from a bitstream. If the
+ * input data is exhausted, the Huffman symbol is decoded as if the missing bits
+ * are all zeroes.
+ */
+static forceinline u32 read_huffsym(struct input_bitstream *istream,
+ const u16 decode_table[],
+ u32 table_bits,
+ u32 max_codeword_len)
+{
+ u32 entry;
+ u32 key_bits;
+
+ bitstream_ensure_bits(istream, max_codeword_len);
+
+ /* Index the decode table by the next table_bits bits of the input. */
+ key_bits = bitstream_peek_bits(istream, table_bits);
+ entry = decode_table[key_bits];
+ if (entry < 0xC000) {
+ /* Fast case: The decode table directly provided the
+ * symbol and codeword length. The low 11 bits are the
+ * symbol, and the high 5 bits are the codeword length.
+ */
+ bitstream_remove_bits(istream, entry >> 11);
+ return entry & 0x7FF;
+ }
+ /* Slow case: The codeword for the symbol is longer than
+ * table_bits, so the symbol does not have an entry
+ * directly in the first (1 << table_bits) entries of the
+ * decode table. Traverse the appropriate binary tree
+ * bit-by-bit to decode the symbol.
+ */
+ bitstream_remove_bits(istream, table_bits);
+ do {
+ key_bits = (entry & 0x3FFF) + bitstream_pop_bits(istream, 1);
+ } while ((entry = decode_table[key_bits]) >= 0xC000);
+ return entry;
+}
+
+/*
+ * Copy an LZ77 match at (dst - offset) to dst.
+ *
+ * The length and offset must be already validated --- that is, (dst - offset)
+ * can't underrun the output buffer, and (dst + length) can't overrun the output
+ * buffer. Also, the length cannot be 0.
+ *
+ * @bufend points to the byte past the end of the output buffer. This function
+ * won't write any data beyond this position.
+ *
+ * Returns dst + length.
+ */
+static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend,
+ u32 min_length)
+{
+ const u8 *src = dst - offset;
+
+ /*
+ * Try to copy one machine word at a time. On i386 and x86_64 this is
+ * faster than copying one byte at a time, unless the data is
+ * near-random and all the matches have very short lengths. Note that
+ * since this requires unaligned memory accesses, it won't necessarily
+ * be faster on every architecture.
+ *
+ * Also note that we might copy more than the length of the match. For
+ * example, if a word is 8 bytes and the match is of length 5, then
+ * we'll simply copy 8 bytes. This is okay as long as we don't write
+ * beyond the end of the output buffer, hence the check for (bufend -
+ * end >= WORDBYTES - 1).
+ */
+#ifdef FAST_UNALIGNED_ACCESS
+ u8 * const end = dst + length;
+
+ if (bufend - end >= (ptrdiff_t)(WORDBYTES - 1)) {
+
+ if (offset >= WORDBYTES) {
+ /* The source and destination words don't overlap. */
+
+ /* To improve branch prediction, one iteration of this
+ * loop is unrolled. Most matches are short and will
+ * fail the first check. But if that check passes, then
+ * it becomes increasing likely that the match is long
+ * and we'll need to continue copying.
+ */
+
+ copy_unaligned_word(src, dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+
+ if (dst < end) {
+ do {
+ copy_unaligned_word(src, dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ } while (dst < end);
+ }
+ return end;
+ } else if (offset == 1) {
+
+ /* Offset 1 matches are equivalent to run-length
+ * encoding of the previous byte. This case is common
+ * if the data contains many repeated bytes.
+ */
+ size_t v = repeat_byte(*(dst - 1));
+
+ do {
+ put_unaligned(v, (size_t *)dst);
+ src += WORDBYTES;
+ dst += WORDBYTES;
+ } while (dst < end);
+ return end;
+ }
+ /*
+ * We don't bother with special cases for other 'offset <
+ * WORDBYTES', which are usually rarer than 'offset == 1'. Extra
+ * checks will just slow things down. Actually, it's possible
+ * to handle all the 'offset < WORDBYTES' cases using the same
+ * code, but it still becomes more complicated doesn't seem any
+ * faster overall; it definitely slows down the more common
+ * 'offset == 1' case.
+ */
+ }
+#endif /* FAST_UNALIGNED_ACCESS */
+
+ /* Fall back to a bytewise copy. */
+
+ if (min_length >= 2) {
+ *dst++ = *src++;
+ length--;
+ }
+ if (min_length >= 3) {
+ *dst++ = *src++;
+ length--;
+ }
+ do {
+ *dst++ = *src++;
+ } while (--length);
+
+ return dst;
+}
+
+#endif /* _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H */
diff --git a/fs/ntfs3/lib/lib.h b/fs/ntfs3/lib/lib.h
new file mode 100644
index 000000000..90309a5ae
--- /dev/null
+++ b/fs/ntfs3/lib/lib.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Adapted for linux kernel by Alexander Mamaev:
+ * - remove implementations of get_unaligned_
+ * - assume GCC is always defined
+ * - ISO C90
+ * - linux kernel code style
+ */
+
+#ifndef _LINUX_NTFS3_LIB_LIB_H
+#define _LINUX_NTFS3_LIB_LIB_H
+
+#include <linux/types.h>
+
+/* globals from xpress_decompress.c */
+struct xpress_decompressor *xpress_allocate_decompressor(void);
+void xpress_free_decompressor(struct xpress_decompressor *d);
+int xpress_decompress(struct xpress_decompressor *__restrict d,
+ const void *__restrict compressed_data,
+ size_t compressed_size,
+ void *__restrict uncompressed_data,
+ size_t uncompressed_size);
+
+/* globals from lzx_decompress.c */
+struct lzx_decompressor *lzx_allocate_decompressor(void);
+void lzx_free_decompressor(struct lzx_decompressor *d);
+int lzx_decompress(struct lzx_decompressor *__restrict d,
+ const void *__restrict compressed_data,
+ size_t compressed_size, void *__restrict uncompressed_data,
+ size_t uncompressed_size);
+
+#endif /* _LINUX_NTFS3_LIB_LIB_H */
diff --git a/fs/ntfs3/lib/lzx_decompress.c b/fs/ntfs3/lib/lzx_decompress.c
new file mode 100644
index 000000000..6b16f0707
--- /dev/null
+++ b/fs/ntfs3/lib/lzx_decompress.c
@@ -0,0 +1,670 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * lzx_decompress.c - A decompressor for the LZX compression format, which can
+ * be used in "System Compressed" files. This is based on the code from wimlib.
+ * This code only supports a window size (dictionary size) of 32768 bytes, since
+ * this is the only size used in System Compression.
+ *
+ * Copyright (C) 2015 Eric Biggers
+ */
+
+#include "decompress_common.h"
+#include "lib.h"
+
+/* Number of literal byte values */
+#define LZX_NUM_CHARS 256
+
+/* The smallest and largest allowed match lengths */
+#define LZX_MIN_MATCH_LEN 2
+#define LZX_MAX_MATCH_LEN 257
+
+/* Number of distinct match lengths that can be represented */
+#define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1)
+
+/* Number of match lengths for which no length symbol is required */
+#define LZX_NUM_PRIMARY_LENS 7
+#define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1)
+
+/* Valid values of the 3-bit block type field */
+#define LZX_BLOCKTYPE_VERBATIM 1
+#define LZX_BLOCKTYPE_ALIGNED 2
+#define LZX_BLOCKTYPE_UNCOMPRESSED 3
+
+/* Number of offset slots for a window size of 32768 */
+#define LZX_NUM_OFFSET_SLOTS 30
+
+/* Number of symbols in the main code for a window size of 32768 */
+#define LZX_MAINCODE_NUM_SYMBOLS \
+ (LZX_NUM_CHARS + (LZX_NUM_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS))
+
+/* Number of symbols in the length code */
+#define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS)
+
+/* Number of symbols in the precode */
+#define LZX_PRECODE_NUM_SYMBOLS 20
+
+/* Number of bits in which each precode codeword length is represented */
+#define LZX_PRECODE_ELEMENT_SIZE 4
+
+/* Number of low-order bits of each match offset that are entropy-encoded in
+ * aligned offset blocks
+ */
+#define LZX_NUM_ALIGNED_OFFSET_BITS 3
+
+/* Number of symbols in the aligned offset code */
+#define LZX_ALIGNEDCODE_NUM_SYMBOLS (1 << LZX_NUM_ALIGNED_OFFSET_BITS)
+
+/* Mask for the match offset bits that are entropy-encoded in aligned offset
+ * blocks
+ */
+#define LZX_ALIGNED_OFFSET_BITMASK ((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1)
+
+/* Number of bits in which each aligned offset codeword length is represented */
+#define LZX_ALIGNEDCODE_ELEMENT_SIZE 3
+
+/* Maximum lengths (in bits) of the codewords in each Huffman code */
+#define LZX_MAX_MAIN_CODEWORD_LEN 16
+#define LZX_MAX_LEN_CODEWORD_LEN 16
+#define LZX_MAX_PRE_CODEWORD_LEN ((1 << LZX_PRECODE_ELEMENT_SIZE) - 1)
+#define LZX_MAX_ALIGNED_CODEWORD_LEN ((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1)
+
+/* The default "filesize" value used in pre/post-processing. In the LZX format
+ * used in cabinet files this value must be given to the decompressor, whereas
+ * in the LZX format used in WIM files and system-compressed files this value is
+ * fixed at 12000000.
+ */
+#define LZX_DEFAULT_FILESIZE 12000000
+
+/* Assumed block size when the encoded block size begins with a 0 bit. */
+#define LZX_DEFAULT_BLOCK_SIZE 32768
+
+/* Number of offsets in the recent (or "repeat") offsets queue. */
+#define LZX_NUM_RECENT_OFFSETS 3
+
+/* These values are chosen for fast decompression. */
+#define LZX_MAINCODE_TABLEBITS 11
+#define LZX_LENCODE_TABLEBITS 10
+#define LZX_PRECODE_TABLEBITS 6
+#define LZX_ALIGNEDCODE_TABLEBITS 7
+
+#define LZX_READ_LENS_MAX_OVERRUN 50
+
+/* Mapping: offset slot => first match offset that uses that offset slot.
+ */
+static const u32 lzx_offset_slot_base[LZX_NUM_OFFSET_SLOTS + 1] = {
+ 0, 1, 2, 3, 4, /* 0 --- 4 */
+ 6, 8, 12, 16, 24, /* 5 --- 9 */
+ 32, 48, 64, 96, 128, /* 10 --- 14 */
+ 192, 256, 384, 512, 768, /* 15 --- 19 */
+ 1024, 1536, 2048, 3072, 4096, /* 20 --- 24 */
+ 6144, 8192, 12288, 16384, 24576, /* 25 --- 29 */
+ 32768, /* extra */
+};
+
+/* Mapping: offset slot => how many extra bits must be read and added to the
+ * corresponding offset slot base to decode the match offset.
+ */
+static const u8 lzx_extra_offset_bits[LZX_NUM_OFFSET_SLOTS] = {
+ 0, 0, 0, 0, 1,
+ 1, 2, 2, 3, 3,
+ 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8,
+ 9, 9, 10, 10, 11,
+ 11, 12, 12, 13, 13,
+};
+
+/* Reusable heap-allocated memory for LZX decompression */
+struct lzx_decompressor {
+
+ /* Huffman decoding tables, and arrays that map symbols to codeword
+ * lengths
+ */
+
+ u16 maincode_decode_table[(1 << LZX_MAINCODE_TABLEBITS) +
+ (LZX_MAINCODE_NUM_SYMBOLS * 2)];
+ u8 maincode_lens[LZX_MAINCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN];
+
+
+ u16 lencode_decode_table[(1 << LZX_LENCODE_TABLEBITS) +
+ (LZX_LENCODE_NUM_SYMBOLS * 2)];
+ u8 lencode_lens[LZX_LENCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN];
+
+
+ u16 alignedcode_decode_table[(1 << LZX_ALIGNEDCODE_TABLEBITS) +
+ (LZX_ALIGNEDCODE_NUM_SYMBOLS * 2)];
+ u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS];
+
+ u16 precode_decode_table[(1 << LZX_PRECODE_TABLEBITS) +
+ (LZX_PRECODE_NUM_SYMBOLS * 2)];
+ u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS];
+
+ /* Temporary space for make_huffman_decode_table() */
+ u16 working_space[2 * (1 + LZX_MAX_MAIN_CODEWORD_LEN) +
+ LZX_MAINCODE_NUM_SYMBOLS];
+};
+
+static void undo_e8_translation(void *target, s32 input_pos)
+{
+ s32 abs_offset, rel_offset;
+
+ abs_offset = get_unaligned_le32(target);
+ if (abs_offset >= 0) {
+ if (abs_offset < LZX_DEFAULT_FILESIZE) {
+ /* "good translation" */
+ rel_offset = abs_offset - input_pos;
+ put_unaligned_le32(rel_offset, target);
+ }
+ } else {
+ if (abs_offset >= -input_pos) {
+ /* "compensating translation" */
+ rel_offset = abs_offset + LZX_DEFAULT_FILESIZE;
+ put_unaligned_le32(rel_offset, target);
+ }
+ }
+}
+
+/*
+ * Undo the 'E8' preprocessing used in LZX. Before compression, the
+ * uncompressed data was preprocessed by changing the targets of suspected x86
+ * CALL instructions from relative offsets to absolute offsets. After
+ * match/literal decoding, the decompressor must undo the translation.
+ */
+static void lzx_postprocess(u8 *data, u32 size)
+{
+ /*
+ * A worthwhile optimization is to push the end-of-buffer check into the
+ * relatively rare E8 case. This is possible if we replace the last six
+ * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte
+ * before reaching end-of-buffer. In addition, this scheme guarantees
+ * that no translation can begin following an E8 byte in the last 10
+ * bytes because a 4-byte offset containing E8 as its high byte is a
+ * large negative number that is not valid for translation. That is
+ * exactly what we need.
+ */
+ u8 *tail;
+ u8 saved_bytes[6];
+ u8 *p;
+
+ if (size <= 10)
+ return;
+
+ tail = &data[size - 6];
+ memcpy(saved_bytes, tail, 6);
+ memset(tail, 0xE8, 6);
+ p = data;
+ for (;;) {
+ while (*p != 0xE8)
+ p++;
+ if (p >= tail)
+ break;
+ undo_e8_translation(p + 1, p - data);
+ p += 5;
+ }
+ memcpy(tail, saved_bytes, 6);
+}
+
+/* Read a Huffman-encoded symbol using the precode. */
+static forceinline u32 read_presym(const struct lzx_decompressor *d,
+ struct input_bitstream *is)
+{
+ return read_huffsym(is, d->precode_decode_table,
+ LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN);
+}
+
+/* Read a Huffman-encoded symbol using the main code. */
+static forceinline u32 read_mainsym(const struct lzx_decompressor *d,
+ struct input_bitstream *is)
+{
+ return read_huffsym(is, d->maincode_decode_table,
+ LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN);
+}
+
+/* Read a Huffman-encoded symbol using the length code. */
+static forceinline u32 read_lensym(const struct lzx_decompressor *d,
+ struct input_bitstream *is)
+{
+ return read_huffsym(is, d->lencode_decode_table,
+ LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN);
+}
+
+/* Read a Huffman-encoded symbol using the aligned offset code. */
+static forceinline u32 read_alignedsym(const struct lzx_decompressor *d,
+ struct input_bitstream *is)
+{
+ return read_huffsym(is, d->alignedcode_decode_table,
+ LZX_ALIGNEDCODE_TABLEBITS,
+ LZX_MAX_ALIGNED_CODEWORD_LEN);
+}
+
+/*
+ * Read the precode from the compressed input bitstream, then use it to decode
+ * @num_lens codeword length values.
+ *
+ * @is: The input bitstream.
+ *
+ * @lens: An array that contains the length values from the previous time
+ * the codeword lengths for this Huffman code were read, or all 0's
+ * if this is the first time. This array must have at least
+ * (@num_lens + LZX_READ_LENS_MAX_OVERRUN) entries.
+ *
+ * @num_lens: Number of length values to decode.
+ *
+ * Returns 0 on success, or -1 if the data was invalid.
+ */
+static int lzx_read_codeword_lens(struct lzx_decompressor *d,
+ struct input_bitstream *is,
+ u8 *lens, u32 num_lens)
+{
+ u8 *len_ptr = lens;
+ u8 *lens_end = lens + num_lens;
+ int i;
+
+ /* Read the lengths of the precode codewords. These are given
+ * explicitly.
+ */
+ for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) {
+ d->precode_lens[i] =
+ bitstream_read_bits(is, LZX_PRECODE_ELEMENT_SIZE);
+ }
+
+ /* Make the decoding table for the precode. */
+ if (make_huffman_decode_table(d->precode_decode_table,
+ LZX_PRECODE_NUM_SYMBOLS,
+ LZX_PRECODE_TABLEBITS,
+ d->precode_lens,
+ LZX_MAX_PRE_CODEWORD_LEN,
+ d->working_space))
+ return -1;
+
+ /* Decode the codeword lengths. */
+ do {
+ u32 presym;
+ u8 len;
+
+ /* Read the next precode symbol. */
+ presym = read_presym(d, is);
+ if (presym < 17) {
+ /* Difference from old length */
+ len = *len_ptr - presym;
+ if ((s8)len < 0)
+ len += 17;
+ *len_ptr++ = len;
+ } else {
+ /* Special RLE values */
+
+ u32 run_len;
+
+ if (presym == 17) {
+ /* Run of 0's */
+ run_len = 4 + bitstream_read_bits(is, 4);
+ len = 0;
+ } else if (presym == 18) {
+ /* Longer run of 0's */
+ run_len = 20 + bitstream_read_bits(is, 5);
+ len = 0;
+ } else {
+ /* Run of identical lengths */
+ run_len = 4 + bitstream_read_bits(is, 1);
+ presym = read_presym(d, is);
+ if (presym > 17)
+ return -1;
+ len = *len_ptr - presym;
+ if ((s8)len < 0)
+ len += 17;
+ }
+
+ do {
+ *len_ptr++ = len;
+ } while (--run_len);
+ /* Worst case overrun is when presym == 18,
+ * run_len == 20 + 31, and only 1 length was remaining.
+ * So LZX_READ_LENS_MAX_OVERRUN == 50.
+ *
+ * Overrun while reading the first half of maincode_lens
+ * can corrupt the previous values in the second half.
+ * This doesn't really matter because the resulting
+ * lengths will still be in range, and data that
+ * generates overruns is invalid anyway.
+ */
+ }
+ } while (len_ptr < lens_end);
+
+ return 0;
+}
+
+/*
+ * Read the header of an LZX block and save the block type and (uncompressed)
+ * size in *block_type_ret and *block_size_ret, respectively.
+ *
+ * If the block is compressed, also update the Huffman decode @tables with the
+ * new Huffman codes. If the block is uncompressed, also update the match
+ * offset @queue with the new match offsets.
+ *
+ * Return 0 on success, or -1 if the data was invalid.
+ */
+static int lzx_read_block_header(struct lzx_decompressor *d,
+ struct input_bitstream *is,
+ int *block_type_ret,
+ u32 *block_size_ret,
+ u32 recent_offsets[])
+{
+ int block_type;
+ u32 block_size;
+ int i;
+
+ bitstream_ensure_bits(is, 4);
+
+ /* The first three bits tell us what kind of block it is, and should be
+ * one of the LZX_BLOCKTYPE_* values.
+ */
+ block_type = bitstream_pop_bits(is, 3);
+
+ /* Read the block size. */
+ if (bitstream_pop_bits(is, 1)) {
+ block_size = LZX_DEFAULT_BLOCK_SIZE;
+ } else {
+ block_size = 0;
+ block_size |= bitstream_read_bits(is, 8);
+ block_size <<= 8;
+ block_size |= bitstream_read_bits(is, 8);
+ }
+
+ switch (block_type) {
+
+ case LZX_BLOCKTYPE_ALIGNED:
+
+ /* Read the aligned offset code and prepare its decode table.
+ */
+
+ for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) {
+ d->alignedcode_lens[i] =
+ bitstream_read_bits(is,
+ LZX_ALIGNEDCODE_ELEMENT_SIZE);
+ }
+
+ if (make_huffman_decode_table(d->alignedcode_decode_table,
+ LZX_ALIGNEDCODE_NUM_SYMBOLS,
+ LZX_ALIGNEDCODE_TABLEBITS,
+ d->alignedcode_lens,
+ LZX_MAX_ALIGNED_CODEWORD_LEN,
+ d->working_space))
+ return -1;
+
+ /* Fall though, since the rest of the header for aligned offset
+ * blocks is the same as that for verbatim blocks.
+ */
+ fallthrough;
+
+ case LZX_BLOCKTYPE_VERBATIM:
+
+ /* Read the main code and prepare its decode table.
+ *
+ * Note that the codeword lengths in the main code are encoded
+ * in two parts: one part for literal symbols, and one part for
+ * match symbols.
+ */
+
+ if (lzx_read_codeword_lens(d, is, d->maincode_lens,
+ LZX_NUM_CHARS))
+ return -1;
+
+ if (lzx_read_codeword_lens(d, is,
+ d->maincode_lens + LZX_NUM_CHARS,
+ LZX_MAINCODE_NUM_SYMBOLS - LZX_NUM_CHARS))
+ return -1;
+
+ if (make_huffman_decode_table(d->maincode_decode_table,
+ LZX_MAINCODE_NUM_SYMBOLS,
+ LZX_MAINCODE_TABLEBITS,
+ d->maincode_lens,
+ LZX_MAX_MAIN_CODEWORD_LEN,
+ d->working_space))
+ return -1;
+
+ /* Read the length code and prepare its decode table. */
+
+ if (lzx_read_codeword_lens(d, is, d->lencode_lens,
+ LZX_LENCODE_NUM_SYMBOLS))
+ return -1;
+
+ if (make_huffman_decode_table(d->lencode_decode_table,
+ LZX_LENCODE_NUM_SYMBOLS,
+ LZX_LENCODE_TABLEBITS,
+ d->lencode_lens,
+ LZX_MAX_LEN_CODEWORD_LEN,
+ d->working_space))
+ return -1;
+
+ break;
+
+ case LZX_BLOCKTYPE_UNCOMPRESSED:
+
+ /* Before reading the three recent offsets from the uncompressed
+ * block header, the stream must be aligned on a 16-bit
+ * boundary. But if the stream is *already* aligned, then the
+ * next 16 bits must be discarded.
+ */
+ bitstream_ensure_bits(is, 1);
+ bitstream_align(is);
+
+ recent_offsets[0] = bitstream_read_u32(is);
+ recent_offsets[1] = bitstream_read_u32(is);
+ recent_offsets[2] = bitstream_read_u32(is);
+
+ /* Offsets of 0 are invalid. */
+ if (recent_offsets[0] == 0 || recent_offsets[1] == 0 ||
+ recent_offsets[2] == 0)
+ return -1;
+ break;
+
+ default:
+ /* Unrecognized block type. */
+ return -1;
+ }
+
+ *block_type_ret = block_type;
+ *block_size_ret = block_size;
+ return 0;
+}
+
+/* Decompress a block of LZX-compressed data. */
+static int lzx_decompress_block(const struct lzx_decompressor *d,
+ struct input_bitstream *is,
+ int block_type, u32 block_size,
+ u8 * const out_begin, u8 *out_next,
+ u32 recent_offsets[])
+{
+ u8 * const block_end = out_next + block_size;
+ u32 ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED);
+
+ do {
+ u32 mainsym;
+ u32 match_len;
+ u32 match_offset;
+ u32 offset_slot;
+ u32 num_extra_bits;
+
+ mainsym = read_mainsym(d, is);
+ if (mainsym < LZX_NUM_CHARS) {
+ /* Literal */
+ *out_next++ = mainsym;
+ continue;
+ }
+
+ /* Match */
+
+ /* Decode the length header and offset slot. */
+ mainsym -= LZX_NUM_CHARS;
+ match_len = mainsym % LZX_NUM_LEN_HEADERS;
+ offset_slot = mainsym / LZX_NUM_LEN_HEADERS;
+
+ /* If needed, read a length symbol to decode the full length. */
+ if (match_len == LZX_NUM_PRIMARY_LENS)
+ match_len += read_lensym(d, is);
+ match_len += LZX_MIN_MATCH_LEN;
+
+ if (offset_slot < LZX_NUM_RECENT_OFFSETS) {
+ /* Repeat offset */
+
+ /* Note: This isn't a real LRU queue, since using the R2
+ * offset doesn't bump the R1 offset down to R2. This
+ * quirk allows all 3 recent offsets to be handled by
+ * the same code. (For R0, the swap is a no-op.)
+ */
+ match_offset = recent_offsets[offset_slot];
+ recent_offsets[offset_slot] = recent_offsets[0];
+ recent_offsets[0] = match_offset;
+ } else {
+ /* Explicit offset */
+
+ /* Look up the number of extra bits that need to be read
+ * to decode offsets with this offset slot.
+ */
+ num_extra_bits = lzx_extra_offset_bits[offset_slot];
+
+ /* Start with the offset slot base value. */
+ match_offset = lzx_offset_slot_base[offset_slot];
+
+ /* In aligned offset blocks, the low-order 3 bits of
+ * each offset are encoded using the aligned offset
+ * code. Otherwise, all the extra bits are literal.
+ */
+
+ if ((num_extra_bits & ones_if_aligned) >= LZX_NUM_ALIGNED_OFFSET_BITS) {
+ match_offset +=
+ bitstream_read_bits(is, num_extra_bits -
+ LZX_NUM_ALIGNED_OFFSET_BITS)
+ << LZX_NUM_ALIGNED_OFFSET_BITS;
+ match_offset += read_alignedsym(d, is);
+ } else {
+ match_offset += bitstream_read_bits(is, num_extra_bits);
+ }
+
+ /* Adjust the offset. */
+ match_offset -= (LZX_NUM_RECENT_OFFSETS - 1);
+
+ /* Update the recent offsets. */
+ recent_offsets[2] = recent_offsets[1];
+ recent_offsets[1] = recent_offsets[0];
+ recent_offsets[0] = match_offset;
+ }
+
+ /* Validate the match, then copy it to the current position. */
+
+ if (match_len > (size_t)(block_end - out_next))
+ return -1;
+
+ if (match_offset > (size_t)(out_next - out_begin))
+ return -1;
+
+ out_next = lz_copy(out_next, match_len, match_offset,
+ block_end, LZX_MIN_MATCH_LEN);
+
+ } while (out_next != block_end);
+
+ return 0;
+}
+
+/*
+ * lzx_allocate_decompressor - Allocate an LZX decompressor
+ *
+ * Return the pointer to the decompressor on success, or return NULL and set
+ * errno on failure.
+ */
+struct lzx_decompressor *lzx_allocate_decompressor(void)
+{
+ return kmalloc(sizeof(struct lzx_decompressor), GFP_NOFS);
+}
+
+/*
+ * lzx_decompress - Decompress a buffer of LZX-compressed data
+ *
+ * @decompressor: A decompressor allocated with lzx_allocate_decompressor()
+ * @compressed_data: The buffer of data to decompress
+ * @compressed_size: Number of bytes of compressed data
+ * @uncompressed_data: The buffer in which to store the decompressed data
+ * @uncompressed_size: The number of bytes the data decompresses into
+ *
+ * Return 0 on success, or return -1 and set errno on failure.
+ */
+int lzx_decompress(struct lzx_decompressor *decompressor,
+ const void *compressed_data, size_t compressed_size,
+ void *uncompressed_data, size_t uncompressed_size)
+{
+ struct lzx_decompressor *d = decompressor;
+ u8 * const out_begin = uncompressed_data;
+ u8 *out_next = out_begin;
+ u8 * const out_end = out_begin + uncompressed_size;
+ struct input_bitstream is;
+ u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1};
+ int e8_status = 0;
+
+ init_input_bitstream(&is, compressed_data, compressed_size);
+
+ /* Codeword lengths begin as all 0's for delta encoding purposes. */
+ memset(d->maincode_lens, 0, LZX_MAINCODE_NUM_SYMBOLS);
+ memset(d->lencode_lens, 0, LZX_LENCODE_NUM_SYMBOLS);
+
+ /* Decompress blocks until we have all the uncompressed data. */
+
+ while (out_next != out_end) {
+ int block_type;
+ u32 block_size;
+
+ if (lzx_read_block_header(d, &is, &block_type, &block_size,
+ recent_offsets))
+ goto invalid;
+
+ if (block_size < 1 || block_size > (size_t)(out_end - out_next))
+ goto invalid;
+
+ if (block_type != LZX_BLOCKTYPE_UNCOMPRESSED) {
+
+ /* Compressed block */
+
+ if (lzx_decompress_block(d,
+ &is,
+ block_type,
+ block_size,
+ out_begin,
+ out_next,
+ recent_offsets))
+ goto invalid;
+
+ e8_status |= d->maincode_lens[0xe8];
+ out_next += block_size;
+ } else {
+ /* Uncompressed block */
+
+ out_next = bitstream_read_bytes(&is, out_next,
+ block_size);
+ if (!out_next)
+ goto invalid;
+
+ if (block_size & 1)
+ bitstream_read_byte(&is);
+
+ e8_status = 1;
+ }
+ }
+
+ /* Postprocess the data unless it cannot possibly contain 0xe8 bytes. */
+ if (e8_status)
+ lzx_postprocess(uncompressed_data, uncompressed_size);
+
+ return 0;
+
+invalid:
+ return -1;
+}
+
+/*
+ * lzx_free_decompressor - Free an LZX decompressor
+ *
+ * @decompressor: A decompressor that was allocated with
+ * lzx_allocate_decompressor(), or NULL.
+ */
+void lzx_free_decompressor(struct lzx_decompressor *decompressor)
+{
+ kfree(decompressor);
+}
diff --git a/fs/ntfs3/lib/xpress_decompress.c b/fs/ntfs3/lib/xpress_decompress.c
new file mode 100644
index 000000000..769c6d3dd
--- /dev/null
+++ b/fs/ntfs3/lib/xpress_decompress.c
@@ -0,0 +1,142 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * xpress_decompress.c - A decompressor for the XPRESS compression format
+ * (Huffman variant), which can be used in "System Compressed" files. This is
+ * based on the code from wimlib.
+ *
+ * Copyright (C) 2015 Eric Biggers
+ */
+
+#include "decompress_common.h"
+#include "lib.h"
+
+#define XPRESS_NUM_SYMBOLS 512
+#define XPRESS_MAX_CODEWORD_LEN 15
+#define XPRESS_MIN_MATCH_LEN 3
+
+/* This value is chosen for fast decompression. */
+#define XPRESS_TABLEBITS 12
+
+/* Reusable heap-allocated memory for XPRESS decompression */
+struct xpress_decompressor {
+
+ /* The Huffman decoding table */
+ u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS];
+
+ /* An array that maps symbols to codeword lengths */
+ u8 lens[XPRESS_NUM_SYMBOLS];
+
+ /* Temporary space for make_huffman_decode_table() */
+ u16 working_space[2 * (1 + XPRESS_MAX_CODEWORD_LEN) +
+ XPRESS_NUM_SYMBOLS];
+};
+
+/*
+ * xpress_allocate_decompressor - Allocate an XPRESS decompressor
+ *
+ * Return the pointer to the decompressor on success, or return NULL and set
+ * errno on failure.
+ */
+struct xpress_decompressor *xpress_allocate_decompressor(void)
+{
+ return kmalloc(sizeof(struct xpress_decompressor), GFP_NOFS);
+}
+
+/*
+ * xpress_decompress - Decompress a buffer of XPRESS-compressed data
+ *
+ * @decompressor: A decompressor that was allocated with
+ * xpress_allocate_decompressor()
+ * @compressed_data: The buffer of data to decompress
+ * @compressed_size: Number of bytes of compressed data
+ * @uncompressed_data: The buffer in which to store the decompressed data
+ * @uncompressed_size: The number of bytes the data decompresses into
+ *
+ * Return 0 on success, or return -1 and set errno on failure.
+ */
+int xpress_decompress(struct xpress_decompressor *decompressor,
+ const void *compressed_data, size_t compressed_size,
+ void *uncompressed_data, size_t uncompressed_size)
+{
+ struct xpress_decompressor *d = decompressor;
+ const u8 * const in_begin = compressed_data;
+ u8 * const out_begin = uncompressed_data;
+ u8 *out_next = out_begin;
+ u8 * const out_end = out_begin + uncompressed_size;
+ struct input_bitstream is;
+ u32 i;
+
+ /* Read the Huffman codeword lengths. */
+ if (compressed_size < XPRESS_NUM_SYMBOLS / 2)
+ goto invalid;
+ for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) {
+ d->lens[i*2 + 0] = in_begin[i] & 0xF;
+ d->lens[i*2 + 1] = in_begin[i] >> 4;
+ }
+
+ /* Build a decoding table for the Huffman code. */
+ if (make_huffman_decode_table(d->decode_table, XPRESS_NUM_SYMBOLS,
+ XPRESS_TABLEBITS, d->lens,
+ XPRESS_MAX_CODEWORD_LEN,
+ d->working_space))
+ goto invalid;
+
+ /* Decode the matches and literals. */
+
+ init_input_bitstream(&is, in_begin + XPRESS_NUM_SYMBOLS / 2,
+ compressed_size - XPRESS_NUM_SYMBOLS / 2);
+
+ while (out_next != out_end) {
+ u32 sym;
+ u32 log2_offset;
+ u32 length;
+ u32 offset;
+
+ sym = read_huffsym(&is, d->decode_table,
+ XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN);
+ if (sym < 256) {
+ /* Literal */
+ *out_next++ = sym;
+ } else {
+ /* Match */
+ length = sym & 0xf;
+ log2_offset = (sym >> 4) & 0xf;
+
+ bitstream_ensure_bits(&is, 16);
+
+ offset = ((u32)1 << log2_offset) |
+ bitstream_pop_bits(&is, log2_offset);
+
+ if (length == 0xf) {
+ length += bitstream_read_byte(&is);
+ if (length == 0xf + 0xff)
+ length = bitstream_read_u16(&is);
+ }
+ length += XPRESS_MIN_MATCH_LEN;
+
+ if (offset > (size_t)(out_next - out_begin))
+ goto invalid;
+
+ if (length > (size_t)(out_end - out_next))
+ goto invalid;
+
+ out_next = lz_copy(out_next, length, offset, out_end,
+ XPRESS_MIN_MATCH_LEN);
+ }
+ }
+ return 0;
+
+invalid:
+ return -1;
+}
+
+/*
+ * xpress_free_decompressor - Free an XPRESS decompressor
+ *
+ * @decompressor: A decompressor that was allocated with
+ * xpress_allocate_decompressor(), or NULL.
+ */
+void xpress_free_decompressor(struct xpress_decompressor *decompressor)
+{
+ kfree(decompressor);
+}
diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c
new file mode 100644
index 000000000..28f654561
--- /dev/null
+++ b/fs/ntfs3/lznt.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "debug.h"
+#include "ntfs_fs.h"
+
+// clang-format off
+/* Src buffer is zero. */
+#define LZNT_ERROR_ALL_ZEROS 1
+#define LZNT_CHUNK_SIZE 0x1000
+// clang-format on
+
+struct lznt_hash {
+ const u8 *p1;
+ const u8 *p2;
+};
+
+struct lznt {
+ const u8 *unc;
+ const u8 *unc_end;
+ const u8 *best_match;
+ size_t max_len;
+ bool std;
+
+ struct lznt_hash hash[LZNT_CHUNK_SIZE];
+};
+
+static inline size_t get_match_len(const u8 *ptr, const u8 *end, const u8 *prev,
+ size_t max_len)
+{
+ size_t len = 0;
+
+ while (ptr + len < end && ptr[len] == prev[len] && ++len < max_len)
+ ;
+ return len;
+}
+
+static size_t longest_match_std(const u8 *src, struct lznt *ctx)
+{
+ size_t hash_index;
+ size_t len1 = 0, len2 = 0;
+ const u8 **hash;
+
+ hash_index =
+ ((40543U * ((((src[0] << 4) ^ src[1]) << 4) ^ src[2])) >> 4) &
+ (LZNT_CHUNK_SIZE - 1);
+
+ hash = &(ctx->hash[hash_index].p1);
+
+ if (hash[0] >= ctx->unc && hash[0] < src && hash[0][0] == src[0] &&
+ hash[0][1] == src[1] && hash[0][2] == src[2]) {
+ len1 = 3;
+ if (ctx->max_len > 3)
+ len1 += get_match_len(src + 3, ctx->unc_end,
+ hash[0] + 3, ctx->max_len - 3);
+ }
+
+ if (hash[1] >= ctx->unc && hash[1] < src && hash[1][0] == src[0] &&
+ hash[1][1] == src[1] && hash[1][2] == src[2]) {
+ len2 = 3;
+ if (ctx->max_len > 3)
+ len2 += get_match_len(src + 3, ctx->unc_end,
+ hash[1] + 3, ctx->max_len - 3);
+ }
+
+ /* Compare two matches and select the best one. */
+ if (len1 < len2) {
+ ctx->best_match = hash[1];
+ len1 = len2;
+ } else {
+ ctx->best_match = hash[0];
+ }
+
+ hash[1] = hash[0];
+ hash[0] = src;
+ return len1;
+}
+
+static size_t longest_match_best(const u8 *src, struct lznt *ctx)
+{
+ size_t max_len;
+ const u8 *ptr;
+
+ if (ctx->unc >= src || !ctx->max_len)
+ return 0;
+
+ max_len = 0;
+ for (ptr = ctx->unc; ptr < src; ++ptr) {
+ size_t len =
+ get_match_len(src, ctx->unc_end, ptr, ctx->max_len);
+ if (len >= max_len) {
+ max_len = len;
+ ctx->best_match = ptr;
+ }
+ }
+
+ return max_len >= 3 ? max_len : 0;
+}
+
+static const size_t s_max_len[] = {
+ 0x1002, 0x802, 0x402, 0x202, 0x102, 0x82, 0x42, 0x22, 0x12,
+};
+
+static const size_t s_max_off[] = {
+ 0x10, 0x20, 0x40, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
+};
+
+static inline u16 make_pair(size_t offset, size_t len, size_t index)
+{
+ return ((offset - 1) << (12 - index)) |
+ ((len - 3) & (((1 << (12 - index)) - 1)));
+}
+
+static inline size_t parse_pair(u16 pair, size_t *offset, size_t index)
+{
+ *offset = 1 + (pair >> (12 - index));
+ return 3 + (pair & ((1 << (12 - index)) - 1));
+}
+
+/*
+ * compress_chunk
+ *
+ * Return:
+ * * 0 - Ok, @cmpr contains @cmpr_chunk_size bytes of compressed data.
+ * * 1 - Input buffer is full zero.
+ * * -2 - The compressed buffer is too small to hold the compressed data.
+ */
+static inline int compress_chunk(size_t (*match)(const u8 *, struct lznt *),
+ const u8 *unc, const u8 *unc_end, u8 *cmpr,
+ u8 *cmpr_end, size_t *cmpr_chunk_size,
+ struct lznt *ctx)
+{
+ size_t cnt = 0;
+ size_t idx = 0;
+ const u8 *up = unc;
+ u8 *cp = cmpr + 3;
+ u8 *cp2 = cmpr + 2;
+ u8 not_zero = 0;
+ /* Control byte of 8-bit values: ( 0 - means byte as is, 1 - short pair ). */
+ u8 ohdr = 0;
+ u8 *last;
+ u16 t16;
+
+ if (unc + LZNT_CHUNK_SIZE < unc_end)
+ unc_end = unc + LZNT_CHUNK_SIZE;
+
+ last = min(cmpr + LZNT_CHUNK_SIZE + sizeof(short), cmpr_end);
+
+ ctx->unc = unc;
+ ctx->unc_end = unc_end;
+ ctx->max_len = s_max_len[0];
+
+ while (up < unc_end) {
+ size_t max_len;
+
+ while (unc + s_max_off[idx] < up)
+ ctx->max_len = s_max_len[++idx];
+
+ /* Find match. */
+ max_len = up + 3 <= unc_end ? (*match)(up, ctx) : 0;
+
+ if (!max_len) {
+ if (cp >= last)
+ goto NotCompressed;
+ not_zero |= *cp++ = *up++;
+ } else if (cp + 1 >= last) {
+ goto NotCompressed;
+ } else {
+ t16 = make_pair(up - ctx->best_match, max_len, idx);
+ *cp++ = t16;
+ *cp++ = t16 >> 8;
+
+ ohdr |= 1 << cnt;
+ up += max_len;
+ }
+
+ cnt = (cnt + 1) & 7;
+ if (!cnt) {
+ *cp2 = ohdr;
+ ohdr = 0;
+ cp2 = cp;
+ cp += 1;
+ }
+ }
+
+ if (cp2 < last)
+ *cp2 = ohdr;
+ else
+ cp -= 1;
+
+ *cmpr_chunk_size = cp - cmpr;
+
+ t16 = (*cmpr_chunk_size - 3) | 0xB000;
+ cmpr[0] = t16;
+ cmpr[1] = t16 >> 8;
+
+ return not_zero ? 0 : LZNT_ERROR_ALL_ZEROS;
+
+NotCompressed:
+
+ if ((cmpr + LZNT_CHUNK_SIZE + sizeof(short)) > last)
+ return -2;
+
+ /*
+ * Copy non cmpr data.
+ * 0x3FFF == ((LZNT_CHUNK_SIZE + 2 - 3) | 0x3000)
+ */
+ cmpr[0] = 0xff;
+ cmpr[1] = 0x3f;
+
+ memcpy(cmpr + sizeof(short), unc, LZNT_CHUNK_SIZE);
+ *cmpr_chunk_size = LZNT_CHUNK_SIZE + sizeof(short);
+
+ return 0;
+}
+
+static inline ssize_t decompress_chunk(u8 *unc, u8 *unc_end, const u8 *cmpr,
+ const u8 *cmpr_end)
+{
+ u8 *up = unc;
+ u8 ch = *cmpr++;
+ size_t bit = 0;
+ size_t index = 0;
+ u16 pair;
+ size_t offset, length;
+
+ /* Do decompression until pointers are inside range. */
+ while (up < unc_end && cmpr < cmpr_end) {
+ /* Correct index */
+ while (unc + s_max_off[index] < up)
+ index += 1;
+
+ /* Check the current flag for zero. */
+ if (!(ch & (1 << bit))) {
+ /* Just copy byte. */
+ *up++ = *cmpr++;
+ goto next;
+ }
+
+ /* Check for boundary. */
+ if (cmpr + 1 >= cmpr_end)
+ return -EINVAL;
+
+ /* Read a short from little endian stream. */
+ pair = cmpr[1];
+ pair <<= 8;
+ pair |= cmpr[0];
+
+ cmpr += 2;
+
+ /* Translate packed information into offset and length. */
+ length = parse_pair(pair, &offset, index);
+
+ /* Check offset for boundary. */
+ if (unc + offset > up)
+ return -EINVAL;
+
+ /* Truncate the length if necessary. */
+ if (up + length >= unc_end)
+ length = unc_end - up;
+
+ /* Now we copy bytes. This is the heart of LZ algorithm. */
+ for (; length > 0; length--, up++)
+ *up = *(up - offset);
+
+next:
+ /* Advance flag bit value. */
+ bit = (bit + 1) & 7;
+
+ if (!bit) {
+ if (cmpr >= cmpr_end)
+ break;
+
+ ch = *cmpr++;
+ }
+ }
+
+ /* Return the size of uncompressed data. */
+ return up - unc;
+}
+
+/*
+ * get_lznt_ctx
+ * @level: 0 - Standard compression.
+ * !0 - Best compression, requires a lot of cpu.
+ */
+struct lznt *get_lznt_ctx(int level)
+{
+ struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash)
+ : sizeof(struct lznt),
+ GFP_NOFS);
+
+ if (r)
+ r->std = !level;
+ return r;
+}
+
+/*
+ * compress_lznt - Compresses @unc into @cmpr
+ *
+ * Return:
+ * * +x - Ok, @cmpr contains 'final_compressed_size' bytes of compressed data.
+ * * 0 - Input buffer is full zero.
+ */
+size_t compress_lznt(const void *unc, size_t unc_size, void *cmpr,
+ size_t cmpr_size, struct lznt *ctx)
+{
+ int err;
+ size_t (*match)(const u8 *src, struct lznt *ctx);
+ u8 *p = cmpr;
+ u8 *end = p + cmpr_size;
+ const u8 *unc_chunk = unc;
+ const u8 *unc_end = unc_chunk + unc_size;
+ bool is_zero = true;
+
+ if (ctx->std) {
+ match = &longest_match_std;
+ memset(ctx->hash, 0, sizeof(ctx->hash));
+ } else {
+ match = &longest_match_best;
+ }
+
+ /* Compression cycle. */
+ for (; unc_chunk < unc_end; unc_chunk += LZNT_CHUNK_SIZE) {
+ cmpr_size = 0;
+ err = compress_chunk(match, unc_chunk, unc_end, p, end,
+ &cmpr_size, ctx);
+ if (err < 0)
+ return unc_size;
+
+ if (is_zero && err != LZNT_ERROR_ALL_ZEROS)
+ is_zero = false;
+
+ p += cmpr_size;
+ }
+
+ if (p <= end - 2)
+ p[0] = p[1] = 0;
+
+ return is_zero ? 0 : PtrOffset(cmpr, p);
+}
+
+/*
+ * decompress_lznt - Decompress @cmpr into @unc.
+ */
+ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc,
+ size_t unc_size)
+{
+ const u8 *cmpr_chunk = cmpr;
+ const u8 *cmpr_end = cmpr_chunk + cmpr_size;
+ u8 *unc_chunk = unc;
+ u8 *unc_end = unc_chunk + unc_size;
+ u16 chunk_hdr;
+
+ if (cmpr_size < sizeof(short))
+ return -EINVAL;
+
+ /* Read chunk header. */
+ chunk_hdr = cmpr_chunk[1];
+ chunk_hdr <<= 8;
+ chunk_hdr |= cmpr_chunk[0];
+
+ /* Loop through decompressing chunks. */
+ for (;;) {
+ size_t chunk_size_saved;
+ size_t unc_use;
+ size_t cmpr_use = 3 + (chunk_hdr & (LZNT_CHUNK_SIZE - 1));
+
+ /* Check that the chunk actually fits the supplied buffer. */
+ if (cmpr_chunk + cmpr_use > cmpr_end)
+ return -EINVAL;
+
+ /* First make sure the chunk contains compressed data. */
+ if (chunk_hdr & 0x8000) {
+ /* Decompress a chunk and return if we get an error. */
+ ssize_t err =
+ decompress_chunk(unc_chunk, unc_end,
+ cmpr_chunk + sizeof(chunk_hdr),
+ cmpr_chunk + cmpr_use);
+ if (err < 0)
+ return err;
+ unc_use = err;
+ } else {
+ /* This chunk does not contain compressed data. */
+ unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end
+ ? unc_end - unc_chunk
+ : LZNT_CHUNK_SIZE;
+
+ if (cmpr_chunk + sizeof(chunk_hdr) + unc_use >
+ cmpr_end) {
+ return -EINVAL;
+ }
+
+ memcpy(unc_chunk, cmpr_chunk + sizeof(chunk_hdr),
+ unc_use);
+ }
+
+ /* Advance pointers. */
+ cmpr_chunk += cmpr_use;
+ unc_chunk += unc_use;
+
+ /* Check for the end of unc buffer. */
+ if (unc_chunk >= unc_end)
+ break;
+
+ /* Proceed the next chunk. */
+ if (cmpr_chunk > cmpr_end - 2)
+ break;
+
+ chunk_size_saved = LZNT_CHUNK_SIZE;
+
+ /* Read chunk header. */
+ chunk_hdr = cmpr_chunk[1];
+ chunk_hdr <<= 8;
+ chunk_hdr |= cmpr_chunk[0];
+
+ if (!chunk_hdr)
+ break;
+
+ /* Check the size of unc buffer. */
+ if (unc_use < chunk_size_saved) {
+ size_t t1 = chunk_size_saved - unc_use;
+ u8 *t2 = unc_chunk + t1;
+
+ /* 'Zero' memory. */
+ if (t2 >= unc_end)
+ break;
+
+ memset(unc_chunk, 0, t1);
+ unc_chunk = t2;
+ }
+ }
+
+ /* Check compression boundary. */
+ if (cmpr_chunk > cmpr_end)
+ return -EINVAL;
+
+ /*
+ * The unc size is just a difference between current
+ * pointer and original one.
+ */
+ return PtrOffset(unc, unc_chunk);
+}
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
new file mode 100644
index 000000000..a9549e730
--- /dev/null
+++ b/fs/ntfs3/namei.c
@@ -0,0 +1,395 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/nls.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+/*
+ * fill_name_de - Format NTFS_DE in @buf.
+ */
+int fill_name_de(struct ntfs_sb_info *sbi, void *buf, const struct qstr *name,
+ const struct cpu_str *uni)
+{
+ int err;
+ struct NTFS_DE *e = buf;
+ u16 data_size;
+ struct ATTR_FILE_NAME *fname = (struct ATTR_FILE_NAME *)(e + 1);
+
+#ifndef CONFIG_NTFS3_64BIT_CLUSTER
+ e->ref.high = fname->home.high = 0;
+#endif
+ if (uni) {
+#ifdef __BIG_ENDIAN
+ int ulen = uni->len;
+ __le16 *uname = fname->name;
+ const u16 *name_cpu = uni->name;
+
+ while (ulen--)
+ *uname++ = cpu_to_le16(*name_cpu++);
+#else
+ memcpy(fname->name, uni->name, uni->len * sizeof(u16));
+#endif
+ fname->name_len = uni->len;
+
+ } else {
+ /* Convert input string to unicode. */
+ err = ntfs_nls_to_utf16(sbi, name->name, name->len,
+ (struct cpu_str *)&fname->name_len,
+ NTFS_NAME_LEN, UTF16_LITTLE_ENDIAN);
+ if (err < 0)
+ return err;
+ }
+
+ fname->type = FILE_NAME_POSIX;
+ data_size = fname_full_size(fname);
+
+ e->size = cpu_to_le16(ALIGN(data_size, 8) + sizeof(struct NTFS_DE));
+ e->key_size = cpu_to_le16(data_size);
+ e->flags = 0;
+ e->res = 0;
+
+ return 0;
+}
+
+/*
+ * ntfs_lookup - inode_operations::lookup
+ */
+static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry,
+ u32 flags)
+{
+ struct ntfs_inode *ni = ntfs_i(dir);
+ struct cpu_str *uni = __getname();
+ struct inode *inode;
+ int err;
+
+ if (!uni)
+ inode = ERR_PTR(-ENOMEM);
+ else {
+ err = ntfs_nls_to_utf16(ni->mi.sbi, dentry->d_name.name,
+ dentry->d_name.len, uni, NTFS_NAME_LEN,
+ UTF16_HOST_ENDIAN);
+ if (err < 0)
+ inode = ERR_PTR(err);
+ else {
+ ni_lock(ni);
+ inode = dir_search_u(dir, uni, NULL);
+ ni_unlock(ni);
+ }
+ __putname(uni);
+ }
+
+ /*
+ * Check for a null pointer
+ * If the MFT record of ntfs inode is not a base record, inode->i_op can be NULL.
+ * This causes null pointer dereference in d_splice_alias().
+ */
+ if (!IS_ERR_OR_NULL(inode) && !inode->i_op) {
+ iput(inode);
+ inode = ERR_PTR(-EINVAL);
+ }
+
+ return d_splice_alias(inode, dentry);
+}
+
+/*
+ * ntfs_create - inode_operations::create
+ */
+static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
+{
+ struct inode *inode;
+
+ inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode,
+ 0, NULL, 0, NULL);
+
+ return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+}
+
+/*
+ * ntfs_mknod
+ *
+ * inode_operations::mknod
+ */
+static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t rdev)
+{
+ struct inode *inode;
+
+ inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev,
+ NULL, 0, NULL);
+
+ return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+}
+
+/*
+ * ntfs_link - inode_operations::link
+ */
+static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de)
+{
+ int err;
+ struct inode *inode = d_inode(ode);
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ if (S_ISDIR(inode->i_mode))
+ return -EPERM;
+
+ if (inode->i_nlink >= NTFS_LINK_MAX)
+ return -EMLINK;
+
+ ni_lock_dir(ntfs_i(dir));
+ if (inode != dir)
+ ni_lock(ni);
+
+ inc_nlink(inode);
+ ihold(inode);
+
+ err = ntfs_link_inode(inode, de);
+
+ if (!err) {
+ dir->i_ctime = dir->i_mtime = inode->i_ctime =
+ current_time(dir);
+ mark_inode_dirty(inode);
+ mark_inode_dirty(dir);
+ d_instantiate(de, inode);
+ } else {
+ drop_nlink(inode);
+ iput(inode);
+ }
+
+ if (inode != dir)
+ ni_unlock(ni);
+ ni_unlock(ntfs_i(dir));
+
+ return err;
+}
+
+/*
+ * ntfs_unlink - inode_operations::unlink
+ */
+static int ntfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ntfs_inode *ni = ntfs_i(dir);
+ int err;
+
+ ni_lock_dir(ni);
+
+ err = ntfs_unlink_inode(dir, dentry);
+
+ ni_unlock(ni);
+
+ return err;
+}
+
+/*
+ * ntfs_symlink - inode_operations::symlink
+ */
+static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *symname)
+{
+ u32 size = strlen(symname);
+ struct inode *inode;
+
+ inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777,
+ 0, symname, size, NULL);
+
+ return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+}
+
+/*
+ * ntfs_mkdir- inode_operations::mkdir
+ */
+static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode,
+ 0, NULL, 0, NULL);
+
+ return IS_ERR(inode) ? PTR_ERR(inode) : 0;
+}
+
+/*
+ * ntfs_rmdir - inode_operations::rmdir
+ */
+static int ntfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+ struct ntfs_inode *ni = ntfs_i(dir);
+ int err;
+
+ ni_lock_dir(ni);
+
+ err = ntfs_unlink_inode(dir, dentry);
+
+ ni_unlock(ni);
+
+ return err;
+}
+
+/*
+ * ntfs_rename - inode_operations::rename
+ */
+static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, struct inode *new_dir,
+ struct dentry *new_dentry, u32 flags)
+{
+ int err;
+ struct super_block *sb = dir->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct ntfs_inode *dir_ni = ntfs_i(dir);
+ struct ntfs_inode *new_dir_ni = ntfs_i(new_dir);
+ struct inode *inode = d_inode(dentry);
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct inode *new_inode = d_inode(new_dentry);
+ struct NTFS_DE *de, *new_de;
+ bool is_same, is_bad;
+ /*
+ * de - memory of PATH_MAX bytes:
+ * [0-1024) - original name (dentry->d_name)
+ * [1024-2048) - paired to original name, usually DOS variant of dentry->d_name
+ * [2048-3072) - new name (new_dentry->d_name)
+ */
+ static_assert(SIZEOF_ATTRIBUTE_FILENAME_MAX + SIZEOF_RESIDENT < 1024);
+ static_assert(SIZEOF_ATTRIBUTE_FILENAME_MAX + sizeof(struct NTFS_DE) <
+ 1024);
+ static_assert(PATH_MAX >= 4 * 1024);
+
+ if (flags & ~RENAME_NOREPLACE)
+ return -EINVAL;
+
+ is_same = dentry->d_name.len == new_dentry->d_name.len &&
+ !memcmp(dentry->d_name.name, new_dentry->d_name.name,
+ dentry->d_name.len);
+
+ if (is_same && dir == new_dir) {
+ /* Nothing to do. */
+ return 0;
+ }
+
+ if (ntfs_is_meta_file(sbi, inode->i_ino)) {
+ /* Should we print an error? */
+ return -EINVAL;
+ }
+
+ if (new_inode) {
+ /* Target name exists. Unlink it. */
+ dget(new_dentry);
+ ni_lock_dir(new_dir_ni);
+ err = ntfs_unlink_inode(new_dir, new_dentry);
+ ni_unlock(new_dir_ni);
+ dput(new_dentry);
+ if (err)
+ return err;
+ }
+
+ /* Allocate PATH_MAX bytes. */
+ de = __getname();
+ if (!de)
+ return -ENOMEM;
+
+ /* Translate dentry->d_name into unicode form. */
+ err = fill_name_de(sbi, de, &dentry->d_name, NULL);
+ if (err < 0)
+ goto out;
+
+ if (is_same) {
+ /* Reuse 'de'. */
+ new_de = de;
+ } else {
+ /* Translate new_dentry->d_name into unicode form. */
+ new_de = Add2Ptr(de, 2048);
+ err = fill_name_de(sbi, new_de, &new_dentry->d_name, NULL);
+ if (err < 0)
+ goto out;
+ }
+
+ ni_lock_dir(dir_ni);
+ ni_lock(ni);
+
+ is_bad = false;
+ err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
+ if (is_bad) {
+ /* Restore after failed rename failed too. */
+ _ntfs_bad_inode(inode);
+ } else if (!err) {
+ inode->i_ctime = dir->i_ctime = dir->i_mtime =
+ current_time(dir);
+ mark_inode_dirty(inode);
+ mark_inode_dirty(dir);
+ if (dir != new_dir) {
+ new_dir->i_mtime = new_dir->i_ctime = dir->i_ctime;
+ mark_inode_dirty(new_dir);
+ }
+
+ if (IS_DIRSYNC(dir))
+ ntfs_sync_inode(dir);
+
+ if (IS_DIRSYNC(new_dir))
+ ntfs_sync_inode(inode);
+ }
+
+ ni_unlock(ni);
+ ni_unlock(dir_ni);
+out:
+ __putname(de);
+ return err;
+}
+
+struct dentry *ntfs3_get_parent(struct dentry *child)
+{
+ struct inode *inode = d_inode(child);
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ struct ATTR_LIST_ENTRY *le = NULL;
+ struct ATTRIB *attr = NULL;
+ struct ATTR_FILE_NAME *fname;
+
+ while ((attr = ni_find_attr(ni, attr, &le, ATTR_NAME, NULL, 0, NULL,
+ NULL))) {
+ fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME);
+ if (!fname)
+ continue;
+
+ return d_obtain_alias(
+ ntfs_iget5(inode->i_sb, &fname->home, NULL));
+ }
+
+ return ERR_PTR(-ENOENT);
+}
+
+// clang-format off
+const struct inode_operations ntfs_dir_inode_operations = {
+ .lookup = ntfs_lookup,
+ .create = ntfs_create,
+ .link = ntfs_link,
+ .unlink = ntfs_unlink,
+ .symlink = ntfs_symlink,
+ .mkdir = ntfs_mkdir,
+ .rmdir = ntfs_rmdir,
+ .mknod = ntfs_mknod,
+ .rename = ntfs_rename,
+ .permission = ntfs_permission,
+ .get_acl = ntfs_get_acl,
+ .set_acl = ntfs_set_acl,
+ .setattr = ntfs3_setattr,
+ .getattr = ntfs_getattr,
+ .listxattr = ntfs_listxattr,
+ .fiemap = ntfs_fiemap,
+};
+
+const struct inode_operations ntfs_special_inode_operations = {
+ .setattr = ntfs3_setattr,
+ .getattr = ntfs_getattr,
+ .listxattr = ntfs_listxattr,
+ .get_acl = ntfs_get_acl,
+ .set_acl = ntfs_set_acl,
+};
+// clang-format on
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
new file mode 100644
index 000000000..0f38d5581
--- /dev/null
+++ b/fs/ntfs3/ntfs.h
@@ -0,0 +1,1221 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ * on-disk ntfs structs
+ */
+
+// clang-format off
+#ifndef _LINUX_NTFS3_NTFS_H
+#define _LINUX_NTFS3_NTFS_H
+
+#include <linux/blkdev.h>
+#include <linux/build_bug.h>
+#include <linux/kernel.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "debug.h"
+
+/* TODO: Check 4K MFT record and 512 bytes cluster. */
+
+/* Check each run for marked clusters. */
+#define NTFS3_CHECK_FREE_CLST
+
+#define NTFS_NAME_LEN 255
+
+/*
+ * ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff.
+ * xfstest generic/041 creates 3003 hardlinks.
+ */
+#define NTFS_LINK_MAX 4000
+
+/*
+ * Activate to use 64 bit clusters instead of 32 bits in ntfs.sys.
+ * Logical and virtual cluster number if needed, may be
+ * redefined to use 64 bit value.
+ */
+//#define CONFIG_NTFS3_64BIT_CLUSTER
+
+#define NTFS_LZNT_MAX_CLUSTER 4096
+#define NTFS_LZNT_CUNIT 4
+#define NTFS_LZNT_CLUSTERS (1u<<NTFS_LZNT_CUNIT)
+
+struct GUID {
+ __le32 Data1;
+ __le16 Data2;
+ __le16 Data3;
+ u8 Data4[8];
+};
+
+/*
+ * This struct repeats layout of ATTR_FILE_NAME
+ * at offset 0x40.
+ * It used to store global constants NAME_MFT/NAME_MIRROR...
+ * most constant names are shorter than 10.
+ */
+struct cpu_str {
+ u8 len;
+ u8 unused;
+ u16 name[10];
+};
+
+struct le_str {
+ u8 len;
+ u8 unused;
+ __le16 name[];
+};
+
+static_assert(SECTOR_SHIFT == 9);
+
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+typedef u64 CLST;
+static_assert(sizeof(size_t) == 8);
+#else
+typedef u32 CLST;
+#endif
+
+#define SPARSE_LCN64 ((u64)-1)
+#define SPARSE_LCN ((CLST)-1)
+#define RESIDENT_LCN ((CLST)-2)
+#define COMPRESSED_LCN ((CLST)-3)
+
+#define COMPRESSION_UNIT 4
+#define COMPRESS_MAX_CLUSTER 0x1000
+#define MFT_INCREASE_CHUNK 1024
+
+enum RECORD_NUM {
+ MFT_REC_MFT = 0,
+ MFT_REC_MIRR = 1,
+ MFT_REC_LOG = 2,
+ MFT_REC_VOL = 3,
+ MFT_REC_ATTR = 4,
+ MFT_REC_ROOT = 5,
+ MFT_REC_BITMAP = 6,
+ MFT_REC_BOOT = 7,
+ MFT_REC_BADCLUST = 8,
+ //MFT_REC_QUOTA = 9,
+ MFT_REC_SECURE = 9, // NTFS 3.0
+ MFT_REC_UPCASE = 10,
+ MFT_REC_EXTEND = 11, // NTFS 3.0
+ MFT_REC_RESERVED = 11,
+ MFT_REC_FREE = 16,
+ MFT_REC_USER = 24,
+};
+
+enum ATTR_TYPE {
+ ATTR_ZERO = cpu_to_le32(0x00),
+ ATTR_STD = cpu_to_le32(0x10),
+ ATTR_LIST = cpu_to_le32(0x20),
+ ATTR_NAME = cpu_to_le32(0x30),
+ // ATTR_VOLUME_VERSION on Nt4
+ ATTR_ID = cpu_to_le32(0x40),
+ ATTR_SECURE = cpu_to_le32(0x50),
+ ATTR_LABEL = cpu_to_le32(0x60),
+ ATTR_VOL_INFO = cpu_to_le32(0x70),
+ ATTR_DATA = cpu_to_le32(0x80),
+ ATTR_ROOT = cpu_to_le32(0x90),
+ ATTR_ALLOC = cpu_to_le32(0xA0),
+ ATTR_BITMAP = cpu_to_le32(0xB0),
+ // ATTR_SYMLINK on Nt4
+ ATTR_REPARSE = cpu_to_le32(0xC0),
+ ATTR_EA_INFO = cpu_to_le32(0xD0),
+ ATTR_EA = cpu_to_le32(0xE0),
+ ATTR_PROPERTYSET = cpu_to_le32(0xF0),
+ ATTR_LOGGED_UTILITY_STREAM = cpu_to_le32(0x100),
+ ATTR_END = cpu_to_le32(0xFFFFFFFF)
+};
+
+static_assert(sizeof(enum ATTR_TYPE) == 4);
+
+enum FILE_ATTRIBUTE {
+ FILE_ATTRIBUTE_READONLY = cpu_to_le32(0x00000001),
+ FILE_ATTRIBUTE_HIDDEN = cpu_to_le32(0x00000002),
+ FILE_ATTRIBUTE_SYSTEM = cpu_to_le32(0x00000004),
+ FILE_ATTRIBUTE_ARCHIVE = cpu_to_le32(0x00000020),
+ FILE_ATTRIBUTE_DEVICE = cpu_to_le32(0x00000040),
+ FILE_ATTRIBUTE_TEMPORARY = cpu_to_le32(0x00000100),
+ FILE_ATTRIBUTE_SPARSE_FILE = cpu_to_le32(0x00000200),
+ FILE_ATTRIBUTE_REPARSE_POINT = cpu_to_le32(0x00000400),
+ FILE_ATTRIBUTE_COMPRESSED = cpu_to_le32(0x00000800),
+ FILE_ATTRIBUTE_OFFLINE = cpu_to_le32(0x00001000),
+ FILE_ATTRIBUTE_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000),
+ FILE_ATTRIBUTE_ENCRYPTED = cpu_to_le32(0x00004000),
+ FILE_ATTRIBUTE_VALID_FLAGS = cpu_to_le32(0x00007fb7),
+ FILE_ATTRIBUTE_DIRECTORY = cpu_to_le32(0x10000000),
+};
+
+static_assert(sizeof(enum FILE_ATTRIBUTE) == 4);
+
+extern const struct cpu_str NAME_MFT;
+extern const struct cpu_str NAME_MIRROR;
+extern const struct cpu_str NAME_LOGFILE;
+extern const struct cpu_str NAME_VOLUME;
+extern const struct cpu_str NAME_ATTRDEF;
+extern const struct cpu_str NAME_ROOT;
+extern const struct cpu_str NAME_BITMAP;
+extern const struct cpu_str NAME_BOOT;
+extern const struct cpu_str NAME_BADCLUS;
+extern const struct cpu_str NAME_QUOTA;
+extern const struct cpu_str NAME_SECURE;
+extern const struct cpu_str NAME_UPCASE;
+extern const struct cpu_str NAME_EXTEND;
+extern const struct cpu_str NAME_OBJID;
+extern const struct cpu_str NAME_REPARSE;
+extern const struct cpu_str NAME_USNJRNL;
+
+extern const __le16 I30_NAME[4];
+extern const __le16 SII_NAME[4];
+extern const __le16 SDH_NAME[4];
+extern const __le16 SO_NAME[2];
+extern const __le16 SQ_NAME[2];
+extern const __le16 SR_NAME[2];
+
+extern const __le16 BAD_NAME[4];
+extern const __le16 SDS_NAME[4];
+extern const __le16 WOF_NAME[17]; /* WofCompressedData */
+
+/* MFT record number structure. */
+struct MFT_REF {
+ __le32 low; // The low part of the number.
+ __le16 high; // The high part of the number.
+ __le16 seq; // The sequence number of MFT record.
+};
+
+static_assert(sizeof(__le64) == sizeof(struct MFT_REF));
+
+static inline CLST ino_get(const struct MFT_REF *ref)
+{
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+ return le32_to_cpu(ref->low) | ((u64)le16_to_cpu(ref->high) << 32);
+#else
+ return le32_to_cpu(ref->low);
+#endif
+}
+
+struct NTFS_BOOT {
+ u8 jump_code[3]; // 0x00: Jump to boot code.
+ u8 system_id[8]; // 0x03: System ID, equals "NTFS "
+
+ // NOTE: This member is not aligned(!)
+ // bytes_per_sector[0] must be 0.
+ // bytes_per_sector[1] must be multiplied by 256.
+ u8 bytes_per_sector[2]; // 0x0B: Bytes per sector.
+
+ u8 sectors_per_clusters;// 0x0D: Sectors per cluster.
+ u8 unused1[7];
+ u8 media_type; // 0x15: Media type (0xF8 - harddisk)
+ u8 unused2[2];
+ __le16 sct_per_track; // 0x18: number of sectors per track.
+ __le16 heads; // 0x1A: number of heads per cylinder.
+ __le32 hidden_sectors; // 0x1C: number of 'hidden' sectors.
+ u8 unused3[4];
+ u8 bios_drive_num; // 0x24: BIOS drive number =0x80.
+ u8 unused4;
+ u8 signature_ex; // 0x26: Extended BOOT signature =0x80.
+ u8 unused5;
+ __le64 sectors_per_volume;// 0x28: Size of volume in sectors.
+ __le64 mft_clst; // 0x30: First cluster of $MFT
+ __le64 mft2_clst; // 0x38: First cluster of $MFTMirr
+ s8 record_size; // 0x40: Size of MFT record in clusters(sectors).
+ u8 unused6[3];
+ s8 index_size; // 0x44: Size of INDX record in clusters(sectors).
+ u8 unused7[3];
+ __le64 serial_num; // 0x48: Volume serial number
+ __le32 check_sum; // 0x50: Simple additive checksum of all
+ // of the u32's which precede the 'check_sum'.
+
+ u8 boot_code[0x200 - 0x50 - 2 - 4]; // 0x54:
+ u8 boot_magic[2]; // 0x1FE: Boot signature =0x55 + 0xAA
+};
+
+static_assert(sizeof(struct NTFS_BOOT) == 0x200);
+
+enum NTFS_SIGNATURE {
+ NTFS_FILE_SIGNATURE = cpu_to_le32(0x454C4946), // 'FILE'
+ NTFS_INDX_SIGNATURE = cpu_to_le32(0x58444E49), // 'INDX'
+ NTFS_CHKD_SIGNATURE = cpu_to_le32(0x444B4843), // 'CHKD'
+ NTFS_RSTR_SIGNATURE = cpu_to_le32(0x52545352), // 'RSTR'
+ NTFS_RCRD_SIGNATURE = cpu_to_le32(0x44524352), // 'RCRD'
+ NTFS_BAAD_SIGNATURE = cpu_to_le32(0x44414142), // 'BAAD'
+ NTFS_HOLE_SIGNATURE = cpu_to_le32(0x454C4F48), // 'HOLE'
+ NTFS_FFFF_SIGNATURE = cpu_to_le32(0xffffffff),
+};
+
+static_assert(sizeof(enum NTFS_SIGNATURE) == 4);
+
+/* MFT Record header structure. */
+struct NTFS_RECORD_HEADER {
+ /* Record magic number, equals 'FILE'/'INDX'/'RSTR'/'RCRD'. */
+ enum NTFS_SIGNATURE sign; // 0x00:
+ __le16 fix_off; // 0x04:
+ __le16 fix_num; // 0x06:
+ __le64 lsn; // 0x08: Log file sequence number,
+};
+
+static_assert(sizeof(struct NTFS_RECORD_HEADER) == 0x10);
+
+static inline int is_baad(const struct NTFS_RECORD_HEADER *hdr)
+{
+ return hdr->sign == NTFS_BAAD_SIGNATURE;
+}
+
+/* Possible bits in struct MFT_REC.flags. */
+enum RECORD_FLAG {
+ RECORD_FLAG_IN_USE = cpu_to_le16(0x0001),
+ RECORD_FLAG_DIR = cpu_to_le16(0x0002),
+ RECORD_FLAG_SYSTEM = cpu_to_le16(0x0004),
+ RECORD_FLAG_UNKNOWN = cpu_to_le16(0x0008),
+};
+
+/* MFT Record structure. */
+struct MFT_REC {
+ struct NTFS_RECORD_HEADER rhdr; // 'FILE'
+
+ __le16 seq; // 0x10: Sequence number for this record.
+ __le16 hard_links; // 0x12: The number of hard links to record.
+ __le16 attr_off; // 0x14: Offset to attributes.
+ __le16 flags; // 0x16: See RECORD_FLAG.
+ __le32 used; // 0x18: The size of used part.
+ __le32 total; // 0x1C: Total record size.
+
+ struct MFT_REF parent_ref; // 0x20: Parent MFT record.
+ __le16 next_attr_id; // 0x28: The next attribute Id.
+
+ __le16 res; // 0x2A: High part of MFT record?
+ __le32 mft_record; // 0x2C: Current MFT record number.
+ __le16 fixups[]; // 0x30:
+};
+
+#define MFTRECORD_FIXUP_OFFSET_1 offsetof(struct MFT_REC, res)
+#define MFTRECORD_FIXUP_OFFSET_3 offsetof(struct MFT_REC, fixups)
+
+static_assert(MFTRECORD_FIXUP_OFFSET_1 == 0x2A);
+static_assert(MFTRECORD_FIXUP_OFFSET_3 == 0x30);
+
+static inline bool is_rec_base(const struct MFT_REC *rec)
+{
+ const struct MFT_REF *r = &rec->parent_ref;
+
+ return !r->low && !r->high && !r->seq;
+}
+
+static inline bool is_mft_rec5(const struct MFT_REC *rec)
+{
+ return le16_to_cpu(rec->rhdr.fix_off) >=
+ offsetof(struct MFT_REC, fixups);
+}
+
+static inline bool is_rec_inuse(const struct MFT_REC *rec)
+{
+ return rec->flags & RECORD_FLAG_IN_USE;
+}
+
+static inline bool clear_rec_inuse(struct MFT_REC *rec)
+{
+ return rec->flags &= ~RECORD_FLAG_IN_USE;
+}
+
+/* Possible values of ATTR_RESIDENT.flags */
+#define RESIDENT_FLAG_INDEXED 0x01
+
+struct ATTR_RESIDENT {
+ __le32 data_size; // 0x10: The size of data.
+ __le16 data_off; // 0x14: Offset to data.
+ u8 flags; // 0x16: Resident flags ( 1 - indexed ).
+ u8 res; // 0x17:
+}; // sizeof() = 0x18
+
+struct ATTR_NONRESIDENT {
+ __le64 svcn; // 0x10: Starting VCN of this segment.
+ __le64 evcn; // 0x18: End VCN of this segment.
+ __le16 run_off; // 0x20: Offset to packed runs.
+ // Unit of Compression size for this stream, expressed
+ // as a log of the cluster size.
+ //
+ // 0 means file is not compressed
+ // 1, 2, 3, and 4 are potentially legal values if the
+ // stream is compressed, however the implementation
+ // may only choose to use 4, or possibly 3. Note
+ // that 4 means cluster size time 16. If convenient
+ // the implementation may wish to accept a
+ // reasonable range of legal values here (1-5?),
+ // even if the implementation only generates
+ // a smaller set of values itself.
+ u8 c_unit; // 0x22:
+ u8 res1[5]; // 0x23:
+ __le64 alloc_size; // 0x28: The allocated size of attribute in bytes.
+ // (multiple of cluster size)
+ __le64 data_size; // 0x30: The size of attribute in bytes <= alloc_size.
+ __le64 valid_size; // 0x38: The size of valid part in bytes <= data_size.
+ __le64 total_size; // 0x40: The sum of the allocated clusters for a file.
+ // (present only for the first segment (0 == vcn)
+ // of compressed attribute)
+
+}; // sizeof()=0x40 or 0x48 (if compressed)
+
+/* Possible values of ATTRIB.flags: */
+#define ATTR_FLAG_COMPRESSED cpu_to_le16(0x0001)
+#define ATTR_FLAG_COMPRESSED_MASK cpu_to_le16(0x00FF)
+#define ATTR_FLAG_ENCRYPTED cpu_to_le16(0x4000)
+#define ATTR_FLAG_SPARSED cpu_to_le16(0x8000)
+
+struct ATTRIB {
+ enum ATTR_TYPE type; // 0x00: The type of this attribute.
+ __le32 size; // 0x04: The size of this attribute.
+ u8 non_res; // 0x08: Is this attribute non-resident?
+ u8 name_len; // 0x09: This attribute name length.
+ __le16 name_off; // 0x0A: Offset to the attribute name.
+ __le16 flags; // 0x0C: See ATTR_FLAG_XXX.
+ __le16 id; // 0x0E: Unique id (per record).
+
+ union {
+ struct ATTR_RESIDENT res; // 0x10
+ struct ATTR_NONRESIDENT nres; // 0x10
+ };
+};
+
+/* Define attribute sizes. */
+#define SIZEOF_RESIDENT 0x18
+#define SIZEOF_NONRESIDENT_EX 0x48
+#define SIZEOF_NONRESIDENT 0x40
+
+#define SIZEOF_RESIDENT_LE cpu_to_le16(0x18)
+#define SIZEOF_NONRESIDENT_EX_LE cpu_to_le16(0x48)
+#define SIZEOF_NONRESIDENT_LE cpu_to_le16(0x40)
+
+static inline u64 attr_ondisk_size(const struct ATTRIB *attr)
+{
+ return attr->non_res ? ((attr->flags &
+ (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ?
+ le64_to_cpu(attr->nres.total_size) :
+ le64_to_cpu(attr->nres.alloc_size))
+ : ALIGN(le32_to_cpu(attr->res.data_size), 8);
+}
+
+static inline u64 attr_size(const struct ATTRIB *attr)
+{
+ return attr->non_res ? le64_to_cpu(attr->nres.data_size) :
+ le32_to_cpu(attr->res.data_size);
+}
+
+static inline bool is_attr_encrypted(const struct ATTRIB *attr)
+{
+ return attr->flags & ATTR_FLAG_ENCRYPTED;
+}
+
+static inline bool is_attr_sparsed(const struct ATTRIB *attr)
+{
+ return attr->flags & ATTR_FLAG_SPARSED;
+}
+
+static inline bool is_attr_compressed(const struct ATTRIB *attr)
+{
+ return attr->flags & ATTR_FLAG_COMPRESSED;
+}
+
+static inline bool is_attr_ext(const struct ATTRIB *attr)
+{
+ return attr->flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED);
+}
+
+static inline bool is_attr_indexed(const struct ATTRIB *attr)
+{
+ return !attr->non_res && (attr->res.flags & RESIDENT_FLAG_INDEXED);
+}
+
+static inline __le16 const *attr_name(const struct ATTRIB *attr)
+{
+ return Add2Ptr(attr, le16_to_cpu(attr->name_off));
+}
+
+static inline u64 attr_svcn(const struct ATTRIB *attr)
+{
+ return attr->non_res ? le64_to_cpu(attr->nres.svcn) : 0;
+}
+
+static_assert(sizeof(struct ATTRIB) == 0x48);
+static_assert(sizeof(((struct ATTRIB *)NULL)->res) == 0x08);
+static_assert(sizeof(((struct ATTRIB *)NULL)->nres) == 0x38);
+
+static inline void *resident_data_ex(const struct ATTRIB *attr, u32 datasize)
+{
+ u32 asize, rsize;
+ u16 off;
+
+ if (attr->non_res)
+ return NULL;
+
+ asize = le32_to_cpu(attr->size);
+ off = le16_to_cpu(attr->res.data_off);
+
+ if (asize < datasize + off)
+ return NULL;
+
+ rsize = le32_to_cpu(attr->res.data_size);
+ if (rsize < datasize)
+ return NULL;
+
+ return Add2Ptr(attr, off);
+}
+
+static inline void *resident_data(const struct ATTRIB *attr)
+{
+ return Add2Ptr(attr, le16_to_cpu(attr->res.data_off));
+}
+
+static inline void *attr_run(const struct ATTRIB *attr)
+{
+ return Add2Ptr(attr, le16_to_cpu(attr->nres.run_off));
+}
+
+/* Standard information attribute (0x10). */
+struct ATTR_STD_INFO {
+ __le64 cr_time; // 0x00: File creation file.
+ __le64 m_time; // 0x08: File modification time.
+ __le64 c_time; // 0x10: Last time any attribute was modified.
+ __le64 a_time; // 0x18: File last access time.
+ enum FILE_ATTRIBUTE fa; // 0x20: Standard DOS attributes & more.
+ __le32 max_ver_num; // 0x24: Maximum Number of Versions.
+ __le32 ver_num; // 0x28: Version Number.
+ __le32 class_id; // 0x2C: Class Id from bidirectional Class Id index.
+};
+
+static_assert(sizeof(struct ATTR_STD_INFO) == 0x30);
+
+#define SECURITY_ID_INVALID 0x00000000
+#define SECURITY_ID_FIRST 0x00000100
+
+struct ATTR_STD_INFO5 {
+ __le64 cr_time; // 0x00: File creation file.
+ __le64 m_time; // 0x08: File modification time.
+ __le64 c_time; // 0x10: Last time any attribute was modified.
+ __le64 a_time; // 0x18: File last access time.
+ enum FILE_ATTRIBUTE fa; // 0x20: Standard DOS attributes & more.
+ __le32 max_ver_num; // 0x24: Maximum Number of Versions.
+ __le32 ver_num; // 0x28: Version Number.
+ __le32 class_id; // 0x2C: Class Id from bidirectional Class Id index.
+
+ __le32 owner_id; // 0x30: Owner Id of the user owning the file.
+ __le32 security_id; // 0x34: The Security Id is a key in the $SII Index and $SDS.
+ __le64 quota_charge; // 0x38:
+ __le64 usn; // 0x40: Last Update Sequence Number of the file. This is a direct
+ // index into the file $UsnJrnl. If zero, the USN Journal is
+ // disabled.
+};
+
+static_assert(sizeof(struct ATTR_STD_INFO5) == 0x48);
+
+/* Attribute list entry structure (0x20) */
+struct ATTR_LIST_ENTRY {
+ enum ATTR_TYPE type; // 0x00: The type of attribute.
+ __le16 size; // 0x04: The size of this record.
+ u8 name_len; // 0x06: The length of attribute name.
+ u8 name_off; // 0x07: The offset to attribute name.
+ __le64 vcn; // 0x08: Starting VCN of this attribute.
+ struct MFT_REF ref; // 0x10: MFT record number with attribute.
+ __le16 id; // 0x18: struct ATTRIB ID.
+ __le16 name[3]; // 0x1A: Just to align. To get real name can use bNameOffset.
+
+}; // sizeof(0x20)
+
+static_assert(sizeof(struct ATTR_LIST_ENTRY) == 0x20);
+
+static inline u32 le_size(u8 name_len)
+{
+ return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) +
+ name_len * sizeof(short), 8);
+}
+
+/* Returns 0 if 'attr' has the same type and name. */
+static inline int le_cmp(const struct ATTR_LIST_ENTRY *le,
+ const struct ATTRIB *attr)
+{
+ return le->type != attr->type || le->name_len != attr->name_len ||
+ (!le->name_len &&
+ memcmp(Add2Ptr(le, le->name_off),
+ Add2Ptr(attr, le16_to_cpu(attr->name_off)),
+ le->name_len * sizeof(short)));
+}
+
+static inline __le16 const *le_name(const struct ATTR_LIST_ENTRY *le)
+{
+ return Add2Ptr(le, le->name_off);
+}
+
+/* File name types (the field type in struct ATTR_FILE_NAME). */
+#define FILE_NAME_POSIX 0
+#define FILE_NAME_UNICODE 1
+#define FILE_NAME_DOS 2
+#define FILE_NAME_UNICODE_AND_DOS (FILE_NAME_DOS | FILE_NAME_UNICODE)
+
+/* Filename attribute structure (0x30). */
+struct NTFS_DUP_INFO {
+ __le64 cr_time; // 0x00: File creation file.
+ __le64 m_time; // 0x08: File modification time.
+ __le64 c_time; // 0x10: Last time any attribute was modified.
+ __le64 a_time; // 0x18: File last access time.
+ __le64 alloc_size; // 0x20: Data attribute allocated size, multiple of cluster size.
+ __le64 data_size; // 0x28: Data attribute size <= Dataalloc_size.
+ enum FILE_ATTRIBUTE fa; // 0x30: Standard DOS attributes & more.
+ __le16 ea_size; // 0x34: Packed EAs.
+ __le16 reparse; // 0x36: Used by Reparse.
+
+}; // 0x38
+
+struct ATTR_FILE_NAME {
+ struct MFT_REF home; // 0x00: MFT record for directory.
+ struct NTFS_DUP_INFO dup;// 0x08:
+ u8 name_len; // 0x40: File name length in words.
+ u8 type; // 0x41: File name type.
+ __le16 name[]; // 0x42: File name.
+};
+
+static_assert(sizeof(((struct ATTR_FILE_NAME *)NULL)->dup) == 0x38);
+static_assert(offsetof(struct ATTR_FILE_NAME, name) == 0x42);
+#define SIZEOF_ATTRIBUTE_FILENAME 0x44
+#define SIZEOF_ATTRIBUTE_FILENAME_MAX (0x42 + 255 * 2)
+
+static inline struct ATTRIB *attr_from_name(struct ATTR_FILE_NAME *fname)
+{
+ return (struct ATTRIB *)((char *)fname - SIZEOF_RESIDENT);
+}
+
+static inline u16 fname_full_size(const struct ATTR_FILE_NAME *fname)
+{
+ /* Don't return struct_size(fname, name, fname->name_len); */
+ return offsetof(struct ATTR_FILE_NAME, name) +
+ fname->name_len * sizeof(short);
+}
+
+static inline u8 paired_name(u8 type)
+{
+ if (type == FILE_NAME_UNICODE)
+ return FILE_NAME_DOS;
+ if (type == FILE_NAME_DOS)
+ return FILE_NAME_UNICODE;
+ return FILE_NAME_POSIX;
+}
+
+/* Index entry defines ( the field flags in NtfsDirEntry ). */
+#define NTFS_IE_HAS_SUBNODES cpu_to_le16(1)
+#define NTFS_IE_LAST cpu_to_le16(2)
+
+/* Directory entry structure. */
+struct NTFS_DE {
+ union {
+ struct MFT_REF ref; // 0x00: MFT record number with this file.
+ struct {
+ __le16 data_off; // 0x00:
+ __le16 data_size; // 0x02:
+ __le32 res; // 0x04: Must be 0.
+ } view;
+ };
+ __le16 size; // 0x08: The size of this entry.
+ __le16 key_size; // 0x0A: The size of File name length in bytes + 0x42.
+ __le16 flags; // 0x0C: Entry flags: NTFS_IE_XXX.
+ __le16 res; // 0x0E:
+
+ // Here any indexed attribute can be placed.
+ // One of them is:
+ // struct ATTR_FILE_NAME AttrFileName;
+ //
+
+ // The last 8 bytes of this structure contains
+ // the VBN of subnode.
+ // !!! Note !!!
+ // This field is presented only if (flags & NTFS_IE_HAS_SUBNODES)
+ // __le64 vbn;
+};
+
+static_assert(sizeof(struct NTFS_DE) == 0x10);
+
+static inline void de_set_vbn_le(struct NTFS_DE *e, __le64 vcn)
+{
+ __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64));
+
+ *v = vcn;
+}
+
+static inline void de_set_vbn(struct NTFS_DE *e, CLST vcn)
+{
+ __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64));
+
+ *v = cpu_to_le64(vcn);
+}
+
+static inline __le64 de_get_vbn_le(const struct NTFS_DE *e)
+{
+ return *(__le64 *)Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64));
+}
+
+static inline CLST de_get_vbn(const struct NTFS_DE *e)
+{
+ __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64));
+
+ return le64_to_cpu(*v);
+}
+
+static inline struct NTFS_DE *de_get_next(const struct NTFS_DE *e)
+{
+ return Add2Ptr(e, le16_to_cpu(e->size));
+}
+
+static inline struct ATTR_FILE_NAME *de_get_fname(const struct NTFS_DE *e)
+{
+ return le16_to_cpu(e->key_size) >= SIZEOF_ATTRIBUTE_FILENAME ?
+ Add2Ptr(e, sizeof(struct NTFS_DE)) :
+ NULL;
+}
+
+static inline bool de_is_last(const struct NTFS_DE *e)
+{
+ return e->flags & NTFS_IE_LAST;
+}
+
+static inline bool de_has_vcn(const struct NTFS_DE *e)
+{
+ return e->flags & NTFS_IE_HAS_SUBNODES;
+}
+
+static inline bool de_has_vcn_ex(const struct NTFS_DE *e)
+{
+ return (e->flags & NTFS_IE_HAS_SUBNODES) &&
+ (u64)(-1) != *((u64 *)Add2Ptr(e, le16_to_cpu(e->size) -
+ sizeof(__le64)));
+}
+
+#define MAX_BYTES_PER_NAME_ENTRY \
+ ALIGN(sizeof(struct NTFS_DE) + \
+ offsetof(struct ATTR_FILE_NAME, name) + \
+ NTFS_NAME_LEN * sizeof(short), 8)
+
+struct INDEX_HDR {
+ __le32 de_off; // 0x00: The offset from the start of this structure
+ // to the first NTFS_DE.
+ __le32 used; // 0x04: The size of this structure plus all
+ // entries (quad-word aligned).
+ __le32 total; // 0x08: The allocated size of for this structure plus all entries.
+ u8 flags; // 0x0C: 0x00 = Small directory, 0x01 = Large directory.
+ u8 res[3];
+
+ //
+ // de_off + used <= total
+ //
+};
+
+static_assert(sizeof(struct INDEX_HDR) == 0x10);
+
+static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr)
+{
+ u32 de_off = le32_to_cpu(hdr->de_off);
+ u32 used = le32_to_cpu(hdr->used);
+ struct NTFS_DE *e = Add2Ptr(hdr, de_off);
+ u16 esize;
+
+ if (de_off >= used || de_off >= le32_to_cpu(hdr->total))
+ return NULL;
+
+ esize = le16_to_cpu(e->size);
+ if (esize < sizeof(struct NTFS_DE) || de_off + esize > used)
+ return NULL;
+
+ return e;
+}
+
+static inline struct NTFS_DE *hdr_next_de(const struct INDEX_HDR *hdr,
+ const struct NTFS_DE *e)
+{
+ size_t off = PtrOffset(hdr, e);
+ u32 used = le32_to_cpu(hdr->used);
+ u16 esize;
+
+ if (off >= used)
+ return NULL;
+
+ esize = le16_to_cpu(e->size);
+
+ if (esize < sizeof(struct NTFS_DE) ||
+ off + esize + sizeof(struct NTFS_DE) > used)
+ return NULL;
+
+ return Add2Ptr(e, esize);
+}
+
+static inline bool hdr_has_subnode(const struct INDEX_HDR *hdr)
+{
+ return hdr->flags & 1;
+}
+
+struct INDEX_BUFFER {
+ struct NTFS_RECORD_HEADER rhdr; // 'INDX'
+ __le64 vbn; // 0x10: vcn if index >= cluster or vsn id index < cluster
+ struct INDEX_HDR ihdr; // 0x18:
+};
+
+static_assert(sizeof(struct INDEX_BUFFER) == 0x28);
+
+static inline bool ib_is_empty(const struct INDEX_BUFFER *ib)
+{
+ const struct NTFS_DE *first = hdr_first_de(&ib->ihdr);
+
+ return !first || de_is_last(first);
+}
+
+static inline bool ib_is_leaf(const struct INDEX_BUFFER *ib)
+{
+ return !(ib->ihdr.flags & 1);
+}
+
+/* Index root structure ( 0x90 ). */
+enum COLLATION_RULE {
+ NTFS_COLLATION_TYPE_BINARY = cpu_to_le32(0),
+ // $I30
+ NTFS_COLLATION_TYPE_FILENAME = cpu_to_le32(0x01),
+ // $SII of $Secure and $Q of Quota
+ NTFS_COLLATION_TYPE_UINT = cpu_to_le32(0x10),
+ // $O of Quota
+ NTFS_COLLATION_TYPE_SID = cpu_to_le32(0x11),
+ // $SDH of $Secure
+ NTFS_COLLATION_TYPE_SECURITY_HASH = cpu_to_le32(0x12),
+ // $O of ObjId and "$R" for Reparse
+ NTFS_COLLATION_TYPE_UINTS = cpu_to_le32(0x13)
+};
+
+static_assert(sizeof(enum COLLATION_RULE) == 4);
+
+//
+struct INDEX_ROOT {
+ enum ATTR_TYPE type; // 0x00: The type of attribute to index on.
+ enum COLLATION_RULE rule; // 0x04: The rule.
+ __le32 index_block_size;// 0x08: The size of index record.
+ u8 index_block_clst; // 0x0C: The number of clusters or sectors per index.
+ u8 res[3];
+ struct INDEX_HDR ihdr; // 0x10:
+};
+
+static_assert(sizeof(struct INDEX_ROOT) == 0x20);
+static_assert(offsetof(struct INDEX_ROOT, ihdr) == 0x10);
+
+#define VOLUME_FLAG_DIRTY cpu_to_le16(0x0001)
+#define VOLUME_FLAG_RESIZE_LOG_FILE cpu_to_le16(0x0002)
+
+struct VOLUME_INFO {
+ __le64 res1; // 0x00
+ u8 major_ver; // 0x08: NTFS major version number (before .)
+ u8 minor_ver; // 0x09: NTFS minor version number (after .)
+ __le16 flags; // 0x0A: Volume flags, see VOLUME_FLAG_XXX
+
+}; // sizeof=0xC
+
+#define SIZEOF_ATTRIBUTE_VOLUME_INFO 0xc
+
+#define NTFS_LABEL_MAX_LENGTH (0x100 / sizeof(short))
+#define NTFS_ATTR_INDEXABLE cpu_to_le32(0x00000002)
+#define NTFS_ATTR_DUPALLOWED cpu_to_le32(0x00000004)
+#define NTFS_ATTR_MUST_BE_INDEXED cpu_to_le32(0x00000010)
+#define NTFS_ATTR_MUST_BE_NAMED cpu_to_le32(0x00000020)
+#define NTFS_ATTR_MUST_BE_RESIDENT cpu_to_le32(0x00000040)
+#define NTFS_ATTR_LOG_ALWAYS cpu_to_le32(0x00000080)
+
+/* $AttrDef file entry. */
+struct ATTR_DEF_ENTRY {
+ __le16 name[0x40]; // 0x00: Attr name.
+ enum ATTR_TYPE type; // 0x80: struct ATTRIB type.
+ __le32 res; // 0x84:
+ enum COLLATION_RULE rule; // 0x88:
+ __le32 flags; // 0x8C: NTFS_ATTR_XXX (see above).
+ __le64 min_sz; // 0x90: Minimum attribute data size.
+ __le64 max_sz; // 0x98: Maximum attribute data size.
+};
+
+static_assert(sizeof(struct ATTR_DEF_ENTRY) == 0xa0);
+
+/* Object ID (0x40) */
+struct OBJECT_ID {
+ struct GUID ObjId; // 0x00: Unique Id assigned to file.
+ struct GUID BirthVolumeId; // 0x10: Birth Volume Id is the Object Id of the Volume on.
+ // which the Object Id was allocated. It never changes.
+ struct GUID BirthObjectId; // 0x20: Birth Object Id is the first Object Id that was
+ // ever assigned to this MFT Record. I.e. If the Object Id
+ // is changed for some reason, this field will reflect the
+ // original value of the Object Id.
+ struct GUID DomainId; // 0x30: Domain Id is currently unused but it is intended to be
+ // used in a network environment where the local machine is
+ // part of a Windows 2000 Domain. This may be used in a Windows
+ // 2000 Advanced Server managed domain.
+};
+
+static_assert(sizeof(struct OBJECT_ID) == 0x40);
+
+/* O Directory entry structure ( rule = 0x13 ) */
+struct NTFS_DE_O {
+ struct NTFS_DE de;
+ struct GUID ObjId; // 0x10: Unique Id assigned to file.
+ struct MFT_REF ref; // 0x20: MFT record number with this file.
+ struct GUID BirthVolumeId; // 0x28: Birth Volume Id is the Object Id of the Volume on
+ // which the Object Id was allocated. It never changes.
+ struct GUID BirthObjectId; // 0x38: Birth Object Id is the first Object Id that was
+ // ever assigned to this MFT Record. I.e. If the Object Id
+ // is changed for some reason, this field will reflect the
+ // original value of the Object Id.
+ // This field is valid if data_size == 0x48.
+ struct GUID BirthDomainId; // 0x48: Domain Id is currently unused but it is intended
+ // to be used in a network environment where the local
+ // machine is part of a Windows 2000 Domain. This may be
+ // used in a Windows 2000 Advanced Server managed domain.
+};
+
+static_assert(sizeof(struct NTFS_DE_O) == 0x58);
+
+#define NTFS_OBJECT_ENTRY_DATA_SIZE1 \
+ 0x38 // struct NTFS_DE_O.BirthDomainId is not used
+#define NTFS_OBJECT_ENTRY_DATA_SIZE2 \
+ 0x48 // struct NTFS_DE_O.BirthDomainId is used
+
+/* Q Directory entry structure ( rule = 0x11 ) */
+struct NTFS_DE_Q {
+ struct NTFS_DE de;
+ __le32 owner_id; // 0x10: Unique Id assigned to file
+ __le32 Version; // 0x14: 0x02
+ __le32 flags2; // 0x18: Quota flags, see above
+ __le64 BytesUsed; // 0x1C:
+ __le64 ChangeTime; // 0x24:
+ __le64 WarningLimit; // 0x28:
+ __le64 HardLimit; // 0x34:
+ __le64 ExceededTime; // 0x3C:
+
+ // SID is placed here
+}; // sizeof() = 0x44
+
+#define SIZEOF_NTFS_DE_Q 0x44
+
+#define SecurityDescriptorsBlockSize 0x40000 // 256K
+#define SecurityDescriptorMaxSize 0x20000 // 128K
+#define Log2OfSecurityDescriptorsBlockSize 18
+
+struct SECURITY_KEY {
+ __le32 hash; // Hash value for descriptor
+ __le32 sec_id; // Security Id (guaranteed unique)
+};
+
+/* Security descriptors (the content of $Secure::SDS data stream) */
+struct SECURITY_HDR {
+ struct SECURITY_KEY key; // 0x00: Security Key.
+ __le64 off; // 0x08: Offset of this entry in the file.
+ __le32 size; // 0x10: Size of this entry, 8 byte aligned.
+ /*
+ * Security descriptor itself is placed here.
+ * Total size is 16 byte aligned.
+ */
+} __packed;
+
+#define SIZEOF_SECURITY_HDR 0x14
+
+/* SII Directory entry structure */
+struct NTFS_DE_SII {
+ struct NTFS_DE de;
+ __le32 sec_id; // 0x10: Key: sizeof(security_id) = wKeySize
+ struct SECURITY_HDR sec_hdr; // 0x14:
+} __packed;
+
+#define SIZEOF_SII_DIRENTRY 0x28
+
+/* SDH Directory entry structure */
+struct NTFS_DE_SDH {
+ struct NTFS_DE de;
+ struct SECURITY_KEY key; // 0x10: Key
+ struct SECURITY_HDR sec_hdr; // 0x18: Data
+ __le16 magic[2]; // 0x2C: 0x00490049 "I I"
+};
+
+#define SIZEOF_SDH_DIRENTRY 0x30
+
+struct REPARSE_KEY {
+ __le32 ReparseTag; // 0x00: Reparse Tag
+ struct MFT_REF ref; // 0x04: MFT record number with this file
+}; // sizeof() = 0x0C
+
+static_assert(offsetof(struct REPARSE_KEY, ref) == 0x04);
+#define SIZEOF_REPARSE_KEY 0x0C
+
+/* Reparse Directory entry structure */
+struct NTFS_DE_R {
+ struct NTFS_DE de;
+ struct REPARSE_KEY key; // 0x10: Reparse Key.
+ u32 zero; // 0x1c:
+}; // sizeof() = 0x20
+
+static_assert(sizeof(struct NTFS_DE_R) == 0x20);
+
+/* CompressReparseBuffer.WofVersion */
+#define WOF_CURRENT_VERSION cpu_to_le32(1)
+/* CompressReparseBuffer.WofProvider */
+#define WOF_PROVIDER_WIM cpu_to_le32(1)
+/* CompressReparseBuffer.WofProvider */
+#define WOF_PROVIDER_SYSTEM cpu_to_le32(2)
+/* CompressReparseBuffer.ProviderVer */
+#define WOF_PROVIDER_CURRENT_VERSION cpu_to_le32(1)
+
+#define WOF_COMPRESSION_XPRESS4K cpu_to_le32(0) // 4k
+#define WOF_COMPRESSION_LZX32K cpu_to_le32(1) // 32k
+#define WOF_COMPRESSION_XPRESS8K cpu_to_le32(2) // 8k
+#define WOF_COMPRESSION_XPRESS16K cpu_to_le32(3) // 16k
+
+/*
+ * ATTR_REPARSE (0xC0)
+ *
+ * The reparse struct GUID structure is used by all 3rd party layered drivers to
+ * store data in a reparse point. For non-Microsoft tags, The struct GUID field
+ * cannot be GUID_NULL.
+ * The constraints on reparse tags are defined below.
+ * Microsoft tags can also be used with this format of the reparse point buffer.
+ */
+struct REPARSE_POINT {
+ __le32 ReparseTag; // 0x00:
+ __le16 ReparseDataLength;// 0x04:
+ __le16 Reserved;
+
+ struct GUID Guid; // 0x08:
+
+ //
+ // Here GenericReparseBuffer is placed
+ //
+};
+
+static_assert(sizeof(struct REPARSE_POINT) == 0x18);
+
+/* Maximum allowed size of the reparse data. */
+#define MAXIMUM_REPARSE_DATA_BUFFER_SIZE (16 * 1024)
+
+/*
+ * The value of the following constant needs to satisfy the following
+ * conditions:
+ * (1) Be at least as large as the largest of the reserved tags.
+ * (2) Be strictly smaller than all the tags in use.
+ */
+#define IO_REPARSE_TAG_RESERVED_RANGE 1
+
+/*
+ * The reparse tags are a ULONG. The 32 bits are laid out as follows:
+ *
+ * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
+ * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-----------------------+-------------------------------+
+ * |M|R|N|R| Reserved bits | Reparse Tag Value |
+ * +-+-+-+-+-----------------------+-------------------------------+
+ *
+ * M is the Microsoft bit. When set to 1, it denotes a tag owned by Microsoft.
+ * All ISVs must use a tag with a 0 in this position.
+ * Note: If a Microsoft tag is used by non-Microsoft software, the
+ * behavior is not defined.
+ *
+ * R is reserved. Must be zero for non-Microsoft tags.
+ *
+ * N is name surrogate. When set to 1, the file represents another named
+ * entity in the system.
+ *
+ * The M and N bits are OR-able.
+ * The following macros check for the M and N bit values:
+ */
+
+/*
+ * Macro to determine whether a reparse point tag corresponds to a tag
+ * owned by Microsoft.
+ */
+#define IsReparseTagMicrosoft(_tag) (((_tag)&IO_REPARSE_TAG_MICROSOFT))
+
+/* Macro to determine whether a reparse point tag is a name surrogate. */
+#define IsReparseTagNameSurrogate(_tag) (((_tag)&IO_REPARSE_TAG_NAME_SURROGATE))
+
+/*
+ * The following constant represents the bits that are valid to use in
+ * reparse tags.
+ */
+#define IO_REPARSE_TAG_VALID_VALUES 0xF000FFFF
+
+/*
+ * Macro to determine whether a reparse tag is a valid tag.
+ */
+#define IsReparseTagValid(_tag) \
+ (!((_tag) & ~IO_REPARSE_TAG_VALID_VALUES) && \
+ ((_tag) > IO_REPARSE_TAG_RESERVED_RANGE))
+
+/* Microsoft tags for reparse points. */
+
+enum IO_REPARSE_TAG {
+ IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0),
+ IO_REPARSE_TAG_NAME_SURROGATE = cpu_to_le32(0x20000000),
+ IO_REPARSE_TAG_MICROSOFT = cpu_to_le32(0x80000000),
+ IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0xA0000003),
+ IO_REPARSE_TAG_SYMLINK = cpu_to_le32(0xA000000C),
+ IO_REPARSE_TAG_HSM = cpu_to_le32(0xC0000004),
+ IO_REPARSE_TAG_SIS = cpu_to_le32(0x80000007),
+ IO_REPARSE_TAG_DEDUP = cpu_to_le32(0x80000013),
+ IO_REPARSE_TAG_COMPRESS = cpu_to_le32(0x80000017),
+
+ /*
+ * The reparse tag 0x80000008 is reserved for Microsoft internal use.
+ * May be published in the future.
+ */
+
+ /* Microsoft reparse tag reserved for DFS */
+ IO_REPARSE_TAG_DFS = cpu_to_le32(0x8000000A),
+
+ /* Microsoft reparse tag reserved for the file system filter manager. */
+ IO_REPARSE_TAG_FILTER_MANAGER = cpu_to_le32(0x8000000B),
+
+ /* Non-Microsoft tags for reparse points */
+
+ /* Tag allocated to CONGRUENT, May 2000. Used by IFSTEST. */
+ IO_REPARSE_TAG_IFSTEST_CONGRUENT = cpu_to_le32(0x00000009),
+
+ /* Tag allocated to ARKIVIO. */
+ IO_REPARSE_TAG_ARKIVIO = cpu_to_le32(0x0000000C),
+
+ /* Tag allocated to SOLUTIONSOFT. */
+ IO_REPARSE_TAG_SOLUTIONSOFT = cpu_to_le32(0x2000000D),
+
+ /* Tag allocated to COMMVAULT. */
+ IO_REPARSE_TAG_COMMVAULT = cpu_to_le32(0x0000000E),
+
+ /* OneDrive?? */
+ IO_REPARSE_TAG_CLOUD = cpu_to_le32(0x9000001A),
+ IO_REPARSE_TAG_CLOUD_1 = cpu_to_le32(0x9000101A),
+ IO_REPARSE_TAG_CLOUD_2 = cpu_to_le32(0x9000201A),
+ IO_REPARSE_TAG_CLOUD_3 = cpu_to_le32(0x9000301A),
+ IO_REPARSE_TAG_CLOUD_4 = cpu_to_le32(0x9000401A),
+ IO_REPARSE_TAG_CLOUD_5 = cpu_to_le32(0x9000501A),
+ IO_REPARSE_TAG_CLOUD_6 = cpu_to_le32(0x9000601A),
+ IO_REPARSE_TAG_CLOUD_7 = cpu_to_le32(0x9000701A),
+ IO_REPARSE_TAG_CLOUD_8 = cpu_to_le32(0x9000801A),
+ IO_REPARSE_TAG_CLOUD_9 = cpu_to_le32(0x9000901A),
+ IO_REPARSE_TAG_CLOUD_A = cpu_to_le32(0x9000A01A),
+ IO_REPARSE_TAG_CLOUD_B = cpu_to_le32(0x9000B01A),
+ IO_REPARSE_TAG_CLOUD_C = cpu_to_le32(0x9000C01A),
+ IO_REPARSE_TAG_CLOUD_D = cpu_to_le32(0x9000D01A),
+ IO_REPARSE_TAG_CLOUD_E = cpu_to_le32(0x9000E01A),
+ IO_REPARSE_TAG_CLOUD_F = cpu_to_le32(0x9000F01A),
+
+};
+
+#define SYMLINK_FLAG_RELATIVE 1
+
+/* Microsoft reparse buffer. (see DDK for details) */
+struct REPARSE_DATA_BUFFER {
+ __le32 ReparseTag; // 0x00:
+ __le16 ReparseDataLength; // 0x04:
+ __le16 Reserved;
+
+ union {
+ /* If ReparseTag == 0xA0000003 (IO_REPARSE_TAG_MOUNT_POINT) */
+ struct {
+ __le16 SubstituteNameOffset; // 0x08
+ __le16 SubstituteNameLength; // 0x0A
+ __le16 PrintNameOffset; // 0x0C
+ __le16 PrintNameLength; // 0x0E
+ __le16 PathBuffer[]; // 0x10
+ } MountPointReparseBuffer;
+
+ /*
+ * If ReparseTag == 0xA000000C (IO_REPARSE_TAG_SYMLINK)
+ * https://msdn.microsoft.com/en-us/library/cc232006.aspx
+ */
+ struct {
+ __le16 SubstituteNameOffset; // 0x08
+ __le16 SubstituteNameLength; // 0x0A
+ __le16 PrintNameOffset; // 0x0C
+ __le16 PrintNameLength; // 0x0E
+ // 0-absolute path 1- relative path, SYMLINK_FLAG_RELATIVE
+ __le32 Flags; // 0x10
+ __le16 PathBuffer[]; // 0x14
+ } SymbolicLinkReparseBuffer;
+
+ /* If ReparseTag == 0x80000017U */
+ struct {
+ __le32 WofVersion; // 0x08 == 1
+ /*
+ * 1 - WIM backing provider ("WIMBoot"),
+ * 2 - System compressed file provider
+ */
+ __le32 WofProvider; // 0x0C:
+ __le32 ProviderVer; // 0x10: == 1 WOF_FILE_PROVIDER_CURRENT_VERSION == 1
+ __le32 CompressionFormat; // 0x14: 0, 1, 2, 3. See WOF_COMPRESSION_XXX
+ } CompressReparseBuffer;
+
+ struct {
+ u8 DataBuffer[1]; // 0x08:
+ } GenericReparseBuffer;
+ };
+};
+
+/* ATTR_EA_INFO (0xD0) */
+
+#define FILE_NEED_EA 0x80 // See ntifs.h
+/*
+ *FILE_NEED_EA, indicates that the file to which the EA belongs cannot be
+ * interpreted without understanding the associated extended attributes.
+ */
+struct EA_INFO {
+ __le16 size_pack; // 0x00: Size of buffer to hold in packed form.
+ __le16 count; // 0x02: Count of EA's with FILE_NEED_EA bit set.
+ __le32 size; // 0x04: Size of buffer to hold in unpacked form.
+};
+
+static_assert(sizeof(struct EA_INFO) == 8);
+
+/* ATTR_EA (0xE0) */
+struct EA_FULL {
+ __le32 size; // 0x00: (not in packed)
+ u8 flags; // 0x04:
+ u8 name_len; // 0x05:
+ __le16 elength; // 0x06:
+ u8 name[]; // 0x08:
+};
+
+static_assert(offsetof(struct EA_FULL, name) == 8);
+
+#define ACL_REVISION 2
+#define ACL_REVISION_DS 4
+
+#define SE_SELF_RELATIVE cpu_to_le16(0x8000)
+
+struct SECURITY_DESCRIPTOR_RELATIVE {
+ u8 Revision;
+ u8 Sbz1;
+ __le16 Control;
+ __le32 Owner;
+ __le32 Group;
+ __le32 Sacl;
+ __le32 Dacl;
+};
+static_assert(sizeof(struct SECURITY_DESCRIPTOR_RELATIVE) == 0x14);
+
+struct ACE_HEADER {
+ u8 AceType;
+ u8 AceFlags;
+ __le16 AceSize;
+};
+static_assert(sizeof(struct ACE_HEADER) == 4);
+
+struct ACL {
+ u8 AclRevision;
+ u8 Sbz1;
+ __le16 AclSize;
+ __le16 AceCount;
+ __le16 Sbz2;
+};
+static_assert(sizeof(struct ACL) == 8);
+
+struct SID {
+ u8 Revision;
+ u8 SubAuthorityCount;
+ u8 IdentifierAuthority[6];
+ __le32 SubAuthority[];
+};
+static_assert(offsetof(struct SID, SubAuthority) == 8);
+
+#endif /* _LINUX_NTFS3_NTFS_H */
+// clang-format on
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
new file mode 100644
index 000000000..8c9abaf13
--- /dev/null
+++ b/fs/ntfs3/ntfs_fs.h
@@ -0,0 +1,1142 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+// clang-format off
+#ifndef _LINUX_NTFS3_NTFS_FS_H
+#define _LINUX_NTFS3_NTFS_FS_H
+
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/highmem.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/page-flags.h>
+#include <linux/pagemap.h>
+#include <linux/rbtree.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/time64.h>
+#include <linux/types.h>
+#include <linux/uidgid.h>
+#include <asm/div64.h>
+#include <asm/page.h>
+
+#include "debug.h"
+#include "ntfs.h"
+
+struct dentry;
+struct fiemap_extent_info;
+struct user_namespace;
+struct page;
+struct writeback_control;
+enum utf16_endian;
+
+
+#define MINUS_ONE_T ((size_t)(-1))
+/* Biggest MFT / smallest cluster */
+#define MAXIMUM_BYTES_PER_MFT 4096
+#define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512)
+
+#define MAXIMUM_BYTES_PER_INDEX 4096
+#define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512)
+
+/* NTFS specific error code when fixup failed. */
+#define E_NTFS_FIXUP 555
+/* NTFS specific error code about resident->nonresident. */
+#define E_NTFS_NONRESIDENT 556
+/* NTFS specific error code about punch hole. */
+#define E_NTFS_NOTALIGNED 557
+/* NTFS specific error code when on-disk struct is corrupted. */
+#define E_NTFS_CORRUPT 558
+
+
+/* sbi->flags */
+#define NTFS_FLAGS_NODISCARD 0x00000001
+/* Set when LogFile is replaying. */
+#define NTFS_FLAGS_LOG_REPLAYING 0x00000008
+/* Set when we changed first MFT's which copy must be updated in $MftMirr. */
+#define NTFS_FLAGS_MFTMIRR 0x00001000
+#define NTFS_FLAGS_NEED_REPLAY 0x04000000
+
+
+/* ni->ni_flags */
+/*
+ * Data attribute is external compressed (LZX/Xpress)
+ * 1 - WOF_COMPRESSION_XPRESS4K
+ * 2 - WOF_COMPRESSION_XPRESS8K
+ * 3 - WOF_COMPRESSION_XPRESS16K
+ * 4 - WOF_COMPRESSION_LZX32K
+ */
+#define NI_FLAG_COMPRESSED_MASK 0x0000000f
+/* Data attribute is deduplicated. */
+#define NI_FLAG_DEDUPLICATED 0x00000010
+#define NI_FLAG_EA 0x00000020
+#define NI_FLAG_DIR 0x00000040
+#define NI_FLAG_RESIDENT 0x00000080
+#define NI_FLAG_UPDATE_PARENT 0x00000100
+// clang-format on
+
+struct ntfs_mount_options {
+ char *nls_name;
+ struct nls_table *nls;
+
+ kuid_t fs_uid;
+ kgid_t fs_gid;
+ u16 fs_fmask_inv;
+ u16 fs_dmask_inv;
+
+ unsigned fmask : 1; /* fmask was set. */
+ unsigned dmask : 1; /*dmask was set. */
+ unsigned sys_immutable : 1; /* Immutable system files. */
+ unsigned discard : 1; /* Issue discard requests on deletions. */
+ unsigned sparse : 1; /* Create sparse files. */
+ unsigned showmeta : 1; /* Show meta files. */
+ unsigned nohidden : 1; /* Do not show hidden files. */
+ unsigned force : 1; /* RW mount dirty volume. */
+ unsigned noacsrules : 1; /* Exclude acs rules. */
+ unsigned prealloc : 1; /* Preallocate space when file is growing. */
+};
+
+/* Special value to unpack and deallocate. */
+#define RUN_DEALLOCATE ((struct runs_tree *)(size_t)1)
+
+/* TODO: Use rb tree instead of array. */
+struct runs_tree {
+ struct ntfs_run *runs;
+ size_t count; /* Currently used size a ntfs_run storage. */
+ size_t allocated; /* Currently allocated ntfs_run storage size. */
+};
+
+struct ntfs_buffers {
+ /* Biggest MFT / smallest cluster = 4096 / 512 = 8 */
+ /* Biggest index / smallest cluster = 4096 / 512 = 8 */
+ struct buffer_head *bh[PAGE_SIZE >> SECTOR_SHIFT];
+ u32 bytes;
+ u32 nbufs;
+ u32 off;
+};
+
+enum ALLOCATE_OPT {
+ ALLOCATE_DEF = 0, // Allocate all clusters.
+ ALLOCATE_MFT = 1, // Allocate for MFT.
+};
+
+enum bitmap_mutex_classes {
+ BITMAP_MUTEX_CLUSTERS = 0,
+ BITMAP_MUTEX_MFT = 1,
+};
+
+struct wnd_bitmap {
+ struct super_block *sb;
+ struct rw_semaphore rw_lock;
+
+ struct runs_tree run;
+ size_t nbits;
+
+ size_t total_zeroes; // Total number of free bits.
+ u16 *free_bits; // Free bits in each window.
+ size_t nwnd;
+ u32 bits_last; // Bits in last window.
+
+ struct rb_root start_tree; // Extents, sorted by 'start'.
+ struct rb_root count_tree; // Extents, sorted by 'count + start'.
+ size_t count; // Extents count.
+
+ /*
+ * -1 Tree is activated but not updated (too many fragments).
+ * 0 - Tree is not activated.
+ * 1 - Tree is activated and updated.
+ */
+ int uptodated;
+ size_t extent_min; // Minimal extent used while building.
+ size_t extent_max; // Upper estimate of biggest free block.
+
+ /* Zone [bit, end) */
+ size_t zone_bit;
+ size_t zone_end;
+
+ bool set_tail; // Not necessary in driver.
+ bool inited;
+};
+
+typedef int (*NTFS_CMP_FUNC)(const void *key1, size_t len1, const void *key2,
+ size_t len2, const void *param);
+
+enum index_mutex_classed {
+ INDEX_MUTEX_I30 = 0,
+ INDEX_MUTEX_SII = 1,
+ INDEX_MUTEX_SDH = 2,
+ INDEX_MUTEX_SO = 3,
+ INDEX_MUTEX_SQ = 4,
+ INDEX_MUTEX_SR = 5,
+ INDEX_MUTEX_TOTAL
+};
+
+/* ntfs_index - Allocation unit inside directory. */
+struct ntfs_index {
+ struct runs_tree bitmap_run;
+ struct runs_tree alloc_run;
+ /* read/write access to 'bitmap_run'/'alloc_run' while ntfs_readdir */
+ struct rw_semaphore run_lock;
+
+ /*TODO: Remove 'cmp'. */
+ NTFS_CMP_FUNC cmp;
+
+ u8 index_bits; // log2(root->index_block_size)
+ u8 idx2vbn_bits; // log2(root->index_block_clst)
+ u8 vbn2vbo_bits; // index_block_size < cluster? 9 : cluster_bits
+ u8 type; // index_mutex_classed
+};
+
+/* Minimum MFT zone. */
+#define NTFS_MIN_MFT_ZONE 100
+
+/* Ntfs file system in-core superblock data. */
+struct ntfs_sb_info {
+ struct super_block *sb;
+
+ u32 discard_granularity;
+ u64 discard_granularity_mask_inv; // ~(discard_granularity_mask_inv-1)
+
+ u32 cluster_size; // bytes per cluster
+ u32 cluster_mask; // == cluster_size - 1
+ u64 cluster_mask_inv; // ~(cluster_size - 1)
+ u32 block_mask; // sb->s_blocksize - 1
+ u32 blocks_per_cluster; // cluster_size / sb->s_blocksize
+
+ u32 record_size;
+ u32 index_size;
+
+ u8 cluster_bits;
+ u8 record_bits;
+
+ u64 maxbytes; // Maximum size for normal files.
+ u64 maxbytes_sparse; // Maximum size for sparse file.
+
+ u32 flags; // See NTFS_FLAGS_XXX.
+
+ CLST zone_max; // Maximum MFT zone length in clusters
+ CLST bad_clusters; // The count of marked bad clusters.
+
+ u16 max_bytes_per_attr; // Maximum attribute size in record.
+ u16 attr_size_tr; // Attribute size threshold (320 bytes).
+
+ /* Records in $Extend. */
+ CLST objid_no;
+ CLST quota_no;
+ CLST reparse_no;
+ CLST usn_jrnl_no;
+
+ struct ATTR_DEF_ENTRY *def_table; // Attribute definition table.
+ u32 def_entries;
+ u32 ea_max_size;
+
+ struct MFT_REC *new_rec;
+
+ u16 *upcase;
+
+ struct {
+ u64 lbo, lbo2;
+ struct ntfs_inode *ni;
+ struct wnd_bitmap bitmap; // $MFT::Bitmap
+ /*
+ * MFT records [11-24) used to expand MFT itself.
+ * They always marked as used in $MFT::Bitmap
+ * 'reserved_bitmap' contains real bitmap of these records.
+ */
+ ulong reserved_bitmap; // Bitmap of used records [11 - 24)
+ size_t next_free; // The next record to allocate from
+ size_t used; // MFT valid size in records.
+ u32 recs_mirr; // Number of records in MFTMirr
+ u8 next_reserved;
+ u8 reserved_bitmap_inited;
+ } mft;
+
+ struct {
+ struct wnd_bitmap bitmap; // $Bitmap::Data
+ CLST next_free_lcn;
+ } used;
+
+ struct {
+ u64 size; // In bytes.
+ u64 blocks; // In blocks.
+ u64 ser_num;
+ struct ntfs_inode *ni;
+ __le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY.
+ u8 major_ver;
+ u8 minor_ver;
+ char label[65];
+ bool real_dirty; // Real fs state.
+ } volume;
+
+ struct {
+ struct ntfs_index index_sii;
+ struct ntfs_index index_sdh;
+ struct ntfs_inode *ni;
+ u32 next_id;
+ u64 next_off;
+
+ __le32 def_security_id;
+ } security;
+
+ struct {
+ struct ntfs_index index_r;
+ struct ntfs_inode *ni;
+ u64 max_size; // 16K
+ } reparse;
+
+ struct {
+ struct ntfs_index index_o;
+ struct ntfs_inode *ni;
+ } objid;
+
+ struct {
+ struct mutex mtx_lznt;
+ struct lznt *lznt;
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+ struct mutex mtx_xpress;
+ struct xpress_decompressor *xpress;
+ struct mutex mtx_lzx;
+ struct lzx_decompressor *lzx;
+#endif
+ } compress;
+
+ struct ntfs_mount_options *options;
+ struct ratelimit_state msg_ratelimit;
+};
+
+/* One MFT record(usually 1024 bytes), consists of attributes. */
+struct mft_inode {
+ struct rb_node node;
+ struct ntfs_sb_info *sbi;
+
+ struct MFT_REC *mrec;
+ struct ntfs_buffers nb;
+
+ CLST rno;
+ bool dirty;
+};
+
+/* Nested class for ntfs_inode::ni_lock. */
+enum ntfs_inode_mutex_lock_class {
+ NTFS_INODE_MUTEX_DIRTY,
+ NTFS_INODE_MUTEX_SECURITY,
+ NTFS_INODE_MUTEX_OBJID,
+ NTFS_INODE_MUTEX_REPARSE,
+ NTFS_INODE_MUTEX_NORMAL,
+ NTFS_INODE_MUTEX_PARENT,
+};
+
+/*
+ * sturct ntfs_inode
+ *
+ * Ntfs inode - extends linux inode. consists of one or more MFT inodes.
+ */
+struct ntfs_inode {
+ struct mft_inode mi; // base record
+
+ /*
+ * Valid size: [0 - i_valid) - these range in file contains valid data.
+ * Range [i_valid - inode->i_size) - contains 0.
+ * Usually i_valid <= inode->i_size.
+ */
+ u64 i_valid;
+ struct timespec64 i_crtime;
+
+ struct mutex ni_lock;
+
+ /* File attributes from std. */
+ enum FILE_ATTRIBUTE std_fa;
+ __le32 std_security_id;
+
+ /*
+ * Tree of mft_inode.
+ * Not empty when primary MFT record (usually 1024 bytes) can't save all attributes
+ * e.g. file becomes too fragmented or contains a lot of names.
+ */
+ struct rb_root mi_tree;
+
+ /*
+ * This member is used in ntfs_readdir to ensure that all subrecords are loaded
+ */
+ u8 mi_loaded;
+
+ union {
+ struct ntfs_index dir;
+ struct {
+ struct rw_semaphore run_lock;
+ struct runs_tree run;
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+ struct page *offs_page;
+#endif
+ } file;
+ };
+
+ struct {
+ struct runs_tree run;
+ struct ATTR_LIST_ENTRY *le; // 1K aligned memory.
+ size_t size;
+ bool dirty;
+ } attr_list;
+
+ size_t ni_flags; // NI_FLAG_XXX
+
+ struct inode vfs_inode;
+};
+
+struct indx_node {
+ struct ntfs_buffers nb;
+ struct INDEX_BUFFER *index;
+};
+
+struct ntfs_fnd {
+ int level;
+ struct indx_node *nodes[20];
+ struct NTFS_DE *de[20];
+ struct NTFS_DE *root_de;
+};
+
+enum REPARSE_SIGN {
+ REPARSE_NONE = 0,
+ REPARSE_COMPRESSED = 1,
+ REPARSE_DEDUPLICATED = 2,
+ REPARSE_LINK = 3
+};
+
+/* Functions from attrib.c */
+int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run,
+ CLST vcn, CLST lcn, CLST len, CLST *pre_alloc,
+ enum ALLOCATE_OPT opt, CLST *alen, const size_t fr,
+ CLST *new_lcn);
+int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct ATTR_LIST_ENTRY *le, struct mft_inode *mi,
+ u64 new_size, struct runs_tree *run,
+ struct ATTRIB **ins_attr, struct page *page);
+int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, struct runs_tree *run,
+ u64 new_size, const u64 *new_valid, bool keep_prealloc,
+ struct ATTRIB **ret);
+int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
+ CLST *len, bool *new);
+int attr_data_read_resident(struct ntfs_inode *ni, struct page *page);
+int attr_data_write_resident(struct ntfs_inode *ni, struct page *page);
+int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, struct runs_tree *run,
+ CLST vcn);
+int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, struct runs_tree *run,
+ u64 from, u64 to);
+int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct runs_tree *run, u64 frame, u64 frames,
+ u8 frame_bits, u32 *ondisk_size, u64 *vbo_data);
+int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr,
+ CLST frame, CLST *clst_data);
+int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size,
+ u64 new_valid);
+int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
+int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
+int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size);
+
+/* Functions from attrlist.c */
+void al_destroy(struct ntfs_inode *ni);
+bool al_verify(struct ntfs_inode *ni);
+int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr);
+struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
+ struct ATTR_LIST_ENTRY *le);
+struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni,
+ struct ATTR_LIST_ENTRY *le,
+ const struct ATTRIB *attr);
+struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni,
+ struct ATTR_LIST_ENTRY *le,
+ enum ATTR_TYPE type, const __le16 *name,
+ u8 name_len, const CLST *vcn);
+int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name,
+ u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref,
+ struct ATTR_LIST_ENTRY **new_le);
+bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le);
+bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn,
+ const __le16 *name, size_t name_len,
+ const struct MFT_REF *ref);
+int al_update(struct ntfs_inode *ni, int sync);
+static inline size_t al_aligned(size_t size)
+{
+ return (size + 1023) & ~(size_t)1023;
+}
+
+/* Globals from bitfunc.c */
+bool are_bits_clear(const ulong *map, size_t bit, size_t nbits);
+bool are_bits_set(const ulong *map, size_t bit, size_t nbits);
+size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits);
+
+/* Globals from dir.c */
+int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len,
+ u8 *buf, int buf_len);
+int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len,
+ struct cpu_str *uni, u32 max_ulen,
+ enum utf16_endian endian);
+struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni,
+ struct ntfs_fnd *fnd);
+bool dir_is_empty(struct inode *dir);
+extern const struct file_operations ntfs_dir_operations;
+
+/* Globals from file.c */
+int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask, u32 flags);
+void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn,
+ CLST len);
+int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *attr);
+int ntfs_file_open(struct inode *inode, struct file *file);
+int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ __u64 start, __u64 len);
+extern const struct inode_operations ntfs_special_inode_operations;
+extern const struct inode_operations ntfs_file_inode_operations;
+extern const struct file_operations ntfs_file_operations;
+
+/* Globals from frecord.c */
+void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi);
+struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni);
+struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni);
+void ni_clear(struct ntfs_inode *ni);
+int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi);
+int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le,
+ struct mft_inode **mi);
+struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct ATTR_LIST_ENTRY **entry_o,
+ enum ATTR_TYPE type, const __le16 *name,
+ u8 name_len, const CLST *vcn,
+ struct mft_inode **mi);
+struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct ATTR_LIST_ENTRY **le,
+ struct mft_inode **mi);
+struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, CLST vcn,
+ struct mft_inode **pmi);
+int ni_load_all_mi(struct ntfs_inode *ni);
+bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi);
+int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, size_t name_len, bool base_only,
+ const __le16 *id);
+int ni_create_attr_list(struct ntfs_inode *ni);
+int ni_expand_list(struct ntfs_inode *ni);
+int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len,
+ const struct runs_tree *run, CLST svcn, CLST len,
+ __le16 flags, struct ATTRIB **new_attr,
+ struct mft_inode **mi, struct ATTR_LIST_ENTRY **le);
+int ni_insert_resident(struct ntfs_inode *ni, u32 data_size,
+ enum ATTR_TYPE type, const __le16 *name, u8 name_len,
+ struct ATTRIB **new_attr, struct mft_inode **mi,
+ struct ATTR_LIST_ENTRY **le);
+void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct mft_inode *mi, struct ATTR_LIST_ENTRY *le);
+int ni_delete_all(struct ntfs_inode *ni);
+struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni,
+ const struct cpu_str *uni,
+ const struct MFT_REF *home,
+ struct mft_inode **mi,
+ struct ATTR_LIST_ENTRY **entry);
+struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type,
+ struct mft_inode **mi,
+ struct ATTR_LIST_ENTRY **entry);
+int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa);
+enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr,
+ struct REPARSE_DATA_BUFFER *buffer);
+int ni_write_inode(struct inode *inode, int sync, const char *hint);
+#define _ni_write_inode(i, w) ni_write_inode(i, w, __func__)
+int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo,
+ __u64 vbo, __u64 len);
+int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page);
+int ni_decompress_file(struct ntfs_inode *ni);
+int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
+ u32 pages_per_frame);
+int ni_write_frame(struct ntfs_inode *ni, struct page **pages,
+ u32 pages_per_frame);
+int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
+ struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step);
+
+bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
+ struct NTFS_DE *de, struct NTFS_DE *de2,
+ int undo_step);
+
+int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni,
+ struct NTFS_DE *de);
+
+int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
+ struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de,
+ bool *is_bad);
+
+bool ni_is_dirty(struct inode *inode);
+
+/* Globals from fslog.c */
+bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes);
+int log_replay(struct ntfs_inode *ni, bool *initialized);
+
+/* Globals from fsntfs.c */
+bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes);
+int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
+ bool simple);
+int ntfs_extend_init(struct ntfs_sb_info *sbi);
+int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi);
+const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi,
+ enum ATTR_TYPE Type);
+int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len,
+ CLST *new_lcn, CLST *new_len,
+ enum ALLOCATE_OPT opt);
+int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft,
+ struct ntfs_inode *ni, struct mft_inode **mi);
+void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno, bool is_mft);
+int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to);
+int ntfs_refresh_zone(struct ntfs_sb_info *sbi);
+void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait);
+void ntfs_bad_inode(struct inode *inode, const char *hint);
+#define _ntfs_bad_inode(i) ntfs_bad_inode(i, __func__)
+enum NTFS_DIRTY_FLAGS {
+ NTFS_DIRTY_CLEAR = 0,
+ NTFS_DIRTY_DIRTY = 1,
+ NTFS_DIRTY_ERROR = 2,
+};
+int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty);
+int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer);
+int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes,
+ const void *buffer, int wait);
+int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run,
+ u64 vbo, const void *buf, size_t bytes, int sync);
+struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi,
+ const struct runs_tree *run, u64 vbo);
+int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run,
+ u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb);
+int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo,
+ struct NTFS_RECORD_HEADER *rhdr, u32 bytes,
+ struct ntfs_buffers *nb);
+int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo,
+ u32 bytes, struct ntfs_buffers *nb);
+int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr,
+ struct ntfs_buffers *nb, int sync);
+int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run,
+ struct page **pages, u32 nr_pages, u64 vbo, u32 bytes,
+ enum req_op op);
+int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run);
+int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run,
+ u64 vbo, u64 *lbo, u64 *bytes);
+struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST nRec,
+ bool dir);
+extern const u8 s_default_security[0x50];
+bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len);
+int ntfs_security_init(struct ntfs_sb_info *sbi);
+int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id,
+ struct SECURITY_DESCRIPTOR_RELATIVE **sd,
+ size_t *size);
+int ntfs_insert_security(struct ntfs_sb_info *sbi,
+ const struct SECURITY_DESCRIPTOR_RELATIVE *sd,
+ u32 size, __le32 *security_id, bool *inserted);
+int ntfs_reparse_init(struct ntfs_sb_info *sbi);
+int ntfs_objid_init(struct ntfs_sb_info *sbi);
+int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid);
+int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag,
+ const struct MFT_REF *ref);
+int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag,
+ const struct MFT_REF *ref);
+void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim);
+int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim);
+
+/* Globals from index.c */
+int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit);
+void fnd_clear(struct ntfs_fnd *fnd);
+static inline struct ntfs_fnd *fnd_get(void)
+{
+ return kzalloc(sizeof(struct ntfs_fnd), GFP_NOFS);
+}
+static inline void fnd_put(struct ntfs_fnd *fnd)
+{
+ if (fnd) {
+ fnd_clear(fnd);
+ kfree(fnd);
+ }
+}
+void indx_clear(struct ntfs_index *idx);
+int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi,
+ const struct ATTRIB *attr, enum index_mutex_classed type);
+struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni,
+ struct ATTRIB **attr, struct mft_inode **mi);
+int indx_read(struct ntfs_index *idx, struct ntfs_inode *ni, CLST vbn,
+ struct indx_node **node);
+int indx_find(struct ntfs_index *indx, struct ntfs_inode *dir,
+ const struct INDEX_ROOT *root, const void *Key, size_t KeyLen,
+ const void *param, int *diff, struct NTFS_DE **entry,
+ struct ntfs_fnd *fnd);
+int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const struct INDEX_ROOT *root, struct NTFS_DE **entry,
+ struct ntfs_fnd *fnd);
+int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const struct INDEX_ROOT *root, struct NTFS_DE **entry,
+ size_t *off, struct ntfs_fnd *fnd);
+int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const struct NTFS_DE *new_de, const void *param,
+ struct ntfs_fnd *fnd, bool undo);
+int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
+ const void *key, u32 key_len, const void *param);
+int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi,
+ const struct ATTR_FILE_NAME *fname,
+ const struct NTFS_DUP_INFO *dup, int sync);
+
+/* Globals from inode.c */
+struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref,
+ const struct cpu_str *name);
+int ntfs_set_size(struct inode *inode, u64 new_size);
+int reset_log_file(struct inode *inode);
+int ntfs_get_block(struct inode *inode, sector_t vbn,
+ struct buffer_head *bh_result, int create);
+int ntfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, struct page **pagep, void **fsdata);
+int ntfs_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, u32 len, u32 copied, struct page *page,
+ void *fsdata);
+int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc);
+int ntfs_sync_inode(struct inode *inode);
+int ntfs_flush_inodes(struct super_block *sb, struct inode *i1,
+ struct inode *i2);
+int inode_write_data(struct inode *inode, const void *data, size_t bytes);
+struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
+ const struct cpu_str *uni, umode_t mode,
+ dev_t dev, const char *symname, u32 size,
+ struct ntfs_fnd *fnd);
+int ntfs_link_inode(struct inode *inode, struct dentry *dentry);
+int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry);
+void ntfs_evict_inode(struct inode *inode);
+extern const struct inode_operations ntfs_link_inode_operations;
+extern const struct address_space_operations ntfs_aops;
+extern const struct address_space_operations ntfs_aops_cmpr;
+
+/* Globals from name_i.c */
+int fill_name_de(struct ntfs_sb_info *sbi, void *buf, const struct qstr *name,
+ const struct cpu_str *uni);
+struct dentry *ntfs3_get_parent(struct dentry *child);
+
+extern const struct inode_operations ntfs_dir_inode_operations;
+extern const struct inode_operations ntfs_special_inode_operations;
+
+/* Globals from record.c */
+int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi);
+void mi_put(struct mft_inode *mi);
+int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno);
+int mi_read(struct mft_inode *mi, bool is_mft);
+struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr);
+// TODO: id?
+struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr,
+ enum ATTR_TYPE type, const __le16 *name,
+ size_t name_len, const __le16 *id);
+static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec,
+ struct ATTR_LIST_ENTRY *le)
+{
+ return mi_find_attr(rec, NULL, le->type, le_name(le), le->name_len,
+ &le->id);
+}
+int mi_write(struct mft_inode *mi, int wait);
+int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
+ __le16 flags, bool is_mft);
+struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, u32 asize,
+ u16 name_off);
+
+bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi,
+ struct ATTRIB *attr);
+bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes);
+int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr,
+ struct runs_tree *run, CLST len);
+static inline bool mi_is_ref(const struct mft_inode *mi,
+ const struct MFT_REF *ref)
+{
+ if (le32_to_cpu(ref->low) != mi->rno)
+ return false;
+ if (ref->seq != mi->mrec->seq)
+ return false;
+
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+ return le16_to_cpu(ref->high) == (mi->rno >> 32);
+#else
+ return !ref->high;
+#endif
+}
+
+static inline void mi_get_ref(const struct mft_inode *mi, struct MFT_REF *ref)
+{
+ ref->low = cpu_to_le32(mi->rno);
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+ ref->high = cpu_to_le16(mi->rno >> 32);
+#else
+ ref->high = 0;
+#endif
+ ref->seq = mi->mrec->seq;
+}
+
+/* Globals from run.c */
+bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn,
+ CLST *len, size_t *index);
+void run_truncate(struct runs_tree *run, CLST vcn);
+void run_truncate_head(struct runs_tree *run, CLST vcn);
+void run_truncate_around(struct runs_tree *run, CLST vcn);
+bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len,
+ bool is_mft);
+bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len);
+bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len);
+bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn,
+ CLST *lcn, CLST *len);
+bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn);
+
+int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
+ u32 run_buf_size, CLST *packed_vcns);
+int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
+ CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
+ int run_buf_size);
+
+#ifdef NTFS3_CHECK_FREE_CLST
+int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
+ CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
+ int run_buf_size);
+#else
+#define run_unpack_ex run_unpack
+#endif
+int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn);
+int run_clone(const struct runs_tree *run, struct runs_tree *new_run);
+
+/* Globals from super.c */
+void *ntfs_set_shared(void *ptr, u32 bytes);
+void *ntfs_put_shared(void *ptr);
+void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len);
+int ntfs_discard(struct ntfs_sb_info *sbi, CLST Lcn, CLST Len);
+
+/* Globals from bitmap.c*/
+int __init ntfs3_init_bitmap(void);
+void ntfs3_exit_bitmap(void);
+void wnd_close(struct wnd_bitmap *wnd);
+static inline size_t wnd_zeroes(const struct wnd_bitmap *wnd)
+{
+ return wnd->total_zeroes;
+}
+int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits);
+int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+
+/* Possible values for 'flags' 'wnd_find'. */
+#define BITMAP_FIND_MARK_AS_USED 0x01
+#define BITMAP_FIND_FULL 0x02
+size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint,
+ size_t flags, size_t *allocated);
+int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits);
+void wnd_zone_set(struct wnd_bitmap *wnd, size_t Lcn, size_t Len);
+int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range);
+
+/* Globals from upcase.c */
+int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2,
+ const u16 *upcase, bool bothcase);
+int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2,
+ const u16 *upcase, bool bothcase);
+
+/* globals from xattr.c */
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu);
+int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type);
+int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct inode *dir);
+#else
+#define ntfs_get_acl NULL
+#define ntfs_set_acl NULL
+#endif
+
+int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode);
+int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask);
+ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern const struct xattr_handler *ntfs_xattr_handlers[];
+
+int ntfs_save_wsl_perm(struct inode *inode);
+void ntfs_get_wsl_perm(struct inode *inode);
+
+/* globals from lznt.c */
+struct lznt *get_lznt_ctx(int level);
+size_t compress_lznt(const void *uncompressed, size_t uncompressed_size,
+ void *compressed, size_t compressed_size,
+ struct lznt *ctx);
+ssize_t decompress_lznt(const void *compressed, size_t compressed_size,
+ void *uncompressed, size_t uncompressed_size);
+
+static inline bool is_ntfs3(struct ntfs_sb_info *sbi)
+{
+ return sbi->volume.major_ver >= 3;
+}
+
+/* (sb->s_flags & SB_ACTIVE) */
+static inline bool is_mounted(struct ntfs_sb_info *sbi)
+{
+ return !!sbi->sb->s_root;
+}
+
+static inline bool ntfs_is_meta_file(struct ntfs_sb_info *sbi, CLST rno)
+{
+ return rno < MFT_REC_FREE || rno == sbi->objid_no ||
+ rno == sbi->quota_no || rno == sbi->reparse_no ||
+ rno == sbi->usn_jrnl_no;
+}
+
+static inline void ntfs_unmap_page(struct page *page)
+{
+ kunmap(page);
+ put_page(page);
+}
+
+static inline struct page *ntfs_map_page(struct address_space *mapping,
+ unsigned long index)
+{
+ struct page *page = read_mapping_page(mapping, index, NULL);
+
+ if (!IS_ERR(page))
+ kmap(page);
+ return page;
+}
+
+static inline size_t wnd_zone_bit(const struct wnd_bitmap *wnd)
+{
+ return wnd->zone_bit;
+}
+
+static inline size_t wnd_zone_len(const struct wnd_bitmap *wnd)
+{
+ return wnd->zone_end - wnd->zone_bit;
+}
+
+static inline void run_init(struct runs_tree *run)
+{
+ run->runs = NULL;
+ run->count = 0;
+ run->allocated = 0;
+}
+
+static inline struct runs_tree *run_alloc(void)
+{
+ return kzalloc(sizeof(struct runs_tree), GFP_NOFS);
+}
+
+static inline void run_close(struct runs_tree *run)
+{
+ kvfree(run->runs);
+ memset(run, 0, sizeof(*run));
+}
+
+static inline void run_free(struct runs_tree *run)
+{
+ if (run) {
+ kvfree(run->runs);
+ kfree(run);
+ }
+}
+
+static inline bool run_is_empty(struct runs_tree *run)
+{
+ return !run->count;
+}
+
+/* NTFS uses quad aligned bitmaps. */
+static inline size_t bitmap_size(size_t bits)
+{
+ return ALIGN((bits + 7) >> 3, 8);
+}
+
+#define _100ns2seconds 10000000
+#define SecondsToStartOf1970 0x00000002B6109100
+
+#define NTFS_TIME_GRAN 100
+
+/*
+ * kernel2nt - Converts in-memory kernel timestamp into nt time.
+ */
+static inline __le64 kernel2nt(const struct timespec64 *ts)
+{
+ // 10^7 units of 100 nanoseconds one second
+ return cpu_to_le64(_100ns2seconds *
+ (ts->tv_sec + SecondsToStartOf1970) +
+ ts->tv_nsec / NTFS_TIME_GRAN);
+}
+
+/*
+ * nt2kernel - Converts on-disk nt time into kernel timestamp.
+ */
+static inline void nt2kernel(const __le64 tm, struct timespec64 *ts)
+{
+ u64 t = le64_to_cpu(tm) - _100ns2seconds * SecondsToStartOf1970;
+
+ // WARNING: do_div changes its first argument(!)
+ ts->tv_nsec = do_div(t, _100ns2seconds) * 100;
+ ts->tv_sec = t;
+}
+
+static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+/*
+ * ntfs_up_cluster - Align up on cluster boundary.
+ */
+static inline u64 ntfs_up_cluster(const struct ntfs_sb_info *sbi, u64 size)
+{
+ return (size + sbi->cluster_mask) & sbi->cluster_mask_inv;
+}
+
+/*
+ * ntfs_up_block - Align up on cluster boundary.
+ */
+static inline u64 ntfs_up_block(const struct super_block *sb, u64 size)
+{
+ return (size + sb->s_blocksize - 1) & ~(u64)(sb->s_blocksize - 1);
+}
+
+static inline CLST bytes_to_cluster(const struct ntfs_sb_info *sbi, u64 size)
+{
+ return (size + sbi->cluster_mask) >> sbi->cluster_bits;
+}
+
+static inline u64 bytes_to_block(const struct super_block *sb, u64 size)
+{
+ return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+}
+
+static inline struct buffer_head *ntfs_bread(struct super_block *sb,
+ sector_t block)
+{
+ struct buffer_head *bh = sb_bread(sb, block);
+
+ if (bh)
+ return bh;
+
+ ntfs_err(sb, "failed to read volume at offset 0x%llx",
+ (u64)block << sb->s_blocksize_bits);
+ return NULL;
+}
+
+static inline struct ntfs_inode *ntfs_i(struct inode *inode)
+{
+ return container_of(inode, struct ntfs_inode, vfs_inode);
+}
+
+static inline bool is_compressed(const struct ntfs_inode *ni)
+{
+ return (ni->std_fa & FILE_ATTRIBUTE_COMPRESSED) ||
+ (ni->ni_flags & NI_FLAG_COMPRESSED_MASK);
+}
+
+static inline int ni_ext_compress_bits(const struct ntfs_inode *ni)
+{
+ return 0xb + (ni->ni_flags & NI_FLAG_COMPRESSED_MASK);
+}
+
+/* Bits - 0xc, 0xd, 0xe, 0xf, 0x10 */
+static inline void ni_set_ext_compress_bits(struct ntfs_inode *ni, u8 bits)
+{
+ ni->ni_flags |= (bits - 0xb) & NI_FLAG_COMPRESSED_MASK;
+}
+
+static inline bool is_dedup(const struct ntfs_inode *ni)
+{
+ return ni->ni_flags & NI_FLAG_DEDUPLICATED;
+}
+
+static inline bool is_encrypted(const struct ntfs_inode *ni)
+{
+ return ni->std_fa & FILE_ATTRIBUTE_ENCRYPTED;
+}
+
+static inline bool is_sparsed(const struct ntfs_inode *ni)
+{
+ return ni->std_fa & FILE_ATTRIBUTE_SPARSE_FILE;
+}
+
+static inline int is_resident(struct ntfs_inode *ni)
+{
+ return ni->ni_flags & NI_FLAG_RESIDENT;
+}
+
+static inline void le16_sub_cpu(__le16 *var, u16 val)
+{
+ *var = cpu_to_le16(le16_to_cpu(*var) - val);
+}
+
+static inline void le32_sub_cpu(__le32 *var, u32 val)
+{
+ *var = cpu_to_le32(le32_to_cpu(*var) - val);
+}
+
+static inline void nb_put(struct ntfs_buffers *nb)
+{
+ u32 i, nbufs = nb->nbufs;
+
+ if (!nbufs)
+ return;
+
+ for (i = 0; i < nbufs; i++)
+ put_bh(nb->bh[i]);
+ nb->nbufs = 0;
+}
+
+static inline void put_indx_node(struct indx_node *in)
+{
+ if (!in)
+ return;
+
+ kfree(in->index);
+ nb_put(&in->nb);
+ kfree(in);
+}
+
+static inline void mi_clear(struct mft_inode *mi)
+{
+ nb_put(&mi->nb);
+ kfree(mi->mrec);
+ mi->mrec = NULL;
+}
+
+static inline void ni_lock(struct ntfs_inode *ni)
+{
+ mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_NORMAL);
+}
+
+static inline void ni_lock_dir(struct ntfs_inode *ni)
+{
+ mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT);
+}
+
+static inline void ni_unlock(struct ntfs_inode *ni)
+{
+ mutex_unlock(&ni->ni_lock);
+}
+
+static inline int ni_trylock(struct ntfs_inode *ni)
+{
+ return mutex_trylock(&ni->ni_lock);
+}
+
+static inline int attr_load_runs_attr(struct ntfs_inode *ni,
+ struct ATTRIB *attr,
+ struct runs_tree *run, CLST vcn)
+{
+ return attr_load_runs_vcn(ni, attr->type, attr_name(attr),
+ attr->name_len, run, vcn);
+}
+
+static inline void le64_sub_cpu(__le64 *var, u64 val)
+{
+ *var = cpu_to_le64(le64_to_cpu(*var) - val);
+}
+
+#endif /* _LINUX_NTFS3_NTFS_FS_H */
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
new file mode 100644
index 000000000..ba336c728
--- /dev/null
+++ b/fs/ntfs3/record.c
@@ -0,0 +1,595 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/fs.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+static inline int compare_attr(const struct ATTRIB *left, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len,
+ const u16 *upcase)
+{
+ /* First, compare the type codes. */
+ int diff = le32_to_cpu(left->type) - le32_to_cpu(type);
+
+ if (diff)
+ return diff;
+
+ /* They have the same type code, so we have to compare the names. */
+ return ntfs_cmp_names(attr_name(left), left->name_len, name, name_len,
+ upcase, true);
+}
+
+/*
+ * mi_new_attt_id
+ *
+ * Return: Unused attribute id that is less than mrec->next_attr_id.
+ */
+static __le16 mi_new_attt_id(struct mft_inode *mi)
+{
+ u16 free_id, max_id, t16;
+ struct MFT_REC *rec = mi->mrec;
+ struct ATTRIB *attr;
+ __le16 id;
+
+ id = rec->next_attr_id;
+ free_id = le16_to_cpu(id);
+ if (free_id < 0x7FFF) {
+ rec->next_attr_id = cpu_to_le16(free_id + 1);
+ return id;
+ }
+
+ /* One record can store up to 1024/24 ~= 42 attributes. */
+ free_id = 0;
+ max_id = 0;
+
+ attr = NULL;
+
+ for (;;) {
+ attr = mi_enum_attr(mi, attr);
+ if (!attr) {
+ rec->next_attr_id = cpu_to_le16(max_id + 1);
+ mi->dirty = true;
+ return cpu_to_le16(free_id);
+ }
+
+ t16 = le16_to_cpu(attr->id);
+ if (t16 == free_id) {
+ free_id += 1;
+ attr = NULL;
+ } else if (max_id < t16)
+ max_id = t16;
+ }
+}
+
+int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi)
+{
+ int err;
+ struct mft_inode *m = kzalloc(sizeof(struct mft_inode), GFP_NOFS);
+
+ if (!m)
+ return -ENOMEM;
+
+ err = mi_init(m, sbi, rno);
+ if (err) {
+ kfree(m);
+ return err;
+ }
+
+ err = mi_read(m, false);
+ if (err) {
+ mi_put(m);
+ return err;
+ }
+
+ *mi = m;
+ return 0;
+}
+
+void mi_put(struct mft_inode *mi)
+{
+ mi_clear(mi);
+ kfree(mi);
+}
+
+int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno)
+{
+ mi->sbi = sbi;
+ mi->rno = rno;
+ mi->mrec = kmalloc(sbi->record_size, GFP_NOFS);
+ if (!mi->mrec)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * mi_read - Read MFT data.
+ */
+int mi_read(struct mft_inode *mi, bool is_mft)
+{
+ int err;
+ struct MFT_REC *rec = mi->mrec;
+ struct ntfs_sb_info *sbi = mi->sbi;
+ u32 bpr = sbi->record_size;
+ u64 vbo = (u64)mi->rno << sbi->record_bits;
+ struct ntfs_inode *mft_ni = sbi->mft.ni;
+ struct runs_tree *run = mft_ni ? &mft_ni->file.run : NULL;
+ struct rw_semaphore *rw_lock = NULL;
+
+ if (is_mounted(sbi)) {
+ if (!is_mft && mft_ni) {
+ rw_lock = &mft_ni->file.run_lock;
+ down_read(rw_lock);
+ }
+ }
+
+ err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb);
+ if (rw_lock)
+ up_read(rw_lock);
+ if (!err)
+ goto ok;
+
+ if (err == -E_NTFS_FIXUP) {
+ mi->dirty = true;
+ goto ok;
+ }
+
+ if (err != -ENOENT)
+ goto out;
+
+ if (rw_lock) {
+ ni_lock(mft_ni);
+ down_write(rw_lock);
+ }
+ err = attr_load_runs_vcn(mft_ni, ATTR_DATA, NULL, 0, run,
+ vbo >> sbi->cluster_bits);
+ if (rw_lock) {
+ up_write(rw_lock);
+ ni_unlock(mft_ni);
+ }
+ if (err)
+ goto out;
+
+ if (rw_lock)
+ down_read(rw_lock);
+ err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb);
+ if (rw_lock)
+ up_read(rw_lock);
+
+ if (err == -E_NTFS_FIXUP) {
+ mi->dirty = true;
+ goto ok;
+ }
+ if (err)
+ goto out;
+
+ok:
+ /* Check field 'total' only here. */
+ if (le32_to_cpu(rec->total) != bpr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ return 0;
+
+out:
+ if (err == -E_NTFS_CORRUPT) {
+ ntfs_err(sbi->sb, "mft corrupted");
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
+{
+ const struct MFT_REC *rec = mi->mrec;
+ u32 used = le32_to_cpu(rec->used);
+ u32 t32, off, asize;
+ u16 t16;
+
+ if (!attr) {
+ u32 total = le32_to_cpu(rec->total);
+
+ off = le16_to_cpu(rec->attr_off);
+
+ if (used > total)
+ return NULL;
+
+ if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 ||
+ !IS_ALIGNED(off, 4)) {
+ return NULL;
+ }
+
+ /* Skip non-resident records. */
+ if (!is_rec_inuse(rec))
+ return NULL;
+
+ attr = Add2Ptr(rec, off);
+ } else {
+ /* Check if input attr inside record. */
+ off = PtrOffset(rec, attr);
+ if (off >= used)
+ return NULL;
+
+ asize = le32_to_cpu(attr->size);
+ if (asize < SIZEOF_RESIDENT) {
+ /* Impossible 'cause we should not return such attribute. */
+ return NULL;
+ }
+
+ if (off + asize < off) {
+ /* overflow check */
+ return NULL;
+ }
+
+ attr = Add2Ptr(attr, asize);
+ off += asize;
+ }
+
+ asize = le32_to_cpu(attr->size);
+
+ /* Can we use the first field (attr->type). */
+ if (off + 8 > used) {
+ static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8);
+ return NULL;
+ }
+
+ if (attr->type == ATTR_END) {
+ /* End of enumeration. */
+ return NULL;
+ }
+
+ /* 0x100 is last known attribute for now. */
+ t32 = le32_to_cpu(attr->type);
+ if ((t32 & 0xf) || (t32 > 0x100))
+ return NULL;
+
+ /* Check boundary. */
+ if (off + asize > used)
+ return NULL;
+
+ /* Check size of attribute. */
+ if (!attr->non_res) {
+ if (asize < SIZEOF_RESIDENT)
+ return NULL;
+
+ t16 = le16_to_cpu(attr->res.data_off);
+
+ if (t16 > asize)
+ return NULL;
+
+ t32 = le32_to_cpu(attr->res.data_size);
+ if (t16 + t32 > asize)
+ return NULL;
+
+ if (attr->name_len &&
+ le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len > t16) {
+ return NULL;
+ }
+
+ return attr;
+ }
+
+ /* Check some nonresident fields. */
+ if (attr->name_len &&
+ le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len >
+ le16_to_cpu(attr->nres.run_off)) {
+ return NULL;
+ }
+
+ if (attr->nres.svcn || !is_attr_ext(attr)) {
+ if (asize + 8 < SIZEOF_NONRESIDENT)
+ return NULL;
+
+ if (attr->nres.c_unit)
+ return NULL;
+ } else if (asize + 8 < SIZEOF_NONRESIDENT_EX)
+ return NULL;
+
+ return attr;
+}
+
+/*
+ * mi_find_attr - Find the attribute by type and name and id.
+ */
+struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr,
+ enum ATTR_TYPE type, const __le16 *name,
+ size_t name_len, const __le16 *id)
+{
+ u32 type_in = le32_to_cpu(type);
+ u32 atype;
+
+next_attr:
+ attr = mi_enum_attr(mi, attr);
+ if (!attr)
+ return NULL;
+
+ atype = le32_to_cpu(attr->type);
+ if (atype > type_in)
+ return NULL;
+
+ if (atype < type_in)
+ goto next_attr;
+
+ if (attr->name_len != name_len)
+ goto next_attr;
+
+ if (name_len && memcmp(attr_name(attr), name, name_len * sizeof(short)))
+ goto next_attr;
+
+ if (id && *id != attr->id)
+ goto next_attr;
+
+ return attr;
+}
+
+int mi_write(struct mft_inode *mi, int wait)
+{
+ struct MFT_REC *rec;
+ int err;
+ struct ntfs_sb_info *sbi;
+
+ if (!mi->dirty)
+ return 0;
+
+ sbi = mi->sbi;
+ rec = mi->mrec;
+
+ err = ntfs_write_bh(sbi, &rec->rhdr, &mi->nb, wait);
+ if (err)
+ return err;
+
+ if (mi->rno < sbi->mft.recs_mirr)
+ sbi->flags |= NTFS_FLAGS_MFTMIRR;
+
+ mi->dirty = false;
+
+ return 0;
+}
+
+int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
+ __le16 flags, bool is_mft)
+{
+ int err;
+ u16 seq = 1;
+ struct MFT_REC *rec;
+ u64 vbo = (u64)rno << sbi->record_bits;
+
+ err = mi_init(mi, sbi, rno);
+ if (err)
+ return err;
+
+ rec = mi->mrec;
+
+ if (rno == MFT_REC_MFT) {
+ ;
+ } else if (rno < MFT_REC_FREE) {
+ seq = rno;
+ } else if (rno >= sbi->mft.used) {
+ ;
+ } else if (mi_read(mi, is_mft)) {
+ ;
+ } else if (rec->rhdr.sign == NTFS_FILE_SIGNATURE) {
+ /* Record is reused. Update its sequence number. */
+ seq = le16_to_cpu(rec->seq) + 1;
+ if (!seq)
+ seq = 1;
+ }
+
+ memcpy(rec, sbi->new_rec, sbi->record_size);
+
+ rec->seq = cpu_to_le16(seq);
+ rec->flags = RECORD_FLAG_IN_USE | flags;
+
+ mi->dirty = true;
+
+ if (!mi->nb.nbufs) {
+ struct ntfs_inode *ni = sbi->mft.ni;
+ bool lock = false;
+
+ if (is_mounted(sbi) && !is_mft) {
+ down_read(&ni->file.run_lock);
+ lock = true;
+ }
+
+ err = ntfs_get_bh(sbi, &ni->file.run, vbo, sbi->record_size,
+ &mi->nb);
+ if (lock)
+ up_read(&ni->file.run_lock);
+ }
+
+ return err;
+}
+
+/*
+ * mi_insert_attr - Reserve space for new attribute.
+ *
+ * Return: Not full constructed attribute or NULL if not possible to create.
+ */
+struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type,
+ const __le16 *name, u8 name_len, u32 asize,
+ u16 name_off)
+{
+ size_t tail;
+ struct ATTRIB *attr;
+ __le16 id;
+ struct MFT_REC *rec = mi->mrec;
+ struct ntfs_sb_info *sbi = mi->sbi;
+ u32 used = le32_to_cpu(rec->used);
+ const u16 *upcase = sbi->upcase;
+ int diff;
+
+ /* Can we insert mi attribute? */
+ if (used + asize > mi->sbi->record_size)
+ return NULL;
+
+ /*
+ * Scan through the list of attributes to find the point
+ * at which we should insert it.
+ */
+ attr = NULL;
+ while ((attr = mi_enum_attr(mi, attr))) {
+ diff = compare_attr(attr, type, name, name_len, upcase);
+
+ if (diff < 0)
+ continue;
+
+ if (!diff && !is_attr_indexed(attr))
+ return NULL;
+ break;
+ }
+
+ if (!attr) {
+ tail = 8; /* Not used, just to suppress warning. */
+ attr = Add2Ptr(rec, used - 8);
+ } else {
+ tail = used - PtrOffset(rec, attr);
+ }
+
+ id = mi_new_attt_id(mi);
+
+ memmove(Add2Ptr(attr, asize), attr, tail);
+ memset(attr, 0, asize);
+
+ attr->type = type;
+ attr->size = cpu_to_le32(asize);
+ attr->name_len = name_len;
+ attr->name_off = cpu_to_le16(name_off);
+ attr->id = id;
+
+ memmove(Add2Ptr(attr, name_off), name, name_len * sizeof(short));
+ rec->used = cpu_to_le32(used + asize);
+
+ mi->dirty = true;
+
+ return attr;
+}
+
+/*
+ * mi_remove_attr - Remove the attribute from record.
+ *
+ * NOTE: The source attr will point to next attribute.
+ */
+bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi,
+ struct ATTRIB *attr)
+{
+ struct MFT_REC *rec = mi->mrec;
+ u32 aoff = PtrOffset(rec, attr);
+ u32 used = le32_to_cpu(rec->used);
+ u32 asize = le32_to_cpu(attr->size);
+
+ if (aoff + asize > used)
+ return false;
+
+ if (ni && is_attr_indexed(attr)) {
+ le16_add_cpu(&ni->mi.mrec->hard_links, -1);
+ ni->mi.dirty = true;
+ }
+
+ used -= asize;
+ memmove(attr, Add2Ptr(attr, asize), used - aoff);
+ rec->used = cpu_to_le32(used);
+ mi->dirty = true;
+
+ return true;
+}
+
+/* bytes = "new attribute size" - "old attribute size" */
+bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes)
+{
+ struct MFT_REC *rec = mi->mrec;
+ u32 aoff = PtrOffset(rec, attr);
+ u32 total, used = le32_to_cpu(rec->used);
+ u32 nsize, asize = le32_to_cpu(attr->size);
+ u32 rsize = le32_to_cpu(attr->res.data_size);
+ int tail = (int)(used - aoff - asize);
+ int dsize;
+ char *next;
+
+ if (tail < 0 || aoff >= used)
+ return false;
+
+ if (!bytes)
+ return true;
+
+ total = le32_to_cpu(rec->total);
+ next = Add2Ptr(attr, asize);
+
+ if (bytes > 0) {
+ dsize = ALIGN(bytes, 8);
+ if (used + dsize > total)
+ return false;
+ nsize = asize + dsize;
+ /* Move tail */
+ memmove(next + dsize, next, tail);
+ memset(next, 0, dsize);
+ used += dsize;
+ rsize += dsize;
+ } else {
+ dsize = ALIGN(-bytes, 8);
+ if (dsize > asize)
+ return false;
+ nsize = asize - dsize;
+ memmove(next - dsize, next, tail);
+ used -= dsize;
+ rsize -= dsize;
+ }
+
+ rec->used = cpu_to_le32(used);
+ attr->size = cpu_to_le32(nsize);
+ if (!attr->non_res)
+ attr->res.data_size = cpu_to_le32(rsize);
+ mi->dirty = true;
+
+ return true;
+}
+
+int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr,
+ struct runs_tree *run, CLST len)
+{
+ int err = 0;
+ struct ntfs_sb_info *sbi = mi->sbi;
+ u32 new_run_size;
+ CLST plen;
+ struct MFT_REC *rec = mi->mrec;
+ CLST svcn = le64_to_cpu(attr->nres.svcn);
+ u32 used = le32_to_cpu(rec->used);
+ u32 aoff = PtrOffset(rec, attr);
+ u32 asize = le32_to_cpu(attr->size);
+ char *next = Add2Ptr(attr, asize);
+ u16 run_off = le16_to_cpu(attr->nres.run_off);
+ u32 run_size = asize - run_off;
+ u32 tail = used - aoff - asize;
+ u32 dsize = sbi->record_size - used;
+
+ /* Make a maximum gap in current record. */
+ memmove(next + dsize, next, tail);
+
+ /* Pack as much as possible. */
+ err = run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size + dsize,
+ &plen);
+ if (err < 0) {
+ memmove(next, next + dsize, tail);
+ return err;
+ }
+
+ new_run_size = ALIGN(err, 8);
+
+ memmove(next + new_run_size - run_size, next + dsize, tail);
+
+ attr->size = cpu_to_le32(asize + new_run_size - run_size);
+ attr->nres.evcn = cpu_to_le64(svcn + plen - 1);
+ rec->used = cpu_to_le32(used + new_run_size - run_size);
+ mi->dirty = true;
+
+ return 0;
+}
diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c
new file mode 100644
index 000000000..12d8682f3
--- /dev/null
+++ b/fs/ntfs3/run.c
@@ -0,0 +1,1186 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ * TODO: try to use extents tree (instead of array)
+ */
+
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/log2.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+/* runs_tree is a continues memory. Try to avoid big size. */
+#define NTFS3_RUN_MAX_BYTES 0x10000
+
+struct ntfs_run {
+ CLST vcn; /* Virtual cluster number. */
+ CLST len; /* Length in clusters. */
+ CLST lcn; /* Logical cluster number. */
+};
+
+/*
+ * run_lookup - Lookup the index of a MCB entry that is first <= vcn.
+ *
+ * Case of success it will return non-zero value and set
+ * @index parameter to index of entry been found.
+ * Case of entry missing from list 'index' will be set to
+ * point to insertion position for the entry question.
+ */
+static bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index)
+{
+ size_t min_idx, max_idx, mid_idx;
+ struct ntfs_run *r;
+
+ if (!run->count) {
+ *index = 0;
+ return false;
+ }
+
+ min_idx = 0;
+ max_idx = run->count - 1;
+
+ /* Check boundary cases specially, 'cause they cover the often requests. */
+ r = run->runs;
+ if (vcn < r->vcn) {
+ *index = 0;
+ return false;
+ }
+
+ if (vcn < r->vcn + r->len) {
+ *index = 0;
+ return true;
+ }
+
+ r += max_idx;
+ if (vcn >= r->vcn + r->len) {
+ *index = run->count;
+ return false;
+ }
+
+ if (vcn >= r->vcn) {
+ *index = max_idx;
+ return true;
+ }
+
+ do {
+ mid_idx = min_idx + ((max_idx - min_idx) >> 1);
+ r = run->runs + mid_idx;
+
+ if (vcn < r->vcn) {
+ max_idx = mid_idx - 1;
+ if (!mid_idx)
+ break;
+ } else if (vcn >= r->vcn + r->len) {
+ min_idx = mid_idx + 1;
+ } else {
+ *index = mid_idx;
+ return true;
+ }
+ } while (min_idx <= max_idx);
+
+ *index = max_idx + 1;
+ return false;
+}
+
+/*
+ * run_consolidate - Consolidate runs starting from a given one.
+ */
+static void run_consolidate(struct runs_tree *run, size_t index)
+{
+ size_t i;
+ struct ntfs_run *r = run->runs + index;
+
+ while (index + 1 < run->count) {
+ /*
+ * I should merge current run with next
+ * if start of the next run lies inside one being tested.
+ */
+ struct ntfs_run *n = r + 1;
+ CLST end = r->vcn + r->len;
+ CLST dl;
+
+ /* Stop if runs are not aligned one to another. */
+ if (n->vcn > end)
+ break;
+
+ dl = end - n->vcn;
+
+ /*
+ * If range at index overlaps with next one
+ * then I will either adjust it's start position
+ * or (if completely matches) dust remove one from the list.
+ */
+ if (dl > 0) {
+ if (n->len <= dl)
+ goto remove_next_range;
+
+ n->len -= dl;
+ n->vcn += dl;
+ if (n->lcn != SPARSE_LCN)
+ n->lcn += dl;
+ dl = 0;
+ }
+
+ /*
+ * Stop if sparse mode does not match
+ * both current and next runs.
+ */
+ if ((n->lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) {
+ index += 1;
+ r = n;
+ continue;
+ }
+
+ /*
+ * Check if volume block
+ * of a next run lcn does not match
+ * last volume block of the current run.
+ */
+ if (n->lcn != SPARSE_LCN && n->lcn != r->lcn + r->len)
+ break;
+
+ /*
+ * Next and current are siblings.
+ * Eat/join.
+ */
+ r->len += n->len - dl;
+
+remove_next_range:
+ i = run->count - (index + 1);
+ if (i > 1)
+ memmove(n, n + 1, sizeof(*n) * (i - 1));
+
+ run->count -= 1;
+ }
+}
+
+/*
+ * run_is_mapped_full
+ *
+ * Return: True if range [svcn - evcn] is mapped.
+ */
+bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn)
+{
+ size_t i;
+ const struct ntfs_run *r, *end;
+ CLST next_vcn;
+
+ if (!run_lookup(run, svcn, &i))
+ return false;
+
+ end = run->runs + run->count;
+ r = run->runs + i;
+
+ for (;;) {
+ next_vcn = r->vcn + r->len;
+ if (next_vcn > evcn)
+ return true;
+
+ if (++r >= end)
+ return false;
+
+ if (r->vcn != next_vcn)
+ return false;
+ }
+}
+
+bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn,
+ CLST *len, size_t *index)
+{
+ size_t idx;
+ CLST gap;
+ struct ntfs_run *r;
+
+ /* Fail immediately if nrun was not touched yet. */
+ if (!run->runs)
+ return false;
+
+ if (!run_lookup(run, vcn, &idx))
+ return false;
+
+ r = run->runs + idx;
+
+ if (vcn >= r->vcn + r->len)
+ return false;
+
+ gap = vcn - r->vcn;
+ if (r->len <= gap)
+ return false;
+
+ *lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + gap);
+
+ if (len)
+ *len = r->len - gap;
+ if (index)
+ *index = idx;
+
+ return true;
+}
+
+/*
+ * run_truncate_head - Decommit the range before vcn.
+ */
+void run_truncate_head(struct runs_tree *run, CLST vcn)
+{
+ size_t index;
+ struct ntfs_run *r;
+
+ if (run_lookup(run, vcn, &index)) {
+ r = run->runs + index;
+
+ if (vcn > r->vcn) {
+ CLST dlen = vcn - r->vcn;
+
+ r->vcn = vcn;
+ r->len -= dlen;
+ if (r->lcn != SPARSE_LCN)
+ r->lcn += dlen;
+ }
+
+ if (!index)
+ return;
+ }
+ r = run->runs;
+ memmove(r, r + index, sizeof(*r) * (run->count - index));
+
+ run->count -= index;
+
+ if (!run->count) {
+ kvfree(run->runs);
+ run->runs = NULL;
+ run->allocated = 0;
+ }
+}
+
+/*
+ * run_truncate - Decommit the range after vcn.
+ */
+void run_truncate(struct runs_tree *run, CLST vcn)
+{
+ size_t index;
+
+ /*
+ * If I hit the range then
+ * I have to truncate one.
+ * If range to be truncated is becoming empty
+ * then it will entirely be removed.
+ */
+ if (run_lookup(run, vcn, &index)) {
+ struct ntfs_run *r = run->runs + index;
+
+ r->len = vcn - r->vcn;
+
+ if (r->len > 0)
+ index += 1;
+ }
+
+ /*
+ * At this point 'index' is set to position that
+ * should be thrown away (including index itself)
+ * Simple one - just set the limit.
+ */
+ run->count = index;
+
+ /* Do not reallocate array 'runs'. Only free if possible. */
+ if (!index) {
+ kvfree(run->runs);
+ run->runs = NULL;
+ run->allocated = 0;
+ }
+}
+
+/*
+ * run_truncate_around - Trim head and tail if necessary.
+ */
+void run_truncate_around(struct runs_tree *run, CLST vcn)
+{
+ run_truncate_head(run, vcn);
+
+ if (run->count >= NTFS3_RUN_MAX_BYTES / sizeof(struct ntfs_run) / 2)
+ run_truncate(run, (run->runs + (run->count >> 1))->vcn);
+}
+
+/*
+ * run_add_entry
+ *
+ * Sets location to known state.
+ * Run to be added may overlap with existing location.
+ *
+ * Return: false if of memory.
+ */
+bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len,
+ bool is_mft)
+{
+ size_t used, index;
+ struct ntfs_run *r;
+ bool inrange;
+ CLST tail_vcn = 0, tail_len = 0, tail_lcn = 0;
+ bool should_add_tail = false;
+
+ /*
+ * Lookup the insertion point.
+ *
+ * Execute bsearch for the entry containing
+ * start position question.
+ */
+ inrange = run_lookup(run, vcn, &index);
+
+ /*
+ * Shortcut here would be case of
+ * range not been found but one been added
+ * continues previous run.
+ * This case I can directly make use of
+ * existing range as my start point.
+ */
+ if (!inrange && index > 0) {
+ struct ntfs_run *t = run->runs + index - 1;
+
+ if (t->vcn + t->len == vcn &&
+ (t->lcn == SPARSE_LCN) == (lcn == SPARSE_LCN) &&
+ (lcn == SPARSE_LCN || lcn == t->lcn + t->len)) {
+ inrange = true;
+ index -= 1;
+ }
+ }
+
+ /*
+ * At this point 'index' either points to the range
+ * containing start position or to the insertion position
+ * for a new range.
+ * So first let's check if range I'm probing is here already.
+ */
+ if (!inrange) {
+requires_new_range:
+ /*
+ * Range was not found.
+ * Insert at position 'index'
+ */
+ used = run->count * sizeof(struct ntfs_run);
+
+ /*
+ * Check allocated space.
+ * If one is not enough to get one more entry
+ * then it will be reallocated.
+ */
+ if (run->allocated < used + sizeof(struct ntfs_run)) {
+ size_t bytes;
+ struct ntfs_run *new_ptr;
+
+ /* Use power of 2 for 'bytes'. */
+ if (!used) {
+ bytes = 64;
+ } else if (used <= 16 * PAGE_SIZE) {
+ if (is_power_of_2(run->allocated))
+ bytes = run->allocated << 1;
+ else
+ bytes = (size_t)1
+ << (2 + blksize_bits(used));
+ } else {
+ bytes = run->allocated + (16 * PAGE_SIZE);
+ }
+
+ WARN_ON(!is_mft && bytes > NTFS3_RUN_MAX_BYTES);
+
+ new_ptr = kvmalloc(bytes, GFP_KERNEL);
+
+ if (!new_ptr)
+ return false;
+
+ r = new_ptr + index;
+ memcpy(new_ptr, run->runs,
+ index * sizeof(struct ntfs_run));
+ memcpy(r + 1, run->runs + index,
+ sizeof(struct ntfs_run) * (run->count - index));
+
+ kvfree(run->runs);
+ run->runs = new_ptr;
+ run->allocated = bytes;
+
+ } else {
+ size_t i = run->count - index;
+
+ r = run->runs + index;
+
+ /* memmove appears to be a bottle neck here... */
+ if (i > 0)
+ memmove(r + 1, r, sizeof(struct ntfs_run) * i);
+ }
+
+ r->vcn = vcn;
+ r->lcn = lcn;
+ r->len = len;
+ run->count += 1;
+ } else {
+ r = run->runs + index;
+
+ /*
+ * If one of ranges was not allocated then we
+ * have to split location we just matched and
+ * insert current one.
+ * A common case this requires tail to be reinserted
+ * a recursive call.
+ */
+ if (((lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) ||
+ (lcn != SPARSE_LCN && lcn != r->lcn + (vcn - r->vcn))) {
+ CLST to_eat = vcn - r->vcn;
+ CLST Tovcn = to_eat + len;
+
+ should_add_tail = Tovcn < r->len;
+
+ if (should_add_tail) {
+ tail_lcn = r->lcn == SPARSE_LCN
+ ? SPARSE_LCN
+ : (r->lcn + Tovcn);
+ tail_vcn = r->vcn + Tovcn;
+ tail_len = r->len - Tovcn;
+ }
+
+ if (to_eat > 0) {
+ r->len = to_eat;
+ inrange = false;
+ index += 1;
+ goto requires_new_range;
+ }
+
+ /* lcn should match one were going to add. */
+ r->lcn = lcn;
+ }
+
+ /*
+ * If existing range fits then were done.
+ * Otherwise extend found one and fall back to range jocode.
+ */
+ if (r->vcn + r->len < vcn + len)
+ r->len += len - ((r->vcn + r->len) - vcn);
+ }
+
+ /*
+ * And normalize it starting from insertion point.
+ * It's possible that no insertion needed case if
+ * start point lies within the range of an entry
+ * that 'index' points to.
+ */
+ if (inrange && index > 0)
+ index -= 1;
+ run_consolidate(run, index);
+ run_consolidate(run, index + 1);
+
+ /*
+ * A special case.
+ * We have to add extra range a tail.
+ */
+ if (should_add_tail &&
+ !run_add_entry(run, tail_vcn, tail_lcn, tail_len, is_mft))
+ return false;
+
+ return true;
+}
+
+/* run_collapse_range
+ *
+ * Helper for attr_collapse_range(),
+ * which is helper for fallocate(collapse_range).
+ */
+bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len)
+{
+ size_t index, eat;
+ struct ntfs_run *r, *e, *eat_start, *eat_end;
+ CLST end;
+
+ if (WARN_ON(!run_lookup(run, vcn, &index)))
+ return true; /* Should never be here. */
+
+ e = run->runs + run->count;
+ r = run->runs + index;
+ end = vcn + len;
+
+ if (vcn > r->vcn) {
+ if (r->vcn + r->len <= end) {
+ /* Collapse tail of run .*/
+ r->len = vcn - r->vcn;
+ } else if (r->lcn == SPARSE_LCN) {
+ /* Collapse a middle part of sparsed run. */
+ r->len -= len;
+ } else {
+ /* Collapse a middle part of normal run, split. */
+ if (!run_add_entry(run, vcn, SPARSE_LCN, len, false))
+ return false;
+ return run_collapse_range(run, vcn, len);
+ }
+
+ r += 1;
+ }
+
+ eat_start = r;
+ eat_end = r;
+
+ for (; r < e; r++) {
+ CLST d;
+
+ if (r->vcn >= end) {
+ r->vcn -= len;
+ continue;
+ }
+
+ if (r->vcn + r->len <= end) {
+ /* Eat this run. */
+ eat_end = r + 1;
+ continue;
+ }
+
+ d = end - r->vcn;
+ if (r->lcn != SPARSE_LCN)
+ r->lcn += d;
+ r->len -= d;
+ r->vcn -= len - d;
+ }
+
+ eat = eat_end - eat_start;
+ memmove(eat_start, eat_end, (e - eat_end) * sizeof(*r));
+ run->count -= eat;
+
+ return true;
+}
+
+/* run_insert_range
+ *
+ * Helper for attr_insert_range(),
+ * which is helper for fallocate(insert_range).
+ */
+bool run_insert_range(struct runs_tree *run, CLST vcn, CLST len)
+{
+ size_t index;
+ struct ntfs_run *r, *e;
+
+ if (WARN_ON(!run_lookup(run, vcn, &index)))
+ return false; /* Should never be here. */
+
+ e = run->runs + run->count;
+ r = run->runs + index;
+
+ if (vcn > r->vcn)
+ r += 1;
+
+ for (; r < e; r++)
+ r->vcn += len;
+
+ r = run->runs + index;
+
+ if (vcn > r->vcn) {
+ /* split fragment. */
+ CLST len1 = vcn - r->vcn;
+ CLST len2 = r->len - len1;
+ CLST lcn2 = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len1);
+
+ r->len = len1;
+
+ if (!run_add_entry(run, vcn + len, lcn2, len2, false))
+ return false;
+ }
+
+ if (!run_add_entry(run, vcn, SPARSE_LCN, len, false))
+ return false;
+
+ return true;
+}
+
+/*
+ * run_get_entry - Return index-th mapped region.
+ */
+bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn,
+ CLST *lcn, CLST *len)
+{
+ const struct ntfs_run *r;
+
+ if (index >= run->count)
+ return false;
+
+ r = run->runs + index;
+
+ if (!r->len)
+ return false;
+
+ if (vcn)
+ *vcn = r->vcn;
+ if (lcn)
+ *lcn = r->lcn;
+ if (len)
+ *len = r->len;
+ return true;
+}
+
+/*
+ * run_packed_size - Calculate the size of packed int64.
+ */
+#ifdef __BIG_ENDIAN
+static inline int run_packed_size(const s64 n)
+{
+ const u8 *p = (const u8 *)&n + sizeof(n) - 1;
+
+ if (n >= 0) {
+ if (p[-7] || p[-6] || p[-5] || p[-4])
+ p -= 4;
+ if (p[-3] || p[-2])
+ p -= 2;
+ if (p[-1])
+ p -= 1;
+ if (p[0] & 0x80)
+ p -= 1;
+ } else {
+ if (p[-7] != 0xff || p[-6] != 0xff || p[-5] != 0xff ||
+ p[-4] != 0xff)
+ p -= 4;
+ if (p[-3] != 0xff || p[-2] != 0xff)
+ p -= 2;
+ if (p[-1] != 0xff)
+ p -= 1;
+ if (!(p[0] & 0x80))
+ p -= 1;
+ }
+ return (const u8 *)&n + sizeof(n) - p;
+}
+
+/* Full trusted function. It does not check 'size' for errors. */
+static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v)
+{
+ const u8 *p = (u8 *)&v;
+
+ switch (size) {
+ case 8:
+ run_buf[7] = p[0];
+ fallthrough;
+ case 7:
+ run_buf[6] = p[1];
+ fallthrough;
+ case 6:
+ run_buf[5] = p[2];
+ fallthrough;
+ case 5:
+ run_buf[4] = p[3];
+ fallthrough;
+ case 4:
+ run_buf[3] = p[4];
+ fallthrough;
+ case 3:
+ run_buf[2] = p[5];
+ fallthrough;
+ case 2:
+ run_buf[1] = p[6];
+ fallthrough;
+ case 1:
+ run_buf[0] = p[7];
+ }
+}
+
+/* Full trusted function. It does not check 'size' for errors. */
+static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v)
+{
+ u8 *p = (u8 *)&v;
+
+ switch (size) {
+ case 8:
+ p[0] = run_buf[7];
+ fallthrough;
+ case 7:
+ p[1] = run_buf[6];
+ fallthrough;
+ case 6:
+ p[2] = run_buf[5];
+ fallthrough;
+ case 5:
+ p[3] = run_buf[4];
+ fallthrough;
+ case 4:
+ p[4] = run_buf[3];
+ fallthrough;
+ case 3:
+ p[5] = run_buf[2];
+ fallthrough;
+ case 2:
+ p[6] = run_buf[1];
+ fallthrough;
+ case 1:
+ p[7] = run_buf[0];
+ }
+ return v;
+}
+
+#else
+
+static inline int run_packed_size(const s64 n)
+{
+ const u8 *p = (const u8 *)&n;
+
+ if (n >= 0) {
+ if (p[7] || p[6] || p[5] || p[4])
+ p += 4;
+ if (p[3] || p[2])
+ p += 2;
+ if (p[1])
+ p += 1;
+ if (p[0] & 0x80)
+ p += 1;
+ } else {
+ if (p[7] != 0xff || p[6] != 0xff || p[5] != 0xff ||
+ p[4] != 0xff)
+ p += 4;
+ if (p[3] != 0xff || p[2] != 0xff)
+ p += 2;
+ if (p[1] != 0xff)
+ p += 1;
+ if (!(p[0] & 0x80))
+ p += 1;
+ }
+
+ return 1 + p - (const u8 *)&n;
+}
+
+/* Full trusted function. It does not check 'size' for errors. */
+static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v)
+{
+ const u8 *p = (u8 *)&v;
+
+ /* memcpy( run_buf, &v, size); Is it faster? */
+ switch (size) {
+ case 8:
+ run_buf[7] = p[7];
+ fallthrough;
+ case 7:
+ run_buf[6] = p[6];
+ fallthrough;
+ case 6:
+ run_buf[5] = p[5];
+ fallthrough;
+ case 5:
+ run_buf[4] = p[4];
+ fallthrough;
+ case 4:
+ run_buf[3] = p[3];
+ fallthrough;
+ case 3:
+ run_buf[2] = p[2];
+ fallthrough;
+ case 2:
+ run_buf[1] = p[1];
+ fallthrough;
+ case 1:
+ run_buf[0] = p[0];
+ }
+}
+
+/* full trusted function. It does not check 'size' for errors */
+static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v)
+{
+ u8 *p = (u8 *)&v;
+
+ /* memcpy( &v, run_buf, size); Is it faster? */
+ switch (size) {
+ case 8:
+ p[7] = run_buf[7];
+ fallthrough;
+ case 7:
+ p[6] = run_buf[6];
+ fallthrough;
+ case 6:
+ p[5] = run_buf[5];
+ fallthrough;
+ case 5:
+ p[4] = run_buf[4];
+ fallthrough;
+ case 4:
+ p[3] = run_buf[3];
+ fallthrough;
+ case 3:
+ p[2] = run_buf[2];
+ fallthrough;
+ case 2:
+ p[1] = run_buf[1];
+ fallthrough;
+ case 1:
+ p[0] = run_buf[0];
+ }
+ return v;
+}
+#endif
+
+/*
+ * run_pack - Pack runs into buffer.
+ *
+ * packed_vcns - How much runs we have packed.
+ * packed_size - How much bytes we have used run_buf.
+ */
+int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
+ u32 run_buf_size, CLST *packed_vcns)
+{
+ CLST next_vcn, vcn, lcn;
+ CLST prev_lcn = 0;
+ CLST evcn1 = svcn + len;
+ const struct ntfs_run *r, *r_end;
+ int packed_size = 0;
+ size_t i;
+ s64 dlcn;
+ int offset_size, size_size, tmp;
+
+ *packed_vcns = 0;
+
+ if (!len)
+ goto out;
+
+ /* Check all required entries [svcn, encv1) available. */
+ if (!run_lookup(run, svcn, &i))
+ return -ENOENT;
+
+ r_end = run->runs + run->count;
+ r = run->runs + i;
+
+ for (next_vcn = r->vcn + r->len; next_vcn < evcn1;
+ next_vcn = r->vcn + r->len) {
+ if (++r >= r_end || r->vcn != next_vcn)
+ return -ENOENT;
+ }
+
+ /* Repeat cycle above and pack runs. Assume no errors. */
+ r = run->runs + i;
+ len = svcn - r->vcn;
+ vcn = svcn;
+ lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + len);
+ len = r->len - len;
+
+ for (;;) {
+ next_vcn = vcn + len;
+ if (next_vcn > evcn1)
+ len = evcn1 - vcn;
+
+ /* How much bytes required to pack len. */
+ size_size = run_packed_size(len);
+
+ /* offset_size - How much bytes is packed dlcn. */
+ if (lcn == SPARSE_LCN) {
+ offset_size = 0;
+ dlcn = 0;
+ } else {
+ /* NOTE: lcn can be less than prev_lcn! */
+ dlcn = (s64)lcn - prev_lcn;
+ offset_size = run_packed_size(dlcn);
+ prev_lcn = lcn;
+ }
+
+ tmp = run_buf_size - packed_size - 2 - offset_size;
+ if (tmp <= 0)
+ goto out;
+
+ /* Can we store this entire run. */
+ if (tmp < size_size)
+ goto out;
+
+ if (run_buf) {
+ /* Pack run header. */
+ run_buf[0] = ((u8)(size_size | (offset_size << 4)));
+ run_buf += 1;
+
+ /* Pack the length of run. */
+ run_pack_s64(run_buf, size_size, len);
+
+ run_buf += size_size;
+ /* Pack the offset from previous LCN. */
+ run_pack_s64(run_buf, offset_size, dlcn);
+ run_buf += offset_size;
+ }
+
+ packed_size += 1 + offset_size + size_size;
+ *packed_vcns += len;
+
+ if (packed_size + 1 >= run_buf_size || next_vcn >= evcn1)
+ goto out;
+
+ r += 1;
+ vcn = r->vcn;
+ lcn = r->lcn;
+ len = r->len;
+ }
+
+out:
+ /* Store last zero. */
+ if (run_buf)
+ run_buf[0] = 0;
+
+ return packed_size + 1;
+}
+
+/*
+ * run_unpack - Unpack packed runs from @run_buf.
+ *
+ * Return: Error if negative, or real used bytes.
+ */
+int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
+ CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
+ int run_buf_size)
+{
+ u64 prev_lcn, vcn64, lcn, next_vcn;
+ const u8 *run_last, *run_0;
+ bool is_mft = ino == MFT_REC_MFT;
+
+ if (run_buf_size < 0)
+ return -EINVAL;
+
+ /* Check for empty. */
+ if (evcn + 1 == svcn)
+ return 0;
+
+ if (evcn < svcn)
+ return -EINVAL;
+
+ run_0 = run_buf;
+ run_last = run_buf + run_buf_size;
+ prev_lcn = 0;
+ vcn64 = svcn;
+
+ /* Read all runs the chain. */
+ /* size_size - How much bytes is packed len. */
+ while (run_buf < run_last) {
+ /* size_size - How much bytes is packed len. */
+ u8 size_size = *run_buf & 0xF;
+ /* offset_size - How much bytes is packed dlcn. */
+ u8 offset_size = *run_buf++ >> 4;
+ u64 len;
+
+ if (!size_size)
+ break;
+
+ /*
+ * Unpack runs.
+ * NOTE: Runs are stored little endian order
+ * "len" is unsigned value, "dlcn" is signed.
+ * Large positive number requires to store 5 bytes
+ * e.g.: 05 FF 7E FF FF 00 00 00
+ */
+ if (size_size > 8)
+ return -EINVAL;
+
+ len = run_unpack_s64(run_buf, size_size, 0);
+ /* Skip size_size. */
+ run_buf += size_size;
+
+ if (!len)
+ return -EINVAL;
+
+ if (!offset_size)
+ lcn = SPARSE_LCN64;
+ else if (offset_size <= 8) {
+ s64 dlcn;
+
+ /* Initial value of dlcn is -1 or 0. */
+ dlcn = (run_buf[offset_size - 1] & 0x80) ? (s64)-1 : 0;
+ dlcn = run_unpack_s64(run_buf, offset_size, dlcn);
+ /* Skip offset_size. */
+ run_buf += offset_size;
+
+ if (!dlcn)
+ return -EINVAL;
+ lcn = prev_lcn + dlcn;
+ prev_lcn = lcn;
+ } else
+ return -EINVAL;
+
+ next_vcn = vcn64 + len;
+ /* Check boundary. */
+ if (next_vcn > evcn + 1)
+ return -EINVAL;
+
+#ifndef CONFIG_NTFS3_64BIT_CLUSTER
+ if (next_vcn > 0x100000000ull || (lcn + len) > 0x100000000ull) {
+ ntfs_err(
+ sbi->sb,
+ "This driver is compiled without CONFIG_NTFS3_64BIT_CLUSTER (like windows driver).\n"
+ "Volume contains 64 bits run: vcn %llx, lcn %llx, len %llx.\n"
+ "Activate CONFIG_NTFS3_64BIT_CLUSTER to process this case",
+ vcn64, lcn, len);
+ return -EOPNOTSUPP;
+ }
+#endif
+ if (lcn != SPARSE_LCN64 && lcn + len > sbi->used.bitmap.nbits) {
+ /* LCN range is out of volume. */
+ return -EINVAL;
+ }
+
+ if (!run)
+ ; /* Called from check_attr(fslog.c) to check run. */
+ else if (run == RUN_DEALLOCATE) {
+ /*
+ * Called from ni_delete_all to free clusters
+ * without storing in run.
+ */
+ if (lcn != SPARSE_LCN64)
+ mark_as_free_ex(sbi, lcn, len, true);
+ } else if (vcn64 >= vcn) {
+ if (!run_add_entry(run, vcn64, lcn, len, is_mft))
+ return -ENOMEM;
+ } else if (next_vcn > vcn) {
+ u64 dlen = vcn - vcn64;
+
+ if (!run_add_entry(run, vcn, lcn + dlen, len - dlen,
+ is_mft))
+ return -ENOMEM;
+ }
+
+ vcn64 = next_vcn;
+ }
+
+ if (vcn64 != evcn + 1) {
+ /* Not expected length of unpacked runs. */
+ return -EINVAL;
+ }
+
+ return run_buf - run_0;
+}
+
+#ifdef NTFS3_CHECK_FREE_CLST
+/*
+ * run_unpack_ex - Unpack packed runs from "run_buf".
+ *
+ * Checks unpacked runs to be used in bitmap.
+ *
+ * Return: Error if negative, or real used bytes.
+ */
+int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
+ CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
+ int run_buf_size)
+{
+ int ret, err;
+ CLST next_vcn, lcn, len;
+ size_t index;
+ bool ok;
+ struct wnd_bitmap *wnd;
+
+ ret = run_unpack(run, sbi, ino, svcn, evcn, vcn, run_buf, run_buf_size);
+ if (ret <= 0)
+ return ret;
+
+ if (!sbi->used.bitmap.sb || !run || run == RUN_DEALLOCATE)
+ return ret;
+
+ if (ino == MFT_REC_BADCLUST)
+ return ret;
+
+ next_vcn = vcn = svcn;
+ wnd = &sbi->used.bitmap;
+
+ for (ok = run_lookup_entry(run, vcn, &lcn, &len, &index);
+ next_vcn <= evcn;
+ ok = run_get_entry(run, ++index, &vcn, &lcn, &len)) {
+ if (!ok || next_vcn != vcn)
+ return -EINVAL;
+
+ next_vcn = vcn + len;
+
+ if (lcn == SPARSE_LCN)
+ continue;
+
+ if (sbi->flags & NTFS_FLAGS_NEED_REPLAY)
+ continue;
+
+ down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
+ /* Check for free blocks. */
+ ok = wnd_is_used(wnd, lcn, len);
+ up_read(&wnd->rw_lock);
+ if (ok)
+ continue;
+
+ /* Looks like volume is corrupted. */
+ ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+
+ if (down_write_trylock(&wnd->rw_lock)) {
+ /* Mark all zero bits as used in range [lcn, lcn+len). */
+ CLST i, lcn_f = 0, len_f = 0;
+
+ err = 0;
+ for (i = 0; i < len; i++) {
+ if (wnd_is_free(wnd, lcn + i, 1)) {
+ if (!len_f)
+ lcn_f = lcn + i;
+ len_f += 1;
+ } else if (len_f) {
+ err = wnd_set_used(wnd, lcn_f, len_f);
+ len_f = 0;
+ if (err)
+ break;
+ }
+ }
+
+ if (len_f)
+ err = wnd_set_used(wnd, lcn_f, len_f);
+
+ up_write(&wnd->rw_lock);
+ if (err)
+ return err;
+ }
+ }
+
+ return ret;
+}
+#endif
+
+/*
+ * run_get_highest_vcn
+ *
+ * Return the highest vcn from a mapping pairs array
+ * it used while replaying log file.
+ */
+int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn)
+{
+ u64 vcn64 = vcn;
+ u8 size_size;
+
+ while ((size_size = *run_buf & 0xF)) {
+ u8 offset_size = *run_buf++ >> 4;
+ u64 len;
+
+ if (size_size > 8 || offset_size > 8)
+ return -EINVAL;
+
+ len = run_unpack_s64(run_buf, size_size, 0);
+ if (!len)
+ return -EINVAL;
+
+ run_buf += size_size + offset_size;
+ vcn64 += len;
+
+#ifndef CONFIG_NTFS3_64BIT_CLUSTER
+ if (vcn64 > 0x100000000ull)
+ return -EINVAL;
+#endif
+ }
+
+ *highest_vcn = vcn64 - 1;
+ return 0;
+}
+
+/*
+ * run_clone
+ *
+ * Make a copy of run
+ */
+int run_clone(const struct runs_tree *run, struct runs_tree *new_run)
+{
+ size_t bytes = run->count * sizeof(struct ntfs_run);
+
+ if (bytes > new_run->allocated) {
+ struct ntfs_run *new_ptr = kvmalloc(bytes, GFP_KERNEL);
+
+ if (!new_ptr)
+ return -ENOMEM;
+
+ kvfree(new_run->runs);
+ new_run->runs = new_ptr;
+ new_run->allocated = bytes;
+ }
+
+ memcpy(new_run->runs, run->runs, bytes);
+ new_run->count = run->count;
+ return 0;
+}
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
new file mode 100644
index 000000000..6066eea3f
--- /dev/null
+++ b/fs/ntfs3/super.c
@@ -0,0 +1,1517 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ *
+ * terminology
+ *
+ * cluster - allocation unit - 512,1K,2K,4K,...,2M
+ * vcn - virtual cluster number - Offset inside the file in clusters.
+ * vbo - virtual byte offset - Offset inside the file in bytes.
+ * lcn - logical cluster number - 0 based cluster in clusters heap.
+ * lbo - logical byte offset - Absolute position inside volume.
+ * run - maps VCN to LCN - Stored in attributes in packed form.
+ * attr - attribute segment - std/name/data etc records inside MFT.
+ * mi - MFT inode - One MFT record(usually 1024 bytes or 4K), consists of attributes.
+ * ni - NTFS inode - Extends linux inode. consists of one or more mft inodes.
+ * index - unit inside directory - 2K, 4K, <=page size, does not depend on cluster size.
+ *
+ * WSL - Windows Subsystem for Linux
+ * https://docs.microsoft.com/en-us/windows/wsl/file-permissions
+ * It stores uid/gid/mode/dev in xattr
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/log2.h>
+#include <linux/minmax.h>
+#include <linux/module.h>
+#include <linux/nls.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+#include "lib/lib.h"
+#endif
+
+#ifdef CONFIG_PRINTK
+/*
+ * ntfs_printk - Trace warnings/notices/errors.
+ *
+ * Thanks Joe Perches <joe@perches.com> for implementation
+ */
+void ntfs_printk(const struct super_block *sb, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+ int level;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+
+ /* Should we use different ratelimits for warnings/notices/errors? */
+ if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3"))
+ return;
+
+ va_start(args, fmt);
+
+ level = printk_get_level(fmt);
+ vaf.fmt = printk_skip_level(fmt);
+ vaf.va = &args;
+ printk("%c%cntfs3: %s: %pV\n", KERN_SOH_ASCII, level, sb->s_id, &vaf);
+
+ va_end(args);
+}
+
+static char s_name_buf[512];
+static atomic_t s_name_buf_cnt = ATOMIC_INIT(1); // 1 means 'free s_name_buf'.
+
+/*
+ * ntfs_inode_printk
+ *
+ * Print warnings/notices/errors about inode using name or inode number.
+ */
+void ntfs_inode_printk(struct inode *inode, const char *fmt, ...)
+{
+ struct super_block *sb = inode->i_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ char *name;
+ va_list args;
+ struct va_format vaf;
+ int level;
+
+ if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3"))
+ return;
+
+ /* Use static allocated buffer, if possible. */
+ name = atomic_dec_and_test(&s_name_buf_cnt)
+ ? s_name_buf
+ : kmalloc(sizeof(s_name_buf), GFP_NOFS);
+
+ if (name) {
+ struct dentry *de = d_find_alias(inode);
+ const u32 name_len = ARRAY_SIZE(s_name_buf) - 1;
+
+ if (de) {
+ spin_lock(&de->d_lock);
+ snprintf(name, name_len, " \"%s\"", de->d_name.name);
+ spin_unlock(&de->d_lock);
+ name[name_len] = 0; /* To be sure. */
+ } else {
+ name[0] = 0;
+ }
+ dput(de); /* Cocci warns if placed in branch "if (de)" */
+ }
+
+ va_start(args, fmt);
+
+ level = printk_get_level(fmt);
+ vaf.fmt = printk_skip_level(fmt);
+ vaf.va = &args;
+
+ printk("%c%cntfs3: %s: ino=%lx,%s %pV\n", KERN_SOH_ASCII, level,
+ sb->s_id, inode->i_ino, name ? name : "", &vaf);
+
+ va_end(args);
+
+ atomic_inc(&s_name_buf_cnt);
+ if (name != s_name_buf)
+ kfree(name);
+}
+#endif
+
+/*
+ * Shared memory struct.
+ *
+ * On-disk ntfs's upcase table is created by ntfs formatter.
+ * 'upcase' table is 128K bytes of memory.
+ * We should read it into memory when mounting.
+ * Several ntfs volumes likely use the same 'upcase' table.
+ * It is good idea to share in-memory 'upcase' table between different volumes.
+ * Unfortunately winxp/vista/win7 use different upcase tables.
+ */
+static DEFINE_SPINLOCK(s_shared_lock);
+
+static struct {
+ void *ptr;
+ u32 len;
+ int cnt;
+} s_shared[8];
+
+/*
+ * ntfs_set_shared
+ *
+ * Return:
+ * * @ptr - If pointer was saved in shared memory.
+ * * NULL - If pointer was not shared.
+ */
+void *ntfs_set_shared(void *ptr, u32 bytes)
+{
+ void *ret = NULL;
+ int i, j = -1;
+
+ spin_lock(&s_shared_lock);
+ for (i = 0; i < ARRAY_SIZE(s_shared); i++) {
+ if (!s_shared[i].cnt) {
+ j = i;
+ } else if (bytes == s_shared[i].len &&
+ !memcmp(s_shared[i].ptr, ptr, bytes)) {
+ s_shared[i].cnt += 1;
+ ret = s_shared[i].ptr;
+ break;
+ }
+ }
+
+ if (!ret && j != -1) {
+ s_shared[j].ptr = ptr;
+ s_shared[j].len = bytes;
+ s_shared[j].cnt = 1;
+ ret = ptr;
+ }
+ spin_unlock(&s_shared_lock);
+
+ return ret;
+}
+
+/*
+ * ntfs_put_shared
+ *
+ * Return:
+ * * @ptr - If pointer is not shared anymore.
+ * * NULL - If pointer is still shared.
+ */
+void *ntfs_put_shared(void *ptr)
+{
+ void *ret = ptr;
+ int i;
+
+ spin_lock(&s_shared_lock);
+ for (i = 0; i < ARRAY_SIZE(s_shared); i++) {
+ if (s_shared[i].cnt && s_shared[i].ptr == ptr) {
+ if (--s_shared[i].cnt)
+ ret = NULL;
+ break;
+ }
+ }
+ spin_unlock(&s_shared_lock);
+
+ return ret;
+}
+
+static inline void put_mount_options(struct ntfs_mount_options *options)
+{
+ kfree(options->nls_name);
+ unload_nls(options->nls);
+ kfree(options);
+}
+
+enum Opt {
+ Opt_uid,
+ Opt_gid,
+ Opt_umask,
+ Opt_dmask,
+ Opt_fmask,
+ Opt_immutable,
+ Opt_discard,
+ Opt_force,
+ Opt_sparse,
+ Opt_nohidden,
+ Opt_showmeta,
+ Opt_acl,
+ Opt_iocharset,
+ Opt_prealloc,
+ Opt_noacsrules,
+ Opt_err,
+};
+
+static const struct fs_parameter_spec ntfs_fs_parameters[] = {
+ fsparam_u32("uid", Opt_uid),
+ fsparam_u32("gid", Opt_gid),
+ fsparam_u32oct("umask", Opt_umask),
+ fsparam_u32oct("dmask", Opt_dmask),
+ fsparam_u32oct("fmask", Opt_fmask),
+ fsparam_flag_no("sys_immutable", Opt_immutable),
+ fsparam_flag_no("discard", Opt_discard),
+ fsparam_flag_no("force", Opt_force),
+ fsparam_flag_no("sparse", Opt_sparse),
+ fsparam_flag_no("hidden", Opt_nohidden),
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_flag_no("showmeta", Opt_showmeta),
+ fsparam_flag_no("prealloc", Opt_prealloc),
+ fsparam_flag_no("acsrules", Opt_noacsrules),
+ fsparam_string("iocharset", Opt_iocharset),
+ {}
+};
+
+/*
+ * Load nls table or if @nls is utf8 then return NULL.
+ */
+static struct nls_table *ntfs_load_nls(char *nls)
+{
+ struct nls_table *ret;
+
+ if (!nls)
+ nls = CONFIG_NLS_DEFAULT;
+
+ if (strcmp(nls, "utf8") == 0)
+ return NULL;
+
+ if (strcmp(nls, CONFIG_NLS_DEFAULT) == 0)
+ return load_nls_default();
+
+ ret = load_nls(nls);
+ if (ret)
+ return ret;
+
+ return ERR_PTR(-EINVAL);
+}
+
+static int ntfs_fs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct ntfs_mount_options *opts = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, ntfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ opts->fs_uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(opts->fs_uid))
+ return invalf(fc, "ntfs3: Invalid value for uid.");
+ break;
+ case Opt_gid:
+ opts->fs_gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(opts->fs_gid))
+ return invalf(fc, "ntfs3: Invalid value for gid.");
+ break;
+ case Opt_umask:
+ if (result.uint_32 & ~07777)
+ return invalf(fc, "ntfs3: Invalid value for umask.");
+ opts->fs_fmask_inv = ~result.uint_32;
+ opts->fs_dmask_inv = ~result.uint_32;
+ opts->fmask = 1;
+ opts->dmask = 1;
+ break;
+ case Opt_dmask:
+ if (result.uint_32 & ~07777)
+ return invalf(fc, "ntfs3: Invalid value for dmask.");
+ opts->fs_dmask_inv = ~result.uint_32;
+ opts->dmask = 1;
+ break;
+ case Opt_fmask:
+ if (result.uint_32 & ~07777)
+ return invalf(fc, "ntfs3: Invalid value for fmask.");
+ opts->fs_fmask_inv = ~result.uint_32;
+ opts->fmask = 1;
+ break;
+ case Opt_immutable:
+ opts->sys_immutable = result.negated ? 0 : 1;
+ break;
+ case Opt_discard:
+ opts->discard = result.negated ? 0 : 1;
+ break;
+ case Opt_force:
+ opts->force = result.negated ? 0 : 1;
+ break;
+ case Opt_sparse:
+ opts->sparse = result.negated ? 0 : 1;
+ break;
+ case Opt_nohidden:
+ opts->nohidden = result.negated ? 1 : 0;
+ break;
+ case Opt_acl:
+ if (!result.negated)
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+ fc->sb_flags |= SB_POSIXACL;
+#else
+ return invalf(fc, "ntfs3: Support for ACL not compiled in!");
+#endif
+ else
+ fc->sb_flags &= ~SB_POSIXACL;
+ break;
+ case Opt_showmeta:
+ opts->showmeta = result.negated ? 0 : 1;
+ break;
+ case Opt_iocharset:
+ kfree(opts->nls_name);
+ opts->nls_name = param->string;
+ param->string = NULL;
+ break;
+ case Opt_prealloc:
+ opts->prealloc = result.negated ? 0 : 1;
+ break;
+ case Opt_noacsrules:
+ opts->noacsrules = result.negated ? 1 : 0;
+ break;
+ default:
+ /* Should not be here unless we forget add case. */
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int ntfs_fs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct ntfs_mount_options *new_opts = fc->fs_private;
+ int ro_rw;
+
+ ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY);
+ if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) {
+ errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
+ return -EINVAL;
+ }
+
+ new_opts->nls = ntfs_load_nls(new_opts->nls_name);
+ if (IS_ERR(new_opts->nls)) {
+ new_opts->nls = NULL;
+ errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name);
+ return -EINVAL;
+ }
+ if (new_opts->nls != sbi->options->nls)
+ return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!");
+
+ sync_filesystem(sb);
+
+ if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) &&
+ !new_opts->force) {
+ errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!");
+ return -EINVAL;
+ }
+
+ swap(sbi->options, fc->fs_private);
+
+ return 0;
+}
+
+static struct kmem_cache *ntfs_inode_cachep;
+
+static struct inode *ntfs_alloc_inode(struct super_block *sb)
+{
+ struct ntfs_inode *ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS);
+
+ if (!ni)
+ return NULL;
+
+ memset(ni, 0, offsetof(struct ntfs_inode, vfs_inode));
+
+ mutex_init(&ni->ni_lock);
+
+ return &ni->vfs_inode;
+}
+
+static void ntfs_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ mutex_destroy(&ni->ni_lock);
+
+ kmem_cache_free(ntfs_inode_cachep, ni);
+}
+
+static void ntfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, ntfs_i_callback);
+}
+
+static void init_once(void *foo)
+{
+ struct ntfs_inode *ni = foo;
+
+ inode_init_once(&ni->vfs_inode);
+}
+
+/*
+ * put_ntfs - Noinline to reduce binary size.
+ */
+static noinline void put_ntfs(struct ntfs_sb_info *sbi)
+{
+ kfree(sbi->new_rec);
+ kvfree(ntfs_put_shared(sbi->upcase));
+ kfree(sbi->def_table);
+
+ wnd_close(&sbi->mft.bitmap);
+ wnd_close(&sbi->used.bitmap);
+
+ if (sbi->mft.ni)
+ iput(&sbi->mft.ni->vfs_inode);
+
+ if (sbi->security.ni)
+ iput(&sbi->security.ni->vfs_inode);
+
+ if (sbi->reparse.ni)
+ iput(&sbi->reparse.ni->vfs_inode);
+
+ if (sbi->objid.ni)
+ iput(&sbi->objid.ni->vfs_inode);
+
+ if (sbi->volume.ni)
+ iput(&sbi->volume.ni->vfs_inode);
+
+ ntfs_update_mftmirr(sbi, 0);
+
+ indx_clear(&sbi->security.index_sii);
+ indx_clear(&sbi->security.index_sdh);
+ indx_clear(&sbi->reparse.index_r);
+ indx_clear(&sbi->objid.index_o);
+ kfree(sbi->compress.lznt);
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+ xpress_free_decompressor(sbi->compress.xpress);
+ lzx_free_decompressor(sbi->compress.lzx);
+#endif
+ kfree(sbi);
+}
+
+static void ntfs_put_super(struct super_block *sb)
+{
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+
+ /* Mark rw ntfs as clear, if possible. */
+ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);
+
+ put_mount_options(sbi->options);
+ put_ntfs(sbi);
+ sb->s_fs_info = NULL;
+
+ sync_blockdev(sb->s_bdev);
+}
+
+static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct wnd_bitmap *wnd = &sbi->used.bitmap;
+
+ buf->f_type = sb->s_magic;
+ buf->f_bsize = sbi->cluster_size;
+ buf->f_blocks = wnd->nbits;
+
+ buf->f_bfree = buf->f_bavail = wnd_zeroes(wnd);
+ buf->f_fsid.val[0] = sbi->volume.ser_num;
+ buf->f_fsid.val[1] = (sbi->volume.ser_num >> 32);
+ buf->f_namelen = NTFS_NAME_LEN;
+
+ return 0;
+}
+
+static int ntfs_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct ntfs_mount_options *opts = sbi->options;
+ struct user_namespace *user_ns = seq_user_ns(m);
+
+ seq_printf(m, ",uid=%u",
+ from_kuid_munged(user_ns, opts->fs_uid));
+ seq_printf(m, ",gid=%u",
+ from_kgid_munged(user_ns, opts->fs_gid));
+ if (opts->fmask)
+ seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv);
+ if (opts->dmask)
+ seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv);
+ if (opts->nls)
+ seq_printf(m, ",iocharset=%s", opts->nls->charset);
+ else
+ seq_puts(m, ",iocharset=utf8");
+ if (opts->sys_immutable)
+ seq_puts(m, ",sys_immutable");
+ if (opts->discard)
+ seq_puts(m, ",discard");
+ if (opts->sparse)
+ seq_puts(m, ",sparse");
+ if (opts->showmeta)
+ seq_puts(m, ",showmeta");
+ if (opts->nohidden)
+ seq_puts(m, ",nohidden");
+ if (opts->force)
+ seq_puts(m, ",force");
+ if (opts->noacsrules)
+ seq_puts(m, ",noacsrules");
+ if (opts->prealloc)
+ seq_puts(m, ",prealloc");
+ if (sb->s_flags & SB_POSIXACL)
+ seq_puts(m, ",acl");
+
+ return 0;
+}
+
+/*
+ * ntfs_sync_fs - super_operations::sync_fs
+ */
+static int ntfs_sync_fs(struct super_block *sb, int wait)
+{
+ int err = 0, err2;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct ntfs_inode *ni;
+ struct inode *inode;
+
+ ni = sbi->security.ni;
+ if (ni) {
+ inode = &ni->vfs_inode;
+ err2 = _ni_write_inode(inode, wait);
+ if (err2 && !err)
+ err = err2;
+ }
+
+ ni = sbi->objid.ni;
+ if (ni) {
+ inode = &ni->vfs_inode;
+ err2 = _ni_write_inode(inode, wait);
+ if (err2 && !err)
+ err = err2;
+ }
+
+ ni = sbi->reparse.ni;
+ if (ni) {
+ inode = &ni->vfs_inode;
+ err2 = _ni_write_inode(inode, wait);
+ if (err2 && !err)
+ err = err2;
+ }
+
+ if (!err)
+ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);
+
+ ntfs_update_mftmirr(sbi, wait);
+
+ return err;
+}
+
+static const struct super_operations ntfs_sops = {
+ .alloc_inode = ntfs_alloc_inode,
+ .destroy_inode = ntfs_destroy_inode,
+ .evict_inode = ntfs_evict_inode,
+ .put_super = ntfs_put_super,
+ .statfs = ntfs_statfs,
+ .show_options = ntfs_show_options,
+ .sync_fs = ntfs_sync_fs,
+ .write_inode = ntfs3_write_inode,
+};
+
+static struct inode *ntfs_export_get_inode(struct super_block *sb, u64 ino,
+ u32 generation)
+{
+ struct MFT_REF ref;
+ struct inode *inode;
+
+ ref.low = cpu_to_le32(ino);
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+ ref.high = cpu_to_le16(ino >> 32);
+#else
+ ref.high = 0;
+#endif
+ ref.seq = cpu_to_le16(generation);
+
+ inode = ntfs_iget5(sb, &ref, NULL);
+ if (!IS_ERR(inode) && is_bad_inode(inode)) {
+ iput(inode);
+ inode = ERR_PTR(-ESTALE);
+ }
+
+ return inode;
+}
+
+static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ ntfs_export_get_inode);
+}
+
+static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
+ int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ ntfs_export_get_inode);
+}
+
+/* TODO: == ntfs_sync_inode */
+static int ntfs_nfs_commit_metadata(struct inode *inode)
+{
+ return _ni_write_inode(inode, 1);
+}
+
+static const struct export_operations ntfs_export_ops = {
+ .fh_to_dentry = ntfs_fh_to_dentry,
+ .fh_to_parent = ntfs_fh_to_parent,
+ .get_parent = ntfs3_get_parent,
+ .commit_metadata = ntfs_nfs_commit_metadata,
+};
+
+/*
+ * format_size_gb - Return Gb,Mb to print with "%u.%02u Gb".
+ */
+static u32 format_size_gb(const u64 bytes, u32 *mb)
+{
+ /* Do simple right 30 bit shift of 64 bit value. */
+ u64 kbytes = bytes >> 10;
+ u32 kbytes32 = kbytes;
+
+ *mb = (100 * (kbytes32 & 0xfffff) + 0x7ffff) >> 20;
+ if (*mb >= 100)
+ *mb = 99;
+
+ return (kbytes32 >> 20) | (((u32)(kbytes >> 32)) << 12);
+}
+
+static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot)
+{
+ if (boot->sectors_per_clusters <= 0x80)
+ return boot->sectors_per_clusters;
+ if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */
+ return 1U << -(s8)boot->sectors_per_clusters;
+ return -EINVAL;
+}
+
+/*
+ * ntfs_init_from_boot - Init internal info from on-disk boot sector.
+ */
+static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
+ u64 dev_size)
+{
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ int err;
+ u32 mb, gb, boot_sector_size, sct_per_clst, record_size;
+ u64 sectors, clusters, mlcn, mlcn2;
+ struct NTFS_BOOT *boot;
+ struct buffer_head *bh;
+ struct MFT_REC *rec;
+ u16 fn, ao;
+
+ sbi->volume.blocks = dev_size >> PAGE_SHIFT;
+
+ bh = ntfs_bread(sb, 0);
+ if (!bh)
+ return -EIO;
+
+ err = -EINVAL;
+ boot = (struct NTFS_BOOT *)bh->b_data;
+
+ if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1))
+ goto out;
+
+ /* 0x55AA is not mandaroty. Thanks Maxim Suhanov*/
+ /*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1])
+ * goto out;
+ */
+
+ boot_sector_size = (u32)boot->bytes_per_sector[1] << 8;
+ if (boot->bytes_per_sector[0] || boot_sector_size < SECTOR_SIZE ||
+ !is_power_of_2(boot_sector_size)) {
+ goto out;
+ }
+
+ /* cluster size: 512, 1K, 2K, 4K, ... 2M */
+ sct_per_clst = true_sectors_per_clst(boot);
+ if ((int)sct_per_clst < 0)
+ goto out;
+ if (!is_power_of_2(sct_per_clst))
+ goto out;
+
+ mlcn = le64_to_cpu(boot->mft_clst);
+ mlcn2 = le64_to_cpu(boot->mft2_clst);
+ sectors = le64_to_cpu(boot->sectors_per_volume);
+
+ if (mlcn * sct_per_clst >= sectors)
+ goto out;
+
+ if (mlcn2 * sct_per_clst >= sectors)
+ goto out;
+
+ /* Check MFT record size. */
+ if ((boot->record_size < 0 &&
+ SECTOR_SIZE > (2U << (-boot->record_size))) ||
+ (boot->record_size >= 0 && !is_power_of_2(boot->record_size))) {
+ goto out;
+ }
+
+ /* Check index record size. */
+ if ((boot->index_size < 0 &&
+ SECTOR_SIZE > (2U << (-boot->index_size))) ||
+ (boot->index_size >= 0 && !is_power_of_2(boot->index_size))) {
+ goto out;
+ }
+
+ sbi->volume.size = sectors * boot_sector_size;
+
+ gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb);
+
+ /*
+ * - Volume formatted and mounted with the same sector size.
+ * - Volume formatted 4K and mounted as 512.
+ * - Volume formatted 512 and mounted as 4K.
+ */
+ if (boot_sector_size != sector_size) {
+ ntfs_warn(
+ sb,
+ "Different NTFS' sector size (%u) and media sector size (%u)",
+ boot_sector_size, sector_size);
+ dev_size += sector_size - 1;
+ }
+
+ sbi->cluster_size = boot_sector_size * sct_per_clst;
+ sbi->cluster_bits = blksize_bits(sbi->cluster_size);
+
+ sbi->mft.lbo = mlcn << sbi->cluster_bits;
+ sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits;
+
+ /* Compare boot's cluster and sector. */
+ if (sbi->cluster_size < boot_sector_size)
+ goto out;
+
+ /* Compare boot's cluster and media sector. */
+ if (sbi->cluster_size < sector_size) {
+ /* No way to use ntfs_get_block in this case. */
+ ntfs_err(
+ sb,
+ "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)",
+ sbi->cluster_size, sector_size);
+ goto out;
+ }
+
+ sbi->cluster_mask = sbi->cluster_size - 1;
+ sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask;
+ sbi->record_size = record_size = boot->record_size < 0
+ ? 1 << (-boot->record_size)
+ : (u32)boot->record_size
+ << sbi->cluster_bits;
+
+ if (record_size > MAXIMUM_BYTES_PER_MFT || record_size < SECTOR_SIZE)
+ goto out;
+
+ sbi->record_bits = blksize_bits(record_size);
+ sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes
+
+ sbi->max_bytes_per_attr =
+ record_size - ALIGN(MFTRECORD_FIXUP_OFFSET_1, 8) -
+ ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) -
+ ALIGN(sizeof(enum ATTR_TYPE), 8);
+
+ sbi->index_size = boot->index_size < 0
+ ? 1u << (-boot->index_size)
+ : (u32)boot->index_size << sbi->cluster_bits;
+
+ sbi->volume.ser_num = le64_to_cpu(boot->serial_num);
+
+ /* Warning if RAW volume. */
+ if (dev_size < sbi->volume.size + boot_sector_size) {
+ u32 mb0, gb0;
+
+ gb0 = format_size_gb(dev_size, &mb0);
+ ntfs_warn(
+ sb,
+ "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only",
+ gb, mb, gb0, mb0);
+ sb->s_flags |= SB_RDONLY;
+ }
+
+ clusters = sbi->volume.size >> sbi->cluster_bits;
+#ifndef CONFIG_NTFS3_64BIT_CLUSTER
+ /* 32 bits per cluster. */
+ if (clusters >> 32) {
+ ntfs_notice(
+ sb,
+ "NTFS %u.%02u Gb is too big to use 32 bits per cluster",
+ gb, mb);
+ goto out;
+ }
+#elif BITS_PER_LONG < 64
+#error "CONFIG_NTFS3_64BIT_CLUSTER incompatible in 32 bit OS"
+#endif
+
+ sbi->used.bitmap.nbits = clusters;
+
+ rec = kzalloc(record_size, GFP_NOFS);
+ if (!rec) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ sbi->new_rec = rec;
+ rec->rhdr.sign = NTFS_FILE_SIGNATURE;
+ rec->rhdr.fix_off = cpu_to_le16(MFTRECORD_FIXUP_OFFSET_1);
+ fn = (sbi->record_size >> SECTOR_SHIFT) + 1;
+ rec->rhdr.fix_num = cpu_to_le16(fn);
+ ao = ALIGN(MFTRECORD_FIXUP_OFFSET_1 + sizeof(short) * fn, 8);
+ rec->attr_off = cpu_to_le16(ao);
+ rec->used = cpu_to_le32(ao + ALIGN(sizeof(enum ATTR_TYPE), 8));
+ rec->total = cpu_to_le32(sbi->record_size);
+ ((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END;
+
+ sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE));
+
+ sbi->block_mask = sb->s_blocksize - 1;
+ sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits;
+ sbi->volume.blocks = sbi->volume.size >> sb->s_blocksize_bits;
+
+ /* Maximum size for normal files. */
+ sbi->maxbytes = (clusters << sbi->cluster_bits) - 1;
+
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+ if (clusters >= (1ull << (64 - sbi->cluster_bits)))
+ sbi->maxbytes = -1;
+ sbi->maxbytes_sparse = -1;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+#else
+ /* Maximum size for sparse file. */
+ sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1;
+ sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
+#endif
+
+ /*
+ * Compute the MFT zone at two steps.
+ * It would be nice if we are able to allocate 1/8 of
+ * total clusters for MFT but not more then 512 MB.
+ */
+ sbi->zone_max = min_t(CLST, 0x20000000 >> sbi->cluster_bits, clusters >> 3);
+
+ err = 0;
+
+out:
+ brelse(bh);
+
+ return err;
+}
+
+/*
+ * ntfs_fill_super - Try to mount.
+ */
+static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc)
+{
+ int err;
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct block_device *bdev = sb->s_bdev;
+ struct inode *inode;
+ struct ntfs_inode *ni;
+ size_t i, tt;
+ CLST vcn, lcn, len;
+ struct ATTRIB *attr;
+ const struct VOLUME_INFO *info;
+ u32 idx, done, bytes;
+ struct ATTR_DEF_ENTRY *t;
+ u16 *shared;
+ struct MFT_REF ref;
+
+ ref.high = 0;
+
+ sbi->sb = sb;
+ sbi->options = fc->fs_private;
+ fc->fs_private = NULL;
+ sb->s_flags |= SB_NODIRATIME;
+ sb->s_magic = 0x7366746e; // "ntfs"
+ sb->s_op = &ntfs_sops;
+ sb->s_export_op = &ntfs_export_ops;
+ sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
+ sb->s_xattr = ntfs_xattr_handlers;
+
+ sbi->options->nls = ntfs_load_nls(sbi->options->nls_name);
+ if (IS_ERR(sbi->options->nls)) {
+ sbi->options->nls = NULL;
+ errorf(fc, "Cannot load nls %s", sbi->options->nls_name);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) {
+ sbi->discard_granularity = bdev_discard_granularity(bdev);
+ sbi->discard_granularity_mask_inv =
+ ~(u64)(sbi->discard_granularity - 1);
+ }
+
+ /* Parse boot. */
+ err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev),
+ bdev_nr_bytes(bdev));
+ if (err)
+ goto out;
+
+ /*
+ * Load $Volume. This should be done before $LogFile
+ * 'cause 'sbi->volume.ni' is used 'ntfs_set_state'.
+ */
+ ref.low = cpu_to_le32(MFT_REC_VOL);
+ ref.seq = cpu_to_le16(MFT_REC_VOL);
+ inode = ntfs_iget5(sb, &ref, &NAME_VOLUME);
+ if (IS_ERR(inode)) {
+ ntfs_err(sb, "Failed to load $Volume.");
+ err = PTR_ERR(inode);
+ goto out;
+ }
+
+ ni = ntfs_i(inode);
+
+ /* Load and save label (not necessary). */
+ attr = ni_find_attr(ni, NULL, NULL, ATTR_LABEL, NULL, 0, NULL, NULL);
+
+ if (!attr) {
+ /* It is ok if no ATTR_LABEL */
+ } else if (!attr->non_res && !is_attr_ext(attr)) {
+ /* $AttrDef allows labels to be up to 128 symbols. */
+ err = utf16s_to_utf8s(resident_data(attr),
+ le32_to_cpu(attr->res.data_size) >> 1,
+ UTF16_LITTLE_ENDIAN, sbi->volume.label,
+ sizeof(sbi->volume.label));
+ if (err < 0)
+ sbi->volume.label[0] = 0;
+ } else {
+ /* Should we break mounting here? */
+ //err = -EINVAL;
+ //goto put_inode_out;
+ }
+
+ attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL);
+ if (!attr || is_attr_ext(attr)) {
+ err = -EINVAL;
+ goto put_inode_out;
+ }
+
+ info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO);
+ if (!info) {
+ err = -EINVAL;
+ goto put_inode_out;
+ }
+
+ sbi->volume.major_ver = info->major_ver;
+ sbi->volume.minor_ver = info->minor_ver;
+ sbi->volume.flags = info->flags;
+ sbi->volume.ni = ni;
+
+ /* Load $MFTMirr to estimate recs_mirr. */
+ ref.low = cpu_to_le32(MFT_REC_MIRR);
+ ref.seq = cpu_to_le16(MFT_REC_MIRR);
+ inode = ntfs_iget5(sb, &ref, &NAME_MIRROR);
+ if (IS_ERR(inode)) {
+ ntfs_err(sb, "Failed to load $MFTMirr.");
+ err = PTR_ERR(inode);
+ goto out;
+ }
+
+ sbi->mft.recs_mirr =
+ ntfs_up_cluster(sbi, inode->i_size) >> sbi->record_bits;
+
+ iput(inode);
+
+ /* Load LogFile to replay. */
+ ref.low = cpu_to_le32(MFT_REC_LOG);
+ ref.seq = cpu_to_le16(MFT_REC_LOG);
+ inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE);
+ if (IS_ERR(inode)) {
+ ntfs_err(sb, "Failed to load \x24LogFile.");
+ err = PTR_ERR(inode);
+ goto out;
+ }
+
+ ni = ntfs_i(inode);
+
+ err = ntfs_loadlog_and_replay(ni, sbi);
+ if (err)
+ goto put_inode_out;
+
+ iput(inode);
+
+ if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) {
+ if (!sb_rdonly(sb)) {
+ ntfs_warn(sb,
+ "failed to replay log file. Can't mount rw!");
+ err = -EINVAL;
+ goto out;
+ }
+ } else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) {
+ if (!sb_rdonly(sb) && !sbi->options->force) {
+ ntfs_warn(
+ sb,
+ "volume is dirty and \"force\" flag is not set!");
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Load $MFT. */
+ ref.low = cpu_to_le32(MFT_REC_MFT);
+ ref.seq = cpu_to_le16(1);
+
+ inode = ntfs_iget5(sb, &ref, &NAME_MFT);
+ if (IS_ERR(inode)) {
+ ntfs_err(sb, "Failed to load $MFT.");
+ err = PTR_ERR(inode);
+ goto out;
+ }
+
+ ni = ntfs_i(inode);
+
+ sbi->mft.used = ni->i_valid >> sbi->record_bits;
+ tt = inode->i_size >> sbi->record_bits;
+ sbi->mft.next_free = MFT_REC_USER;
+
+ err = wnd_init(&sbi->mft.bitmap, sb, tt);
+ if (err)
+ goto put_inode_out;
+
+ err = ni_load_all_mi(ni);
+ if (err)
+ goto put_inode_out;
+
+ sbi->mft.ni = ni;
+
+ /* Load $BadClus. */
+ ref.low = cpu_to_le32(MFT_REC_BADCLUST);
+ ref.seq = cpu_to_le16(MFT_REC_BADCLUST);
+ inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS);
+ if (IS_ERR(inode)) {
+ ntfs_err(sb, "Failed to load $BadClus.");
+ err = PTR_ERR(inode);
+ goto out;
+ }
+
+ ni = ntfs_i(inode);
+
+ for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) {
+ if (lcn == SPARSE_LCN)
+ continue;
+
+ if (!sbi->bad_clusters)
+ ntfs_notice(sb, "Volume contains bad blocks");
+
+ sbi->bad_clusters += len;
+ }
+
+ iput(inode);
+
+ /* Load $Bitmap. */
+ ref.low = cpu_to_le32(MFT_REC_BITMAP);
+ ref.seq = cpu_to_le16(MFT_REC_BITMAP);
+ inode = ntfs_iget5(sb, &ref, &NAME_BITMAP);
+ if (IS_ERR(inode)) {
+ ntfs_err(sb, "Failed to load $Bitmap.");
+ err = PTR_ERR(inode);
+ goto out;
+ }
+
+#ifndef CONFIG_NTFS3_64BIT_CLUSTER
+ if (inode->i_size >> 32) {
+ err = -EINVAL;
+ goto put_inode_out;
+ }
+#endif
+
+ /* Check bitmap boundary. */
+ tt = sbi->used.bitmap.nbits;
+ if (inode->i_size < bitmap_size(tt)) {
+ err = -EINVAL;
+ goto put_inode_out;
+ }
+
+ /* Not necessary. */
+ sbi->used.bitmap.set_tail = true;
+ err = wnd_init(&sbi->used.bitmap, sb, tt);
+ if (err)
+ goto put_inode_out;
+
+ iput(inode);
+
+ /* Compute the MFT zone. */
+ err = ntfs_refresh_zone(sbi);
+ if (err)
+ goto out;
+
+ /* Load $AttrDef. */
+ ref.low = cpu_to_le32(MFT_REC_ATTR);
+ ref.seq = cpu_to_le16(MFT_REC_ATTR);
+ inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF);
+ if (IS_ERR(inode)) {
+ ntfs_err(sb, "Failed to load $AttrDef -> %d", err);
+ err = PTR_ERR(inode);
+ goto out;
+ }
+
+ if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) {
+ err = -EINVAL;
+ goto put_inode_out;
+ }
+ bytes = inode->i_size;
+ sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL);
+ if (!t) {
+ err = -ENOMEM;
+ goto put_inode_out;
+ }
+
+ for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) {
+ unsigned long tail = bytes - done;
+ struct page *page = ntfs_map_page(inode->i_mapping, idx);
+
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto put_inode_out;
+ }
+ memcpy(Add2Ptr(t, done), page_address(page),
+ min(PAGE_SIZE, tail));
+ ntfs_unmap_page(page);
+
+ if (!idx && ATTR_STD != t->type) {
+ err = -EINVAL;
+ goto put_inode_out;
+ }
+ }
+
+ t += 1;
+ sbi->def_entries = 1;
+ done = sizeof(struct ATTR_DEF_ENTRY);
+ sbi->reparse.max_size = MAXIMUM_REPARSE_DATA_BUFFER_SIZE;
+ sbi->ea_max_size = 0x10000; /* default formatter value */
+
+ while (done + sizeof(struct ATTR_DEF_ENTRY) <= bytes) {
+ u32 t32 = le32_to_cpu(t->type);
+ u64 sz = le64_to_cpu(t->max_sz);
+
+ if ((t32 & 0xF) || le32_to_cpu(t[-1].type) >= t32)
+ break;
+
+ if (t->type == ATTR_REPARSE)
+ sbi->reparse.max_size = sz;
+ else if (t->type == ATTR_EA)
+ sbi->ea_max_size = sz;
+
+ done += sizeof(struct ATTR_DEF_ENTRY);
+ t += 1;
+ sbi->def_entries += 1;
+ }
+ iput(inode);
+
+ /* Load $UpCase. */
+ ref.low = cpu_to_le32(MFT_REC_UPCASE);
+ ref.seq = cpu_to_le16(MFT_REC_UPCASE);
+ inode = ntfs_iget5(sb, &ref, &NAME_UPCASE);
+ if (IS_ERR(inode)) {
+ ntfs_err(sb, "Failed to load $UpCase.");
+ err = PTR_ERR(inode);
+ goto out;
+ }
+
+ if (inode->i_size != 0x10000 * sizeof(short)) {
+ err = -EINVAL;
+ goto put_inode_out;
+ }
+
+ for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) {
+ const __le16 *src;
+ u16 *dst = Add2Ptr(sbi->upcase, idx << PAGE_SHIFT);
+ struct page *page = ntfs_map_page(inode->i_mapping, idx);
+
+ if (IS_ERR(page)) {
+ err = PTR_ERR(page);
+ goto put_inode_out;
+ }
+
+ src = page_address(page);
+
+#ifdef __BIG_ENDIAN
+ for (i = 0; i < PAGE_SIZE / sizeof(u16); i++)
+ *dst++ = le16_to_cpu(*src++);
+#else
+ memcpy(dst, src, PAGE_SIZE);
+#endif
+ ntfs_unmap_page(page);
+ }
+
+ shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short));
+ if (shared && sbi->upcase != shared) {
+ kvfree(sbi->upcase);
+ sbi->upcase = shared;
+ }
+
+ iput(inode);
+
+ if (is_ntfs3(sbi)) {
+ /* Load $Secure. */
+ err = ntfs_security_init(sbi);
+ if (err)
+ goto out;
+
+ /* Load $Extend. */
+ err = ntfs_extend_init(sbi);
+ if (err)
+ goto load_root;
+
+ /* Load $Extend\$Reparse. */
+ err = ntfs_reparse_init(sbi);
+ if (err)
+ goto load_root;
+
+ /* Load $Extend\$ObjId. */
+ err = ntfs_objid_init(sbi);
+ if (err)
+ goto load_root;
+ }
+
+load_root:
+ /* Load root. */
+ ref.low = cpu_to_le32(MFT_REC_ROOT);
+ ref.seq = cpu_to_le16(MFT_REC_ROOT);
+ inode = ntfs_iget5(sb, &ref, &NAME_ROOT);
+ if (IS_ERR(inode) || !inode->i_op) {
+ ntfs_err(sb, "Failed to load root.");
+ err = IS_ERR(inode) ? PTR_ERR(inode) : -EINVAL;
+ goto out;
+ }
+
+ sb->s_root = d_make_root(inode);
+ if (!sb->s_root) {
+ err = -ENOMEM;
+ goto put_inode_out;
+ }
+
+ return 0;
+
+put_inode_out:
+ iput(inode);
+out:
+ /*
+ * Free resources here.
+ * ntfs_fs_free will be called with fc->s_fs_info = NULL
+ */
+ put_mount_options(sbi->options);
+ put_ntfs(sbi);
+ sb->s_fs_info = NULL;
+
+ return err;
+}
+
+void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len)
+{
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct block_device *bdev = sb->s_bdev;
+ sector_t devblock = (u64)lcn * sbi->blocks_per_cluster;
+ unsigned long blocks = (u64)len * sbi->blocks_per_cluster;
+ unsigned long cnt = 0;
+ unsigned long limit = global_zone_page_state(NR_FREE_PAGES)
+ << (PAGE_SHIFT - sb->s_blocksize_bits);
+
+ if (limit >= 0x2000)
+ limit -= 0x1000;
+ else if (limit < 32)
+ limit = 32;
+ else
+ limit >>= 1;
+
+ while (blocks--) {
+ clean_bdev_aliases(bdev, devblock++, 1);
+ if (cnt++ >= limit) {
+ sync_blockdev(bdev);
+ cnt = 0;
+ }
+ }
+}
+
+/*
+ * ntfs_discard - Issue a discard request (trim for SSD).
+ */
+int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
+{
+ int err;
+ u64 lbo, bytes, start, end;
+ struct super_block *sb;
+
+ if (sbi->used.next_free_lcn == lcn + len)
+ sbi->used.next_free_lcn = lcn;
+
+ if (sbi->flags & NTFS_FLAGS_NODISCARD)
+ return -EOPNOTSUPP;
+
+ if (!sbi->options->discard)
+ return -EOPNOTSUPP;
+
+ lbo = (u64)lcn << sbi->cluster_bits;
+ bytes = (u64)len << sbi->cluster_bits;
+
+ /* Align up 'start' on discard_granularity. */
+ start = (lbo + sbi->discard_granularity - 1) &
+ sbi->discard_granularity_mask_inv;
+ /* Align down 'end' on discard_granularity. */
+ end = (lbo + bytes) & sbi->discard_granularity_mask_inv;
+
+ sb = sbi->sb;
+ if (start >= end)
+ return 0;
+
+ err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9,
+ GFP_NOFS);
+
+ if (err == -EOPNOTSUPP)
+ sbi->flags |= NTFS_FLAGS_NODISCARD;
+
+ return err;
+}
+
+static int ntfs_fs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, ntfs_fill_super);
+}
+
+/*
+ * ntfs_fs_free - Free fs_context.
+ *
+ * Note that this will be called after fill_super and reconfigure
+ * even when they pass. So they have to take pointers if they pass.
+ */
+static void ntfs_fs_free(struct fs_context *fc)
+{
+ struct ntfs_mount_options *opts = fc->fs_private;
+ struct ntfs_sb_info *sbi = fc->s_fs_info;
+
+ if (sbi)
+ put_ntfs(sbi);
+
+ if (opts)
+ put_mount_options(opts);
+}
+
+static const struct fs_context_operations ntfs_context_ops = {
+ .parse_param = ntfs_fs_parse_param,
+ .get_tree = ntfs_fs_get_tree,
+ .reconfigure = ntfs_fs_reconfigure,
+ .free = ntfs_fs_free,
+};
+
+/*
+ * ntfs_init_fs_context - Initialize spi and opts
+ *
+ * This will called when mount/remount. We will first initialize
+ * options so that if remount we can use just that.
+ */
+static int ntfs_init_fs_context(struct fs_context *fc)
+{
+ struct ntfs_mount_options *opts;
+ struct ntfs_sb_info *sbi;
+
+ opts = kzalloc(sizeof(struct ntfs_mount_options), GFP_NOFS);
+ if (!opts)
+ return -ENOMEM;
+
+ /* Default options. */
+ opts->fs_uid = current_uid();
+ opts->fs_gid = current_gid();
+ opts->fs_fmask_inv = ~current_umask();
+ opts->fs_dmask_inv = ~current_umask();
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ goto ok;
+
+ sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS);
+ if (!sbi)
+ goto free_opts;
+
+ sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL);
+ if (!sbi->upcase)
+ goto free_sbi;
+
+ ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+ mutex_init(&sbi->compress.mtx_lznt);
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+ mutex_init(&sbi->compress.mtx_xpress);
+ mutex_init(&sbi->compress.mtx_lzx);
+#endif
+
+ fc->s_fs_info = sbi;
+ok:
+ fc->fs_private = opts;
+ fc->ops = &ntfs_context_ops;
+
+ return 0;
+free_sbi:
+ kfree(sbi);
+free_opts:
+ kfree(opts);
+ return -ENOMEM;
+}
+
+// clang-format off
+static struct file_system_type ntfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ntfs3",
+ .init_fs_context = ntfs_init_fs_context,
+ .parameters = ntfs_fs_parameters,
+ .kill_sb = kill_block_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+};
+// clang-format on
+
+static int __init init_ntfs_fs(void)
+{
+ int err;
+
+ pr_info("ntfs3: Max link count %u\n", NTFS_LINK_MAX);
+
+ if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL))
+ pr_info("ntfs3: Enabled Linux POSIX ACLs support\n");
+ if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER))
+ pr_notice("ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n");
+ if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS))
+ pr_info("ntfs3: Read-only LZX/Xpress compression included\n");
+
+ err = ntfs3_init_bitmap();
+ if (err)
+ return err;
+
+ ntfs_inode_cachep = kmem_cache_create(
+ "ntfs_inode_cache", sizeof(struct ntfs_inode), 0,
+ (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+ init_once);
+ if (!ntfs_inode_cachep) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ err = register_filesystem(&ntfs_fs_type);
+ if (err)
+ goto out;
+
+ return 0;
+out:
+ kmem_cache_destroy(ntfs_inode_cachep);
+out1:
+ ntfs3_exit_bitmap();
+ return err;
+}
+
+static void __exit exit_ntfs_fs(void)
+{
+ if (ntfs_inode_cachep) {
+ rcu_barrier();
+ kmem_cache_destroy(ntfs_inode_cachep);
+ }
+
+ unregister_filesystem(&ntfs_fs_type);
+ ntfs3_exit_bitmap();
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ntfs3 read/write filesystem");
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support");
+#endif
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+MODULE_INFO(cluster, "Warning: Activated 64 bits per cluster. Windows does not support this");
+#endif
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+MODULE_INFO(compression, "Read-only lzx/xpress compression included");
+#endif
+
+MODULE_AUTHOR("Konstantin Komarov");
+MODULE_ALIAS_FS("ntfs3");
+
+module_init(init_ntfs_fs);
+module_exit(exit_ntfs_fs);
diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c
new file mode 100644
index 000000000..b5e8256fd
--- /dev/null
+++ b/fs/ntfs3/upcase.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "ntfs_fs.h"
+
+static inline u16 upcase_unicode_char(const u16 *upcase, u16 chr)
+{
+ if (chr < 'a')
+ return chr;
+
+ if (chr <= 'z')
+ return chr - ('a' - 'A');
+
+ return upcase[chr];
+}
+
+/*
+ * ntfs_cmp_names
+ *
+ * Thanks Kari Argillander <kari.argillander@gmail.com> for idea and implementation 'bothcase'
+ *
+ * Straight way to compare names:
+ * - Case insensitive
+ * - If name equals and 'bothcases' then
+ * - Case sensitive
+ * 'Straight way' code scans input names twice in worst case.
+ * Optimized code scans input names only once.
+ */
+int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2,
+ const u16 *upcase, bool bothcase)
+{
+ int diff1 = 0;
+ int diff2;
+ size_t len = min(l1, l2);
+
+ if (!bothcase && upcase)
+ goto case_insentive;
+
+ for (; len; s1++, s2++, len--) {
+ diff1 = le16_to_cpu(*s1) - le16_to_cpu(*s2);
+ if (diff1) {
+ if (bothcase && upcase)
+ goto case_insentive;
+
+ return diff1;
+ }
+ }
+ return l1 - l2;
+
+case_insentive:
+ for (; len; s1++, s2++, len--) {
+ diff2 = upcase_unicode_char(upcase, le16_to_cpu(*s1)) -
+ upcase_unicode_char(upcase, le16_to_cpu(*s2));
+ if (diff2)
+ return diff2;
+ }
+
+ diff2 = l1 - l2;
+ return diff2 ? diff2 : diff1;
+}
+
+int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2,
+ const u16 *upcase, bool bothcase)
+{
+ const u16 *s1 = uni1->name;
+ const __le16 *s2 = uni2->name;
+ size_t l1 = uni1->len;
+ size_t l2 = uni2->len;
+ size_t len = min(l1, l2);
+ int diff1 = 0;
+ int diff2;
+
+ if (!bothcase && upcase)
+ goto case_insentive;
+
+ for (; len; s1++, s2++, len--) {
+ diff1 = *s1 - le16_to_cpu(*s2);
+ if (diff1) {
+ if (bothcase && upcase)
+ goto case_insentive;
+
+ return diff1;
+ }
+ }
+ return l1 - l2;
+
+case_insentive:
+ for (; len; s1++, s2++, len--) {
+ diff2 = upcase_unicode_char(upcase, *s1) -
+ upcase_unicode_char(upcase, le16_to_cpu(*s2));
+ if (diff2)
+ return diff2;
+ }
+
+ diff2 = l1 - l2;
+ return diff2 ? diff2 : diff1;
+}
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
new file mode 100644
index 000000000..df15e00c2
--- /dev/null
+++ b/fs/ntfs3/xattr.c
@@ -0,0 +1,1051 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+// clang-format off
+#define SYSTEM_DOS_ATTRIB "system.dos_attrib"
+#define SYSTEM_NTFS_ATTRIB "system.ntfs_attrib"
+#define SYSTEM_NTFS_SECURITY "system.ntfs_security"
+// clang-format on
+
+static inline size_t unpacked_ea_size(const struct EA_FULL *ea)
+{
+ return ea->size ? le32_to_cpu(ea->size)
+ : ALIGN(struct_size(ea, name,
+ 1 + ea->name_len +
+ le16_to_cpu(ea->elength)),
+ 4);
+}
+
+static inline size_t packed_ea_size(const struct EA_FULL *ea)
+{
+ return struct_size(ea, name,
+ 1 + ea->name_len + le16_to_cpu(ea->elength)) -
+ offsetof(struct EA_FULL, flags);
+}
+
+/*
+ * find_ea
+ *
+ * Assume there is at least one xattr in the list.
+ */
+static inline bool find_ea(const struct EA_FULL *ea_all, u32 bytes,
+ const char *name, u8 name_len, u32 *off, u32 *ea_sz)
+{
+ u32 ea_size;
+
+ *off = 0;
+ if (!ea_all)
+ return false;
+
+ for (; *off < bytes; *off += ea_size) {
+ const struct EA_FULL *ea = Add2Ptr(ea_all, *off);
+ ea_size = unpacked_ea_size(ea);
+ if (ea->name_len == name_len &&
+ !memcmp(ea->name, name, name_len)) {
+ if (ea_sz)
+ *ea_sz = ea_size;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * ntfs_read_ea - Read all extended attributes.
+ * @ea: New allocated memory.
+ * @info: Pointer into resident data.
+ */
+static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea,
+ size_t add_bytes, const struct EA_INFO **info)
+{
+ int err = -EINVAL;
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ struct ATTR_LIST_ENTRY *le = NULL;
+ struct ATTRIB *attr_info, *attr_ea;
+ void *ea_p;
+ u32 size, off, ea_size;
+
+ static_assert(le32_to_cpu(ATTR_EA_INFO) < le32_to_cpu(ATTR_EA));
+
+ *ea = NULL;
+ *info = NULL;
+
+ attr_info =
+ ni_find_attr(ni, NULL, &le, ATTR_EA_INFO, NULL, 0, NULL, NULL);
+ attr_ea =
+ ni_find_attr(ni, attr_info, &le, ATTR_EA, NULL, 0, NULL, NULL);
+
+ if (!attr_ea || !attr_info)
+ return 0;
+
+ *info = resident_data_ex(attr_info, sizeof(struct EA_INFO));
+ if (!*info)
+ goto out;
+
+ /* Check Ea limit. */
+ size = le32_to_cpu((*info)->size);
+ if (size > sbi->ea_max_size) {
+ err = -EFBIG;
+ goto out;
+ }
+
+ if (attr_size(attr_ea) > sbi->ea_max_size) {
+ err = -EFBIG;
+ goto out;
+ }
+
+ if (!size) {
+ /* EA info persists, but xattr is empty. Looks like EA problem. */
+ goto out;
+ }
+
+ /* Allocate memory for packed Ea. */
+ ea_p = kmalloc(size_add(size, add_bytes), GFP_NOFS);
+ if (!ea_p)
+ return -ENOMEM;
+
+ if (attr_ea->non_res) {
+ struct runs_tree run;
+
+ run_init(&run);
+
+ err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &run, 0, size);
+ if (!err)
+ err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL);
+ run_close(&run);
+
+ if (err)
+ goto out1;
+ } else {
+ void *p = resident_data_ex(attr_ea, size);
+
+ if (!p)
+ goto out1;
+ memcpy(ea_p, p, size);
+ }
+
+ memset(Add2Ptr(ea_p, size), 0, add_bytes);
+
+ /* Check all attributes for consistency. */
+ for (off = 0; off < size; off += ea_size) {
+ const struct EA_FULL *ef = Add2Ptr(ea_p, off);
+ u32 bytes = size - off;
+
+ /* Check if we can use field ea->size. */
+ if (bytes < sizeof(ef->size))
+ goto out1;
+
+ if (ef->size) {
+ ea_size = le32_to_cpu(ef->size);
+ if (ea_size > bytes)
+ goto out1;
+ continue;
+ }
+
+ /* Check if we can use fields ef->name_len and ef->elength. */
+ if (bytes < offsetof(struct EA_FULL, name))
+ goto out1;
+
+ ea_size = ALIGN(struct_size(ef, name,
+ 1 + ef->name_len +
+ le16_to_cpu(ef->elength)),
+ 4);
+ if (ea_size > bytes)
+ goto out1;
+ }
+
+ *ea = ea_p;
+ return 0;
+
+out1:
+ kfree(ea_p);
+out:
+ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+ return err;
+}
+
+/*
+ * ntfs_list_ea
+ *
+ * Copy a list of xattrs names into the buffer
+ * provided, or compute the buffer size required.
+ *
+ * Return:
+ * * Number of bytes used / required on
+ * * -ERRNO - on failure
+ */
+static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
+ size_t bytes_per_buffer)
+{
+ const struct EA_INFO *info;
+ struct EA_FULL *ea_all = NULL;
+ const struct EA_FULL *ea;
+ u32 off, size;
+ int err;
+ int ea_size;
+ size_t ret;
+
+ err = ntfs_read_ea(ni, &ea_all, 0, &info);
+ if (err)
+ return err;
+
+ if (!info || !ea_all)
+ return 0;
+
+ size = le32_to_cpu(info->size);
+
+ /* Enumerate all xattrs. */
+ ret = 0;
+ for (off = 0; off + sizeof(struct EA_FULL) < size; off += ea_size) {
+ ea = Add2Ptr(ea_all, off);
+ ea_size = unpacked_ea_size(ea);
+
+ if (!ea->name_len)
+ break;
+
+ if (buffer) {
+ /* Check if we can use field ea->name */
+ if (off + ea_size > size)
+ break;
+
+ if (ret + ea->name_len + 1 > bytes_per_buffer) {
+ err = -ERANGE;
+ goto out;
+ }
+
+ memcpy(buffer + ret, ea->name, ea->name_len);
+ buffer[ret + ea->name_len] = 0;
+ }
+
+ ret += ea->name_len + 1;
+ }
+
+out:
+ kfree(ea_all);
+ return err ? err : ret;
+}
+
+static int ntfs_get_ea(struct inode *inode, const char *name, size_t name_len,
+ void *buffer, size_t size, size_t *required)
+{
+ struct ntfs_inode *ni = ntfs_i(inode);
+ const struct EA_INFO *info;
+ struct EA_FULL *ea_all = NULL;
+ const struct EA_FULL *ea;
+ u32 off, len;
+ int err;
+
+ if (!(ni->ni_flags & NI_FLAG_EA))
+ return -ENODATA;
+
+ if (!required)
+ ni_lock(ni);
+
+ len = 0;
+
+ if (name_len > 255) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+
+ err = ntfs_read_ea(ni, &ea_all, 0, &info);
+ if (err)
+ goto out;
+
+ if (!info)
+ goto out;
+
+ /* Enumerate all xattrs. */
+ if (!find_ea(ea_all, le32_to_cpu(info->size), name, name_len, &off,
+ NULL)) {
+ err = -ENODATA;
+ goto out;
+ }
+ ea = Add2Ptr(ea_all, off);
+
+ len = le16_to_cpu(ea->elength);
+ if (!buffer) {
+ err = 0;
+ goto out;
+ }
+
+ if (len > size) {
+ err = -ERANGE;
+ if (required)
+ *required = len;
+ goto out;
+ }
+
+ memcpy(buffer, ea->name + ea->name_len + 1, len);
+ err = 0;
+
+out:
+ kfree(ea_all);
+ if (!required)
+ ni_unlock(ni);
+
+ return err ? err : len;
+}
+
+static noinline int ntfs_set_ea(struct inode *inode, const char *name,
+ size_t name_len, const void *value,
+ size_t val_size, int flags, bool locked)
+{
+ struct ntfs_inode *ni = ntfs_i(inode);
+ struct ntfs_sb_info *sbi = ni->mi.sbi;
+ int err;
+ struct EA_INFO ea_info;
+ const struct EA_INFO *info;
+ struct EA_FULL *new_ea;
+ struct EA_FULL *ea_all = NULL;
+ size_t add, new_pack;
+ u32 off, size, ea_sz;
+ __le16 size_pack;
+ struct ATTRIB *attr;
+ struct ATTR_LIST_ENTRY *le;
+ struct mft_inode *mi;
+ struct runs_tree ea_run;
+ u64 new_sz;
+ void *p;
+
+ if (!locked)
+ ni_lock(ni);
+
+ run_init(&ea_run);
+
+ if (name_len > 255) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+
+ add = ALIGN(struct_size(ea_all, name, 1 + name_len + val_size), 4);
+
+ err = ntfs_read_ea(ni, &ea_all, add, &info);
+ if (err)
+ goto out;
+
+ if (!info) {
+ memset(&ea_info, 0, sizeof(ea_info));
+ size = 0;
+ size_pack = 0;
+ } else {
+ memcpy(&ea_info, info, sizeof(ea_info));
+ size = le32_to_cpu(ea_info.size);
+ size_pack = ea_info.size_pack;
+ }
+
+ if (info && find_ea(ea_all, size, name, name_len, &off, &ea_sz)) {
+ struct EA_FULL *ea;
+
+ if (flags & XATTR_CREATE) {
+ err = -EEXIST;
+ goto out;
+ }
+
+ ea = Add2Ptr(ea_all, off);
+
+ /*
+ * Check simple case when we try to insert xattr with the same value
+ * e.g. ntfs_save_wsl_perm
+ */
+ if (val_size && le16_to_cpu(ea->elength) == val_size &&
+ !memcmp(ea->name + ea->name_len + 1, value, val_size)) {
+ /* xattr already contains the required value. */
+ goto out;
+ }
+
+ /* Remove current xattr. */
+ if (ea->flags & FILE_NEED_EA)
+ le16_add_cpu(&ea_info.count, -1);
+
+ le16_add_cpu(&ea_info.size_pack, 0 - packed_ea_size(ea));
+
+ memmove(ea, Add2Ptr(ea, ea_sz), size - off - ea_sz);
+
+ size -= ea_sz;
+ memset(Add2Ptr(ea_all, size), 0, ea_sz);
+
+ ea_info.size = cpu_to_le32(size);
+
+ if ((flags & XATTR_REPLACE) && !val_size) {
+ /* Remove xattr. */
+ goto update_ea;
+ }
+ } else {
+ if (flags & XATTR_REPLACE) {
+ err = -ENODATA;
+ goto out;
+ }
+
+ if (!ea_all) {
+ ea_all = kzalloc(add, GFP_NOFS);
+ if (!ea_all) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ }
+
+ /* Append new xattr. */
+ new_ea = Add2Ptr(ea_all, size);
+ new_ea->size = cpu_to_le32(add);
+ new_ea->flags = 0;
+ new_ea->name_len = name_len;
+ new_ea->elength = cpu_to_le16(val_size);
+ memcpy(new_ea->name, name, name_len);
+ new_ea->name[name_len] = 0;
+ memcpy(new_ea->name + name_len + 1, value, val_size);
+ new_pack = le16_to_cpu(ea_info.size_pack) + packed_ea_size(new_ea);
+ ea_info.size_pack = cpu_to_le16(new_pack);
+ /* New size of ATTR_EA. */
+ size += add;
+ ea_info.size = cpu_to_le32(size);
+
+ /*
+ * 1. Check ea_info.size_pack for overflow.
+ * 2. New attibute size must fit value from $AttrDef
+ */
+ if (new_pack > 0xffff || size > sbi->ea_max_size) {
+ ntfs_inode_warn(
+ inode,
+ "The size of extended attributes must not exceed 64KiB");
+ err = -EFBIG; // -EINVAL?
+ goto out;
+ }
+
+update_ea:
+
+ if (!info) {
+ /* Create xattr. */
+ if (!size) {
+ err = 0;
+ goto out;
+ }
+
+ err = ni_insert_resident(ni, sizeof(struct EA_INFO),
+ ATTR_EA_INFO, NULL, 0, NULL, NULL,
+ NULL);
+ if (err)
+ goto out;
+
+ err = ni_insert_resident(ni, 0, ATTR_EA, NULL, 0, NULL, NULL,
+ NULL);
+ if (err)
+ goto out;
+ }
+
+ new_sz = size;
+ err = attr_set_size(ni, ATTR_EA, NULL, 0, &ea_run, new_sz, &new_sz,
+ false, NULL);
+ if (err)
+ goto out;
+
+ le = NULL;
+ attr = ni_find_attr(ni, NULL, &le, ATTR_EA_INFO, NULL, 0, NULL, &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!size) {
+ /* Delete xattr, ATTR_EA_INFO */
+ ni_remove_attr_le(ni, attr, mi, le);
+ } else {
+ p = resident_data_ex(attr, sizeof(struct EA_INFO));
+ if (!p) {
+ err = -EINVAL;
+ goto out;
+ }
+ memcpy(p, &ea_info, sizeof(struct EA_INFO));
+ mi->dirty = true;
+ }
+
+ le = NULL;
+ attr = ni_find_attr(ni, NULL, &le, ATTR_EA, NULL, 0, NULL, &mi);
+ if (!attr) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!size) {
+ /* Delete xattr, ATTR_EA */
+ ni_remove_attr_le(ni, attr, mi, le);
+ } else if (attr->non_res) {
+ err = attr_load_runs_range(ni, ATTR_EA, NULL, 0, &ea_run, 0,
+ size);
+ if (err)
+ goto out;
+
+ err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0);
+ if (err)
+ goto out;
+ } else {
+ p = resident_data_ex(attr, size);
+ if (!p) {
+ err = -EINVAL;
+ goto out;
+ }
+ memcpy(p, ea_all, size);
+ mi->dirty = true;
+ }
+
+ /* Check if we delete the last xattr. */
+ if (size)
+ ni->ni_flags |= NI_FLAG_EA;
+ else
+ ni->ni_flags &= ~NI_FLAG_EA;
+
+ if (ea_info.size_pack != size_pack)
+ ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+ mark_inode_dirty(&ni->vfs_inode);
+
+out:
+ if (!locked)
+ ni_unlock(ni);
+
+ run_close(&ea_run);
+ kfree(ea_all);
+
+ return err;
+}
+
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type,
+ int locked)
+{
+ struct ntfs_inode *ni = ntfs_i(inode);
+ const char *name;
+ size_t name_len;
+ struct posix_acl *acl;
+ size_t req;
+ int err;
+ void *buf;
+
+ /* Allocate PATH_MAX bytes. */
+ buf = __getname();
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
+ /* Possible values of 'type' was already checked above. */
+ if (type == ACL_TYPE_ACCESS) {
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1;
+ } else {
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1;
+ }
+
+ if (!locked)
+ ni_lock(ni);
+
+ err = ntfs_get_ea(inode, name, name_len, buf, PATH_MAX, &req);
+
+ if (!locked)
+ ni_unlock(ni);
+
+ /* Translate extended attribute to acl. */
+ if (err >= 0) {
+ acl = posix_acl_from_xattr(&init_user_ns, buf, err);
+ } else if (err == -ENODATA) {
+ acl = NULL;
+ } else {
+ acl = ERR_PTR(err);
+ }
+
+ if (!IS_ERR(acl))
+ set_cached_acl(inode, type, acl);
+
+ __putname(buf);
+
+ return acl;
+}
+
+/*
+ * ntfs_get_acl - inode_operations::get_acl
+ */
+struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu)
+{
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
+ return ntfs_get_acl_ex(inode, type, 0);
+}
+
+static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns,
+ struct inode *inode, struct posix_acl *acl,
+ int type, bool init_acl)
+{
+ const char *name;
+ size_t size, name_len;
+ void *value;
+ int err;
+ int flags;
+ umode_t mode;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ mode = inode->i_mode;
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ /* Do not change i_mode if we are in init_acl */
+ if (acl && !init_acl) {
+ err = posix_acl_update_mode(mnt_userns, inode, &mode,
+ &acl);
+ if (err)
+ return err;
+ }
+ name = XATTR_NAME_POSIX_ACL_ACCESS;
+ name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1;
+ break;
+
+ case ACL_TYPE_DEFAULT:
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ name = XATTR_NAME_POSIX_ACL_DEFAULT;
+ name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (!acl) {
+ /* Remove xattr if it can be presented via mode. */
+ size = 0;
+ value = NULL;
+ flags = XATTR_REPLACE;
+ } else {
+ size = posix_acl_xattr_size(acl->a_count);
+ value = kmalloc(size, GFP_NOFS);
+ if (!value)
+ return -ENOMEM;
+ err = posix_acl_to_xattr(&init_user_ns, acl, value, size);
+ if (err < 0)
+ goto out;
+ flags = 0;
+ }
+
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
+ if (err == -ENODATA && !size)
+ err = 0; /* Removing non existed xattr. */
+ if (!err) {
+ set_cached_acl(inode, type, acl);
+ if (inode->i_mode != mode) {
+ inode->i_mode = mode;
+ mark_inode_dirty(inode);
+ }
+ }
+
+out:
+ kfree(value);
+
+ return err;
+}
+
+/*
+ * ntfs_set_acl - inode_operations::set_acl
+ */
+int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct posix_acl *acl, int type)
+{
+ return ntfs_set_acl_ex(mnt_userns, inode, acl, type, false);
+}
+
+/*
+ * ntfs_init_acl - Initialize the ACLs of a new inode.
+ *
+ * Called from ntfs_create_inode().
+ */
+int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
+ struct inode *dir)
+{
+ struct posix_acl *default_acl, *acl;
+ int err;
+
+ err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
+ if (err)
+ return err;
+
+ if (default_acl) {
+ err = ntfs_set_acl_ex(mnt_userns, inode, default_acl,
+ ACL_TYPE_DEFAULT, true);
+ posix_acl_release(default_acl);
+ } else {
+ inode->i_default_acl = NULL;
+ }
+
+ if (acl) {
+ if (!err)
+ err = ntfs_set_acl_ex(mnt_userns, inode, acl,
+ ACL_TYPE_ACCESS, true);
+ posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
+ }
+
+ return err;
+}
+#endif
+
+/*
+ * ntfs_acl_chmod - Helper for ntfs3_setattr().
+ */
+int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!(sb->s_flags & SB_POSIXACL))
+ return 0;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ return posix_acl_chmod(mnt_userns, inode, inode->i_mode);
+}
+
+/*
+ * ntfs_permission - inode_operations::permission
+ */
+int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+ int mask)
+{
+ if (ntfs_sb(inode->i_sb)->options->noacsrules) {
+ /* "No access rules" mode - Allow all changes. */
+ return 0;
+ }
+
+ return generic_permission(mnt_userns, inode, mask);
+}
+
+/*
+ * ntfs_listxattr - inode_operations::listxattr
+ */
+ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+ struct inode *inode = d_inode(dentry);
+ struct ntfs_inode *ni = ntfs_i(inode);
+ ssize_t ret;
+
+ if (!(ni->ni_flags & NI_FLAG_EA)) {
+ /* no xattr in file */
+ return 0;
+ }
+
+ ni_lock(ni);
+
+ ret = ntfs_list_ea(ni, buffer, size);
+
+ ni_unlock(ni);
+
+ return ret;
+}
+
+static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
+ struct inode *inode, const char *name, void *buffer,
+ size_t size)
+{
+ int err;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ size_t name_len = strlen(name);
+
+ /* Dispatch request. */
+ if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 &&
+ !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) {
+ /* system.dos_attrib */
+ if (!buffer) {
+ err = sizeof(u8);
+ } else if (size < sizeof(u8)) {
+ err = -ENODATA;
+ } else {
+ err = sizeof(u8);
+ *(u8 *)buffer = le32_to_cpu(ni->std_fa);
+ }
+ goto out;
+ }
+
+ if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 &&
+ !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) {
+ /* system.ntfs_attrib */
+ if (!buffer) {
+ err = sizeof(u32);
+ } else if (size < sizeof(u32)) {
+ err = -ENODATA;
+ } else {
+ err = sizeof(u32);
+ *(u32 *)buffer = le32_to_cpu(ni->std_fa);
+ }
+ goto out;
+ }
+
+ if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 &&
+ !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) {
+ /* system.ntfs_security*/
+ struct SECURITY_DESCRIPTOR_RELATIVE *sd = NULL;
+ size_t sd_size = 0;
+
+ if (!is_ntfs3(ni->mi.sbi)) {
+ /* We should get nt4 security. */
+ err = -EINVAL;
+ goto out;
+ } else if (le32_to_cpu(ni->std_security_id) <
+ SECURITY_ID_FIRST) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ err = ntfs_get_security_by_id(ni->mi.sbi, ni->std_security_id,
+ &sd, &sd_size);
+ if (err)
+ goto out;
+
+ if (!is_sd_valid(sd, sd_size)) {
+ ntfs_inode_warn(
+ inode,
+ "looks like you get incorrect security descriptor id=%u",
+ ni->std_security_id);
+ }
+
+ if (!buffer) {
+ err = sd_size;
+ } else if (size < sd_size) {
+ err = -ENODATA;
+ } else {
+ err = sd_size;
+ memcpy(buffer, sd, sd_size);
+ }
+ kfree(sd);
+ goto out;
+ }
+
+ /* Deal with NTFS extended attribute. */
+ err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL);
+
+out:
+ return err;
+}
+
+/*
+ * ntfs_setxattr - inode_operations::setxattr
+ */
+static noinline int ntfs_setxattr(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
+ struct dentry *de, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ int err = -EINVAL;
+ struct ntfs_inode *ni = ntfs_i(inode);
+ size_t name_len = strlen(name);
+ enum FILE_ATTRIBUTE new_fa;
+
+ /* Dispatch request. */
+ if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 &&
+ !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) {
+ if (sizeof(u8) != size)
+ goto out;
+ new_fa = cpu_to_le32(*(u8 *)value);
+ goto set_new_fa;
+ }
+
+ if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 &&
+ !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) {
+ if (size != sizeof(u32))
+ goto out;
+ new_fa = cpu_to_le32(*(u32 *)value);
+
+ if (S_ISREG(inode->i_mode)) {
+ /* Process compressed/sparsed in special way. */
+ ni_lock(ni);
+ err = ni_new_attr_flags(ni, new_fa);
+ ni_unlock(ni);
+ if (err)
+ goto out;
+ }
+set_new_fa:
+ /*
+ * Thanks Mark Harmstone:
+ * Keep directory bit consistency.
+ */
+ if (S_ISDIR(inode->i_mode))
+ new_fa |= FILE_ATTRIBUTE_DIRECTORY;
+ else
+ new_fa &= ~FILE_ATTRIBUTE_DIRECTORY;
+
+ if (ni->std_fa != new_fa) {
+ ni->std_fa = new_fa;
+ if (new_fa & FILE_ATTRIBUTE_READONLY)
+ inode->i_mode &= ~0222;
+ else
+ inode->i_mode |= 0222;
+ /* Std attribute always in primary record. */
+ ni->mi.dirty = true;
+ mark_inode_dirty(inode);
+ }
+ err = 0;
+
+ goto out;
+ }
+
+ if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 &&
+ !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) {
+ /* system.ntfs_security*/
+ __le32 security_id;
+ bool inserted;
+ struct ATTR_STD_INFO5 *std;
+
+ if (!is_ntfs3(ni->mi.sbi)) {
+ /*
+ * We should replace ATTR_SECURE.
+ * Skip this way cause it is nt4 feature.
+ */
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!is_sd_valid(value, size)) {
+ err = -EINVAL;
+ ntfs_inode_warn(
+ inode,
+ "you try to set invalid security descriptor");
+ goto out;
+ }
+
+ err = ntfs_insert_security(ni->mi.sbi, value, size,
+ &security_id, &inserted);
+ if (err)
+ goto out;
+
+ ni_lock(ni);
+ std = ni_std5(ni);
+ if (!std) {
+ err = -EINVAL;
+ } else if (std->security_id != security_id) {
+ std->security_id = ni->std_security_id = security_id;
+ /* Std attribute always in primary record. */
+ ni->mi.dirty = true;
+ mark_inode_dirty(&ni->vfs_inode);
+ }
+ ni_unlock(ni);
+ goto out;
+ }
+
+ /* Deal with NTFS extended attribute. */
+ err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0);
+
+out:
+ inode->i_ctime = current_time(inode);
+ mark_inode_dirty(inode);
+
+ return err;
+}
+
+/*
+ * ntfs_save_wsl_perm
+ *
+ * save uid/gid/mode in xattr
+ */
+int ntfs_save_wsl_perm(struct inode *inode)
+{
+ int err;
+ __le32 value;
+ struct ntfs_inode *ni = ntfs_i(inode);
+
+ ni_lock(ni);
+ value = cpu_to_le32(i_uid_read(inode));
+ err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value,
+ sizeof(value), 0, true); /* true == already locked. */
+ if (err)
+ goto out;
+
+ value = cpu_to_le32(i_gid_read(inode));
+ err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value,
+ sizeof(value), 0, true);
+ if (err)
+ goto out;
+
+ value = cpu_to_le32(inode->i_mode);
+ err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value,
+ sizeof(value), 0, true);
+ if (err)
+ goto out;
+
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+ value = cpu_to_le32(inode->i_rdev);
+ err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value,
+ sizeof(value), 0, true);
+ if (err)
+ goto out;
+ }
+
+out:
+ ni_unlock(ni);
+ /* In case of error should we delete all WSL xattr? */
+ return err;
+}
+
+/*
+ * ntfs_get_wsl_perm
+ *
+ * get uid/gid/mode from xattr
+ * it is called from ntfs_iget5->ntfs_read_mft
+ */
+void ntfs_get_wsl_perm(struct inode *inode)
+{
+ size_t sz;
+ __le32 value[3];
+
+ if (ntfs_get_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value[0],
+ sizeof(value[0]), &sz) == sizeof(value[0]) &&
+ ntfs_get_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value[1],
+ sizeof(value[1]), &sz) == sizeof(value[1]) &&
+ ntfs_get_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value[2],
+ sizeof(value[2]), &sz) == sizeof(value[2])) {
+ i_uid_write(inode, (uid_t)le32_to_cpu(value[0]));
+ i_gid_write(inode, (gid_t)le32_to_cpu(value[1]));
+ inode->i_mode = le32_to_cpu(value[2]);
+
+ if (ntfs_get_ea(inode, "$LXDEV", sizeof("$$LXDEV") - 1,
+ &value[0], sizeof(value),
+ &sz) == sizeof(value[0])) {
+ inode->i_rdev = le32_to_cpu(value[0]);
+ }
+ }
+}
+
+static bool ntfs_xattr_user_list(struct dentry *dentry)
+{
+ return true;
+}
+
+// clang-format off
+static const struct xattr_handler ntfs_other_xattr_handler = {
+ .prefix = "",
+ .get = ntfs_getxattr,
+ .set = ntfs_setxattr,
+ .list = ntfs_xattr_user_list,
+};
+
+const struct xattr_handler *ntfs_xattr_handlers[] = {
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+ &posix_acl_access_xattr_handler,
+ &posix_acl_default_xattr_handler,
+#endif
+ &ntfs_other_xattr_handler,
+ NULL,
+};
+// clang-format on